@@ -13,10 +13,22 @@
 from definitions import TIMESTAMP_FMT
 
 MATCH_THRESHOLD = 0.85
-FN_WEIGHT = 0.2
-PHONE_WEIGHT = 0.15
-ADDR_WEIGHT = 0.35
-ZIP_WEIGHT = 0.3
+FN_WEIGHT = 0.25
+PHONE_WEIGHT = 0.2
+ADDR_WEIGHT = 0.55
+# ZIP_WEIGHT = 0.25
+# zip is not used in weighting since all candidate pairs match on zip
+
+# a separate address threshold so that pairs with medium-low scores across all fields
+# don't wind up getting matched anyway
+ADDR_THRESHOLD = 0.95
+# using address_distance() below:
+# "205 GARDEN ST" v "206 GARDEN ST" --> 0.8333
+# "205 GARDEN ST" v "205 GAREDN ST" --> 0.98444
+# "205 GARDEN STREET" v "205 GAREDN ST" --> 0.9666
+# "205 GARDEN ST APT 5F" v "205 GARDEN ST APT 5J" --> 0.9472
+# so 0.95 should give us a good balance of not linking all apartments together
+# while still allowing some room for typos and variation
 
 
 def addr_parse(addr):
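A quick sanity check on the new weights: with zip dropped, FN_WEIGHT + PHONE_WEIGHT + ADDR_WEIGHT still sums to 1.0, and a perfect address plus a perfect family name alone is not enough to clear MATCH_THRESHOLD. A minimal sketch (the field scores here are invented for illustration):

```python
# each field similarity score is in [0, 1]; values are hypothetical
addr, fn, phone = 1.0, 1.0, 0.0
score = addr * 0.55 + fn * 0.25 + phone * 0.2
print(score, score > 0.85)  # 0.8 False: some phone similarity is also needed
```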
@@ -53,6 +65,23 @@ def address_distance(addr1, addr2):
     score = 0
     secondary_score = 0
 
+    a1 = addr1["household_street_address"]
+    a2 = addr2["household_street_address"]
+
+    if not a1 or not a2:
+        # if either is blank they get a score of 0
+        # this matches textdistance.jaro_winkler("", x)
+        # but textdistance.jaro_winkler("", "") is normally 1
+        # without this, 2 missing addresses could be a "perfect match"
+        # which is not what we want
+        return 0
+
+    if a1 == a2:
+        # if the strings are exactly identical,
+        # don't waste time with detailed comparisons
+        # this matches textdistance.jaro_winkler(x, x)
+        return 1
+
     # Change weights based on existence of second level address
     if (
         not addr1["prefix"]
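For reference, the edge cases this guard works around can be reproduced directly with the textdistance package (a minimal sketch, not part of the diff):

```python
import textdistance

# two blank strings are "identical", which would otherwise score as a perfect match
print(textdistance.jaro_winkler("", ""))               # 1.0
# one blank string scores 0 against anything non-blank
print(textdistance.jaro_winkler("", "205 GARDEN ST"))  # 0.0
```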
@@ -213,15 +242,10 @@ def address_distance(addr1, addr2):
 
     # See if simple string compare of all things combined
     # with a 0.6 adjustment is better
-    a1 = addr1["household_street_address"]
-    a2 = addr2["household_street_address"]
-    if a1 and a2:
-        score = max(
-            score,
-            textdistance.jaro_winkler(a1, a2)
-            * (weight_number + weight_street_name)
-            * 0.6,
-        ) + (secondary_score * weight_secondary)
+    score = max(
+        score,
+        textdistance.jaro_winkler(a1, a2) * (weight_number + weight_street_name) * 0.6,
+    ) + (secondary_score * weight_secondary)
 
     return score
 
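The simplified fallback reads more clearly now that a1 and a2 are guaranteed non-blank by the early returns added above. Roughly, the 0.6 adjustment caps the whole-string comparison at 60% of the number-plus-street weight, so it only wins when the component-wise score was low; a sketch with invented weights (the real weight_number and weight_street_name are set earlier in address_distance):

```python
import textdistance

weight_number, weight_street_name = 0.25, 0.5  # hypothetical values
fallback = (
    textdistance.jaro_winkler("205 GARDEN ST", "205 GAREDN ST")
    * (weight_number + weight_street_name)
    * 0.6
)
print(fallback)  # at most 0.45 here, no matter how similar the two strings are
```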
@@ -271,7 +295,9 @@ def explode_address(row):
     return parsed
 
 
-def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None):
+def get_household_matches(
+    pii_lines, split_factor=4, debug=False, exact_addresses=False, pairsfile=None
+):
     if pairsfile:
         if debug:
             print(f"[{datetime.now()}] Loading matching pairs file")
@@ -283,28 +309,42 @@ def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None
             print(f"[{datetime.now()}] Done loading matching pairs")
 
     else:
-        # break out the address into number, street, suffix, etc,
-        # so we can prefilter matches based on those
-        addr_cols = pii_lines.apply(
-            explode_address,
-            axis="columns",
-            result_type="expand",
-        )
-        pii_lines_exploded = pd.concat([pii_lines, addr_cols], axis="columns")
+
+        if exact_addresses:
+            pii_lines_exploded = pii_lines
+        else:
+            # break out the address into number, street, suffix, etc,
+            # so we can prefilter matches based on those
+            addr_cols = pii_lines.apply(
+                explode_address,
+                axis="columns",
+                result_type="expand",
+            )
+            pii_lines_exploded = pd.concat([pii_lines, addr_cols], axis="columns")
 
         if debug:
             print(f"[{datetime.now()}] Done pre-processing PII file")
 
-        candidate_links = get_candidate_links(pii_lines_exploded, split_factor, debug)
-        gc.collect()
-
-        matching_pairs = get_matching_pairs(
-            pii_lines_exploded, candidate_links, split_factor, debug
+        candidate_links = get_candidate_links(
+            pii_lines_exploded, split_factor, exact_addresses, debug
         )
-        del candidate_links
-        del pii_lines_exploded
         gc.collect()
 
+        if exact_addresses:
+            # the candidate links are already all the pairs with matching [address, zip]
+            matching_pairs = candidate_links
+        else:
+            matching_pairs = get_matching_pairs(
+                pii_lines_exploded,
+                candidate_links,
+                split_factor,
+                exact_addresses,
+                debug,
+            )
+            del pii_lines_exploded
+            del candidate_links
+            gc.collect()
+
     if debug:
         timestamp = datetime.now().strftime(TIMESTAMP_FMT)
         pairs_path = Path("temp-data") / f"households_pairs-{timestamp}.csv"
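The exact_addresses short-circuit skips the explode step entirely. For anyone unfamiliar with the expand idiom in the else branch, this is roughly how the extra columns get built (explode_address_stub is a hypothetical stand-in; the real explode_address is defined earlier in this file):

```python
import pandas as pd

pii_lines = pd.DataFrame({"household_street_address": ["205 GARDEN ST"]})

def explode_address_stub(row):
    # hypothetical stand-in: the real parser handles far messier input
    number, street, suffix = row["household_street_address"].split(" ", 2)
    return {"number": number, "street": street, "suffix": suffix}

addr_cols = pii_lines.apply(explode_address_stub, axis="columns", result_type="expand")
pii_lines_exploded = pd.concat([pii_lines, addr_cols], axis="columns")
print(pii_lines_exploded.columns.tolist())
# ['household_street_address', 'number', 'street', 'suffix']
```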
@@ -347,21 +387,25 @@ def get_household_matches(pii_lines, split_factor=4, debug=False, pairsfile=None
     return pos_to_pairs
 
 
-def get_candidate_links(pii_lines, split_factor=4, debug=False):
+def get_candidate_links(pii_lines, split_factor=4, exact_addresses=False, debug=False):
     # indexing step defines the pairs of records for comparison
     # indexer.full() does a full n^2 comparison, but we can do better
     indexer = recordlinkage.Index()
-    # use two block indexes to reduce the number of candidates
+    # use block indexes to reduce the number of candidates
     # while still retaining enough candidates to identify real households.
     # a block only on zip could work, but seems to run into memory issues
     # note sortedneighborhood on zip probably doesn't make sense
     # (zip codes in a geographic area will be too similar)
     # but if data is dirty then blocks may discard typos
 
-    indexer.block(["household_zip", "street", "number"])
-    indexer.block(["household_zip", "family_name"])
+    if exact_addresses:
+        indexer.block(["household_zip", "household_street_address"])
+    else:
+        indexer.block(["household_zip", "street", "number"])
+        indexer.block(["household_zip", "family_name"])
 
-    candidate_links = None
+    # start with an empty index we can append to
+    candidate_links = pd.MultiIndex.from_tuples([], names=[0, 1])
 
     # break up the dataframe into subframes,
     # and iterate over every pair of subframes.
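To make the exact_addresses branch concrete: a block index only pairs rows that agree exactly on every listed column. A toy run (data invented):

```python
import pandas as pd
import recordlinkage

df = pd.DataFrame({
    "household_zip": ["02139", "02139", "02139"],
    "household_street_address": ["205 GARDEN ST", "205 GARDEN ST", "1 MAIN ST"],
})

indexer = recordlinkage.Index()
indexer.block(["household_zip", "household_street_address"])
pairs = indexer.index(df)
print(len(pairs))   # 1: only the two exact duplicates form a candidate pair
print(list(pairs))  # a single (row, row) pair linking rows 0 and 1
```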
@@ -404,20 +448,26 @@ def get_candidate_links(pii_lines, split_factor=4, debug=False):
             pairs_subset = pairs_subset[pairs_subset[0] < pairs_subset[1]]
             pairs_subset = pd.MultiIndex.from_frame(pairs_subset)
 
-            if candidate_links is None:
-                candidate_links = pairs_subset
-            else:
-                candidate_links = candidate_links.append(pairs_subset)
+            candidate_links = candidate_links.append(pairs_subset)
 
             gc.collect()
 
+    # rows with blank address match ("" == "") so drop those here
+    # TODO: ideally we wouldn't compare blank address lines in the first place
+    # but the indexing and splitting bits get complicated if we drop them earlier
+    blank_addresses = pii_lines[pii_lines["household_street_address"] == ""].index
+    candidate_links = candidate_links.drop(blank_addresses, level=0, errors="ignore")
+    candidate_links = candidate_links.drop(blank_addresses, level=1, errors="ignore")
+
     if debug:
         print(f"[{datetime.now()}] Found {len(candidate_links)} candidate pairs")
 
     return candidate_links
 
 
-def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
+def get_matching_pairs(
+    pii_lines, candidate_links, split_factor, exact_addresses, debug
+):
     # Comparison step performs the defined comparison algorithms
     # against the candidate pairs
     compare_cl = recordlinkage.Compare()
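Dropping labels from one level of a MultiIndex with errors="ignore" is a bit obscure; here is the behavior on a toy index (values invented):

```python
import pandas as pd

links = pd.MultiIndex.from_tuples([(0, 1), (0, 2), (1, 2)], names=[0, 1])
blanks = pd.Index([0])

# removes every pair whose level-0 entry is a blank-address row;
# errors="ignore" keeps this from raising when a label is absent
links = links.drop(blanks, level=0, errors="ignore")
print(list(links))  # [(1, 2)]
```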
@@ -428,24 +478,35 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
     compare_cl.string(
         "phone_number", "phone_number", method="jarowinkler", label="phone_number"
     )
-    compare_cl.add(
-        AddressComparison(
-            "exploded_address",
-            "exploded_address",
+    if exact_addresses:
+        compare_cl.string(
+            "household_street_address",
+            "household_street_address",
+            method="jarowinkler",
             label="household_street_address",
         )
-    )
-    compare_cl.string(
-        "household_zip", "household_zip", method="levenshtein", label="household_zip"
-    )
+    else:
+        compare_cl.add(
+            AddressComparison(
+                "exploded_address",
+                "exploded_address",
+                label="household_street_address",
+            )
+        )
+
+    # NOTE: zip code is DISABLED because our indexes block on zip code
+    # compare_cl.string(
+    #     "household_zip", "household_zip", method="levenshtein", label="household_zip"
+    # )
     # note: hamming distance is not implemented in this library,
     # but levenshtein is. the two metrics are likely similar enough
     # that it's not worth implementing hamming again
 
     if debug:
         print(f"[{datetime.now()}] Starting detailed comparison of indexed pairs")
 
-    matching_pairs = None
+    # start with an empty index we can append to
+    matching_pairs = pd.MultiIndex.from_tuples([], names=[0, 1])
     # we know that we could support len(subset_A) in memory above,
     # so use the same amount here
     len_subset_A = int(len(pii_lines) / split_factor)
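For reviewers unfamiliar with the library: compute() produces one similarity column per label, indexed by the candidate pairs, which is what the weighting below operates on. A minimal sketch with a toy frame:

```python
import pandas as pd
import recordlinkage

df = pd.DataFrame({"phone_number": ["555-1234", "555-1235"]})
pairs = pd.MultiIndex.from_tuples([(0, 1)])

compare = recordlinkage.Compare()
compare.string("phone_number", "phone_number", method="jarowinkler", label="phone_number")
print(compare.compute(pairs, df))  # one row for pair (0, 1), score near 1.0
```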
@@ -470,18 +531,18 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
 
         features = compare_cl.compute(subset_links, relevant_pii_lines)
 
+        # first filter by address similarity
+        features = features[features["household_street_address"] > ADDR_THRESHOLD]
+
         features["family_name"] *= FN_WEIGHT
         features["phone_number"] *= PHONE_WEIGHT
         features["household_street_address"] *= ADDR_WEIGHT
-        features["household_zip"] *= ZIP_WEIGHT
+        # features["household_zip"] *= ZIP_WEIGHT
 
         # filter the matches down based on the cumulative score
         matches = features[features.sum(axis=1) > MATCH_THRESHOLD]
 
-        if matching_pairs is None:
-            matching_pairs = matches.index
-        else:
-            matching_pairs = matching_pairs.append(matches.index)
+        matching_pairs = matching_pairs.append(matches.index)
 
         # matching pairs are bi-directional and not duplicated,
         # ex if (1,9) is in the list then (9,1) won't be
 
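A worked example of the two-stage filter (similarity values invented): a pair whose address similarity is 0.94 is now discarded up front by ADDR_THRESHOLD, even though its weighted sum would have squeaked past MATCH_THRESHOLD:

```python
import pandas as pd

features = pd.DataFrame({
    "family_name": [1.0, 0.9],
    "phone_number": [0.9, 1.0],
    "household_street_address": [0.97, 0.94],
})

# stage 1: address similarity must clear ADDR_THRESHOLD (0.95); drops row 1
features = features[features["household_street_address"] > 0.95]

# stage 2: weighted sum must clear MATCH_THRESHOLD (0.85)
features["family_name"] *= 0.25
features["phone_number"] *= 0.2
features["household_street_address"] *= 0.55
print(features.sum(axis=1) > 0.85)  # row 0: 0.25 + 0.18 + 0.5335 = 0.9635 -> True
```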
@@ -492,9 +553,6 @@ def get_matching_pairs(pii_lines, candidate_links, split_factor, debug):
         del matches
         gc.collect()
 
-    # drop exploded address because it's not used past this point
-    pii_lines.drop(columns=["exploded_address"], inplace=True)
-
     if debug:
         print(f"[{datetime.now()}] Found {len(matching_pairs)} matching pairs")
 