@@ -360,7 +360,7 @@ def get_household_matches(
360
360
pairs_writer .writerow (matching_pairs [i ])
361
361
print (f"[{ datetime .now ()} ] Wrote matching pairs to { pairs_path } " )
362
362
363
- five_percent = int (len (matching_pairs ) / 20 )
363
+ five_percent = max ( int (len (matching_pairs ) / 20 ), 1 )
364
364
pos_to_pairs = {}
365
365
# note: "for pair in matching_pairs:" had unexpectedly poor performance here
366
366
for i in range (len (matching_pairs )):
@@ -407,30 +407,42 @@ def get_candidate_links(pii_lines, split_factor=4, exact_addresses=False, debug=
407
407
# start with an empty index we can append to
408
408
candidate_links = pd .MultiIndex .from_tuples ([], names = [0 , 1 ])
409
409
410
+ # only include lines with an address, since otherwise
411
+ # missing addresses will be considered a match ("" == "")
412
+ pii_lines_with_address = pii_lines [pii_lines .household_street_address != "" ]
413
+
414
+ if len (pii_lines_with_address ) == 0 :
415
+ # essentially just a null check
416
+ # don't bother with the rest if we have no addresses
417
+ # this should never happen
418
+ return candidate_links
419
+
410
420
# break up the dataframe into subframes,
411
421
# and iterate over every pair of subframes.
412
422
# we improve performance somewhat by only looking forward,
413
423
# that is, only comparing a given set of rows
414
424
# against rows with higher indices.
415
- for subset_A in np .array_split (pii_lines , split_factor ):
425
+ for subset_A in np .array_split (pii_lines_with_address , split_factor ):
416
426
first_item_in_A = subset_A .index .min ()
427
+
417
428
# don't compare against earlier items
418
429
# Note: this assumes that the index is the row number
419
430
# (NOT the record_id/patid) and the df is sequential
420
431
# this is currently the case in households.py#parse_source_file()
421
- lines_to_compare = pii_lines [first_item_in_A :]
432
+ lines_to_compare = pii_lines_with_address [first_item_in_A :]
422
433
423
434
# pick a sub split factor to give us ~same size subset_A and subset_B.
424
435
# the idea is that there's some implicit overhead to splitting,
425
436
# so don't split more than necessary
426
- sub_split_factor = int (len (lines_to_compare ) / len (subset_A ))
437
+ sub_split_factor = max ( int (len (lines_to_compare ) / len (subset_A )), 1 )
427
438
for subset_B in np .array_split (lines_to_compare , sub_split_factor ):
428
439
if debug :
429
440
print (
430
441
f"[{ datetime .now ()} ] Indexing rows "
431
442
f"[{ subset_A .index .min ()} ..{ subset_A .index .max ()} ]"
432
443
" against "
433
444
f"[{ subset_B .index .min ()} ..{ subset_B .index .max ()} ]"
445
+ f". { len (candidate_links )} candidates so far"
434
446
)
435
447
436
448
# note pairs_subset and candidate_links are MultiIndexes
@@ -452,13 +464,6 @@ def get_candidate_links(pii_lines, split_factor=4, exact_addresses=False, debug=
452
464
453
465
gc .collect ()
454
466
455
- # rows with blank address match ("" == "") so drop those here
456
- # TODO: ideally we wouldn't compare blank address lines in the first place
457
- # but the indexing and splitting bits get complicated if we drop them earlier
458
- blank_addresses = pii_lines [pii_lines ["household_street_address" ] == "" ].index
459
- candidate_links = candidate_links .drop (blank_addresses , level = 0 , errors = "ignore" )
460
- candidate_links = candidate_links .drop (blank_addresses , level = 1 , errors = "ignore" )
461
-
462
467
if debug :
463
468
print (f"[{ datetime .now ()} ] Found { len (candidate_links )} candidate pairs" )
464
469
@@ -509,7 +514,7 @@ def get_matching_pairs(
509
514
matching_pairs = pd .MultiIndex .from_tuples ([], names = [0 , 1 ])
510
515
# we know that we could support len(subset_A) in memory above,
511
516
# so use the same amount here
512
- len_subset_A = int (len (pii_lines ) / split_factor )
517
+ len_subset_A = max ( int (len (pii_lines ) / split_factor ), 1 )
513
518
514
519
# note: np.array_split had unexpectedly poor performance here for very large indices
515
520
for i in range (0 , len (candidate_links ), len_subset_A ):
0 commit comments