Skip to content

Commit 38bd3e9

Browse files
authored
Merge pull request #59 from mitre/even_more_fixes
Additional fixes
2 parents 6ec0195 + 1be2469 commit 38bd3e9

File tree

4 files changed

+30
-18
lines changed

4 files changed

+30
-18
lines changed

data_analysis.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import time
55
from datetime import datetime
66

7+
import numpy as np
78
import pandas as pd
89

910
from utils.data_reader import (
@@ -155,8 +156,8 @@ def top_N(series, N=0, lower_limit=1):
155156

156157

157158
def summary(series):
158-
# 1. count the number of missing (null) entries
159-
missing = series.isna().sum()
159+
# 1. count the number of missing (null or blank string) entries
160+
missing = series.replace(r"^\s*$", np.nan, regex=True).isna().sum()
160161

161162
# 2. basic descriptive statistics on the length of the values
162163
length = series.str.len().describe().to_dict()

households/matching.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -360,7 +360,7 @@ def get_household_matches(
360360
pairs_writer.writerow(matching_pairs[i])
361361
print(f"[{datetime.now()}] Wrote matching pairs to {pairs_path}")
362362

363-
five_percent = int(len(matching_pairs) / 20)
363+
five_percent = max(int(len(matching_pairs) / 20), 1)
364364
pos_to_pairs = {}
365365
# note: "for pair in matching_pairs:" had unexpectedly poor performance here
366366
for i in range(len(matching_pairs)):
@@ -407,30 +407,42 @@ def get_candidate_links(pii_lines, split_factor=4, exact_addresses=False, debug=
407407
# start with an empty index we can append to
408408
candidate_links = pd.MultiIndex.from_tuples([], names=[0, 1])
409409

410+
# only include lines with an address, since otherwise
411+
# missing addresses will be considered a match ("" == "")
412+
pii_lines_with_address = pii_lines[pii_lines.household_street_address != ""]
413+
414+
if len(pii_lines_with_address) == 0:
415+
# essentially just a null check
416+
# don't bother with the rest if we have no addresses
417+
# this should never happen
418+
return candidate_links
419+
410420
# break up the dataframe into subframes,
411421
# and iterate over every pair of subframes.
412422
# we improve performance somewhat by only comparing looking forward,
413423
# that is, only comparing a given set of rows
414424
# against rows with higher indices.
415-
for subset_A in np.array_split(pii_lines, split_factor):
425+
for subset_A in np.array_split(pii_lines_with_address, split_factor):
416426
first_item_in_A = subset_A.index.min()
427+
417428
# don't compare against earlier items
418429
# Note: this assumes that the index is the row number
419430
# (NOT the record_id/patid) and the df is sequential
420431
# this is currently the case in households.py#parse_source_file()
421-
lines_to_compare = pii_lines[first_item_in_A:]
432+
lines_to_compare = pii_lines_with_address[first_item_in_A:]
422433

423434
# pick a sub split factor to give us ~same size subset_A and subset_B.
424435
# the idea is that there's some implicit overhead to splitting,
425436
# so don't split more than necessary
426-
sub_split_factor = int(len(lines_to_compare) / len(subset_A))
437+
sub_split_factor = max(int(len(lines_to_compare) / len(subset_A)), 1)
427438
for subset_B in np.array_split(lines_to_compare, sub_split_factor):
428439
if debug:
429440
print(
430441
f"[{datetime.now()}] Indexing rows "
431442
f"[{subset_A.index.min()}..{subset_A.index.max()}]"
432443
" against "
433444
f"[{subset_B.index.min()}..{subset_B.index.max()}]"
445+
f". {len(candidate_links)} candidates so far"
434446
)
435447

436448
# note pairs_subset and candidate_links are MultiIndexes
@@ -452,13 +464,6 @@ def get_candidate_links(pii_lines, split_factor=4, exact_addresses=False, debug=
452464

453465
gc.collect()
454466

455-
# rows with blank address match ("" == "") so drop those here
456-
# TODO: ideally we wouldn't compare blank address lines in the first place
457-
# but the indexing and splitting bits get complicated if we drop them earlier
458-
blank_addresses = pii_lines[pii_lines["household_street_address"] == ""].index
459-
candidate_links = candidate_links.drop(blank_addresses, level=0, errors="ignore")
460-
candidate_links = candidate_links.drop(blank_addresses, level=1, errors="ignore")
461-
462467
if debug:
463468
print(f"[{datetime.now()}] Found {len(candidate_links)} candidate pairs")
464469

@@ -509,7 +514,7 @@ def get_matching_pairs(
509514
matching_pairs = pd.MultiIndex.from_tuples([], names=[0, 1])
510515
# we know that we could support len(subset_A) in memory above,
511516
# so use the same amount here
512-
len_subset_A = int(len(pii_lines) / split_factor)
517+
len_subset_A = max(int(len(pii_lines) / split_factor), 1)
513518

514519
# note: np.array_split had unexpectedly poor performance here for very large indices
515520
for i in range(0, len(candidate_links), len_subset_A):

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ clkhash>=0.16.0
66
psycopg2>=2.8.3
77
anonlink-client==0.1.5
88
ijson>=3.1.2
9-
textdistance[extras]>=4.5.0
9+
textdistance>=4.5.0
1010
usaddress>=0.5.10
1111
pylint>=2.4.2
1212
tqdm>=4.36.1

utils/data_reader.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -124,14 +124,18 @@ def map_key(row, key):
124124
return row_key
125125

126126

127-
def empty_str_from_none(string):
128-
if string is None:
127+
def empty_str_from_none(obj):
128+
if obj is None:
129129
return ""
130+
elif isinstance(obj, pd.Series):
131+
return obj.fillna("")
130132
else:
131-
return string
133+
return obj
132134

133135

134136
def case_insensitive_lookup(row, key, version):
137+
# IMPORTANT: this function gets called from extract.py and data_analysis.py
138+
# with different types for `row`
135139
data_key = DATA_DICTIONARY[version][key]
136140
if isinstance(data_key, list):
137141
first_key = map_key(row, data_key[0])
@@ -141,6 +145,8 @@ def case_insensitive_lookup(row, key, version):
141145
if mapped_subkey:
142146
subdata = empty_str_from_none(row[mapped_subkey])
143147
data = data + " " + subdata
148+
if isinstance(data, pd.Series):
149+
data.name = key
144150

145151
return data
146152

0 commit comments

Comments
 (0)