feat(upload-notes): add ability to tag philter spans, not redact them
This commit deprecates --no-philter in favor of --philter=disable
and adds --philter=label (and the default --philter=redact).

When --philter=label is requested, we send a prediction layer to Label
Studio that highlights all the detected PHI spans for you.

This is helpful when doing manual de-identification, since philter can
do some of the grunt work for you.
mikix committed Mar 4, 2024
1 parent 0d0db81 commit dd9f3cb
Showing 6 changed files with 206 additions and 58 deletions.
12 changes: 11 additions & 1 deletion cumulus_etl/deid/philter.py
@@ -22,12 +22,22 @@ def __init__(self):
filter_config = os.path.join(os.path.dirname(__file__), "philter-config.toml")
self.filters = philter_lite.load_filters(filter_config)

def detect_phi(self, text: str) -> philter_lite.CoordinateMap:
"""
Find PHI spans using Philter.
:param text: the text to check for PHI
:returns: a coordinate map of the text that is safe to keep (its complement marks the PHI spans)
"""
include_map, _, _ = philter_lite.detect_phi(text, self.filters)
return include_map

def scrub_text(self, text: str) -> str:
"""
Scrub text of PHI using Philter.
:param text: the text to scrub
:returns: the scrubbed text, with PHI replaced by asterisks ("*")
"""
include_map, _, _ = philter_lite.detect_phi(text, self.filters)
include_map = self.detect_phi(text)
return philter_lite.transform_text_asterisk(text, include_map)
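As a quick orientation for downstream callers: a minimal sketch of how the new detect_phi() hook pairs with philter_lite's CoordinateMap.get_complement(), the same combination the upload-notes CLI below relies on (the deid import path is an assumption, mirroring how cli.py references deid.Philter):

```python
from cumulus_etl import deid  # assumed import path, mirroring upload_notes/cli.py usage

philter = deid.Philter()
text = "John Smith called on 10/13/2010"

# detect_phi() returns philter_lite's include_map -- the text that is safe to keep.
# Its complement over the original text is therefore the set of PHI spans.
include_map = philter.detect_phi(text)
phi_spans = include_map.get_complement(text)  # e.g. {0: 4, 5: 10} for "John" / "Smith"

for start, stop in phi_spans.items():
    print(f"PHI at [{start}:{stop}]: {text[start:stop]!r}")
```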
41 changes: 31 additions & 10 deletions cumulus_etl/upload_notes/cli.py
@@ -12,6 +12,10 @@
from cumulus_etl.upload_notes import downloader, selector
from cumulus_etl.upload_notes.labelstudio import LabelStudioClient, LabelStudioNote

PHILTER_DISABLE = "disable"
PHILTER_REDACT = "redact"
PHILTER_LABEL = "label"


def init_checks(args: argparse.Namespace):
"""Do any external service checks necessary at the start"""
@@ -61,8 +65,9 @@ async def read_notes_from_ndjson(
for docref in common.read_resource_ndjson(store.Root(dirname), "DocumentReference"):
encounter_refs = docref.get("context", {}).get("encounter", [])
if not encounter_refs:
# If a note doesn't have an encounter - we're in big trouble
raise SystemExit(f"DocumentReference {docref['id']} is missing encounter information.")
# If a note doesn't have an encounter - we can't group it with other docs
print(f"Skipping DocumentReference {docref['id']} as it lacks a linked encounter.")
continue

encounter_ids.append(fhir.unref_resource(encounter_refs[0])[1]) # just use first encounter
docrefs.append(docref)
@@ -117,17 +122,20 @@ async def run_nlp(notes: Collection[LabelStudioNote], args: argparse.Namespace)
client=http_client,
model=nlp.TransformerModel.NEGATION,
)
note.matches = [match for i, match in enumerate(matches) if cnlpt_results[i] == Polarity.pos]
note.ctakes_matches = [match for i, match in enumerate(matches) if cnlpt_results[i] == Polarity.pos]


def philter_notes(notes: Collection[LabelStudioNote], args: argparse.Namespace) -> None:
if not args.philter:
if args.philter == PHILTER_DISABLE:
return

common.print_header("Running philter...")
philter = deid.Philter()
for note in notes:
note.text = philter.scrub_text(note.text)
if args.philter == PHILTER_LABEL:
note.philter_map = philter.detect_phi(note.text).get_complement(note.text)
else:
note.text = philter.scrub_text(note.text)


def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelStudioNote]:
@@ -146,7 +154,8 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
# Group up the text into one big note
for enc_id, enc_notes in by_encounter_id.items():
grouped_text = ""
grouped_matches = []
grouped_ctakes_matches = []
grouped_philter_map = {}
grouped_doc_mappings = {}
grouped_doc_spans = {}

@@ -164,10 +173,13 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
offset_doc_spans = {k: (v[0] + offset, v[1] + offset) for k, v in note.doc_spans.items()}
grouped_doc_spans.update(offset_doc_spans)

for match in note.matches:
for match in note.ctakes_matches:
match.begin += offset
match.end += offset
grouped_matches.append(match)
grouped_ctakes_matches.append(match)

for start, stop in note.philter_map.items():
grouped_philter_map[start + offset] = stop + offset

grouped_notes.append(
LabelStudioNote(
@@ -176,7 +188,8 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
text=grouped_text,
doc_mappings=grouped_doc_mappings,
doc_spans=grouped_doc_spans,
matches=grouped_matches,
ctakes_matches=grouped_ctakes_matches,
philter_map=grouped_philter_map,
)
)
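The offset bookkeeping above just shifts every per-note span by the length of text already accumulated for the encounter. A toy illustration with made-up numbers:

```python
# Hypothetical: 50 characters of earlier notes already sit in grouped_text.
offset = 50
philter_map = {3: 8}      # a PHI span inside this individual note
ctakes_span = (12, 18)    # begin/end of a cTAKES match in the same note

grouped_philter_map = {start + offset: stop + offset for start, stop in philter_map.items()}
grouped_ctakes_span = (ctakes_span[0] + offset, ctakes_span[1] + offset)

assert grouped_philter_map == {53: 58}
assert grouped_ctakes_span == (62, 68)
```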

@@ -208,8 +221,16 @@ def define_upload_notes_parser(parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"--export-to", metavar="PATH", help="Where to put exported documents (default is to delete after use)"
)

parser.add_argument(
"--philter",
choices=[PHILTER_DISABLE, PHILTER_REDACT, PHILTER_LABEL],
default=PHILTER_REDACT,
help="Whether to use philter to redact/tag PHI",
)
# Old, simpler version of the above (feel free to remove after May 2024)
parser.add_argument(
"--no-philter", action="store_false", dest="philter", default=True, help="Don’t run philter on notes"
"--no-philter", action="store_const", const=PHILTER_DISABLE, dest="philter", help=argparse.SUPPRESS
)

cli_utils.add_aws(parser)
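Because --no-philter now simply stores the disable constant into the same dest, the deprecated spelling and the new flag cannot diverge. A standalone sketch of that interaction (trimmed to only these two options, not the full parser):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--philter", choices=["disable", "redact", "label"], default="redact")
parser.add_argument(
    "--no-philter", action="store_const", const="disable", dest="philter", help=argparse.SUPPRESS
)

print(parser.parse_args([]).philter)                      # "redact" (new default)
print(parser.parse_args(["--philter", "label"]).philter)  # "label"
print(parser.parse_args(["--no-philter"]).philter)        # "disable" (deprecated alias)
```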
90 changes: 72 additions & 18 deletions cumulus_etl/upload_notes/labelstudio.py
@@ -37,7 +37,10 @@ class LabelStudioNote:
doc_spans: dict[str, tuple[int, int]] = dataclasses.field(default_factory=dict)

# Matches found by cTAKES
matches: list[ctakesclient.typesystem.MatchText] = dataclasses.field(default_factory=list)
ctakes_matches: list[ctakesclient.typesystem.MatchText] = dataclasses.field(default_factory=list)

# Matches found by Philter
philter_map: dict[int, int] = dataclasses.field(default_factory=dict)


class LabelStudioClient:
@@ -99,29 +102,88 @@ def _format_task_for_note(self, note: LabelStudioNote) -> dict:
"docref_mappings": note.doc_mappings,
"docref_spans": {k: list(v) for k, v in note.doc_spans.items()}, # json doesn't natively have tuples
},
"predictions": [],
}

self._format_prediction(task, note)
# Initialize any used labels in case we have a dynamic label config.
# Label Studio needs to see *something* here
self._update_used_labels(task, [])

self._format_ctakes_predictions(task, note)
self._format_philter_predictions(task, note)

return task

def _format_prediction(self, task: dict, note: LabelStudioNote) -> None:
def _format_ctakes_predictions(self, task: dict, note: LabelStudioNote) -> None:
if not note.ctakes_matches:
return

prediction = {
"model_version": "Cumulus", # TODO: do we want more specificity here?
"model_version": "Cumulus cTAKES",
}

used_labels = set()
results = []
count = 0
for match in note.matches:
for match in note.ctakes_matches:
matched_labels = {self._cui_labels.get(concept.cui) for concept in match.conceptAttributes}
matched_labels.discard(None) # drop the result of a concept not being in our bsv label set
if matched_labels:
results.append(self._format_match(count, match, matched_labels))
results.append(self._format_ctakes_match(count, match, matched_labels))
used_labels.update(matched_labels)
count += 1
prediction["result"] = results
task["predictions"].append(prediction)

self._update_used_labels(task, used_labels)

def _format_ctakes_match(self, count: int, match: MatchText, labels: Iterable[str]) -> dict:
return {
"id": f"ctakes{count}",
"from_name": self._labels_name,
"to_name": self._labels_config["to_name"][0],
"type": "labels",
"value": {"start": match.begin, "end": match.end, "score": 1.0, "text": match.text, "labels": list(labels)},
}

def _format_philter_predictions(self, task: dict, note: LabelStudioNote) -> None:
"""
Adds a prediction layer with philter spans.
Note that this does *not* update the running list of used labels.
This sets a "secret" / non-human-oriented label of "_philter".
Label Studio will still highlight the spans, and this way we won't
conflict with any existing labels.
"""
if not note.philter_map:
return

prediction = {
"model_version": "Cumulus Philter",
}

results = []
count = 0
for start, stop in sorted(note.philter_map.items()):
results.append(self._format_philter_span(count, start, stop, note))
count += 1
prediction["result"] = results

task["predictions"].append(prediction)

def _format_philter_span(self, count: int, start: int, end: int, note: LabelStudioNote) -> dict:
text = note.text[start:end]
return {
"id": f"philter{count}",
"from_name": self._labels_name,
"to_name": self._labels_config["to_name"][0],
"type": "labels",
# We hardcode the label "_philter" - Label Studio will still highlight unknown labels,
# and this is unlikely to collide with existing labels.
"value": {"start": start, "end": end, "score": 1.0, "text": text, "labels": ["_philter"]},
}
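For reference, a single entry in the philter prediction's result list looks roughly like this; the from_name/to_name values depend on the project's labeling config (the names below are illustrative, and the 93-97 offsets match the header-padded note in the tests further down):

```python
# Illustrative output of _format_philter_span(0, 93, 97, note); "label" and "text"
# stand in for whatever the project's <Labels> config actually names.
philter_result = {
    "id": "philter0",
    "from_name": "label",
    "to_name": "text",
    "type": "labels",
    "value": {"start": 93, "end": 97, "score": 1.0, "text": "John", "labels": ["_philter"]},
}
```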

def _update_used_labels(self, task: dict, used_labels: Iterable[str]) -> None:
if self._labels_config.get("dynamic_labels"):
# This path supports configs like <Labels name="label" toName="text" value="$label"/> where the labels
# can be dynamically set by us.
@@ -130,15 +192,7 @@ def _format_prediction(self, task: dict, note: LabelStudioNote) -> None:
# actually kept in the config, so we have to make some assumptions about how the user set up their project.
#
# The rule that Cumulus uses is that the value= variable must equal the name= of the <Labels> element.
task["data"][self._labels_name] = [{"value": x} for x in sorted(used_labels)]

task["predictions"] = [prediction]

def _format_match(self, count: int, match: MatchText, labels: Iterable[str]) -> dict:
return {
"id": f"match{count}",
"from_name": self._labels_name,
"to_name": self._labels_config["to_name"][0],
"type": "labels",
"value": {"start": match.begin, "end": match.end, "score": 1.0, "text": match.text, "labels": list(labels)},
}
existing_labels = task["data"].get(self._labels_name, [])
existing_labels = {d["value"] for d in existing_labels}
existing_labels.update(used_labels)
task["data"][self._labels_name] = [{"value": x} for x in sorted(existing_labels)]
2 changes: 1 addition & 1 deletion docs/chart-review.md
@@ -233,7 +233,7 @@ But you may get better results by adding extra terms and variations in your symp
## Disabling Features

You may not need NLP or `philter` processing.
Simply pass `--no-nlp` or `--no-philter` and those steps will be skipped.
Simply pass `--no-nlp` or `--philter=disable` and those steps will be skipped.

## Label Studio

43 changes: 32 additions & 11 deletions tests/upload_notes/test_upload_cli.py
@@ -77,7 +77,8 @@ async def run_upload_notes(
anon_docrefs=None,
docrefs=None,
nlp=True,
philter=True,
philter=None,
no_philter=None,
overwrite=False,
) -> None:
args = [
@@ -98,7 +99,9 @@
args += ["--docrefs", docrefs]
if not nlp:
args += ["--no-nlp"]
if not philter:
if philter:
args += ["--philter", philter]
if no_philter:
args += ["--no-philter"]
if overwrite:
args += ["--overwrite"]
@@ -314,7 +317,7 @@ async def test_successful_push_to_label_studio(self):
],
"type": "SignSymptomMention",
},
tasks[0].matches[0].as_json(),
tasks[0].ctakes_matches[0].as_json(),
)

@ddt.data(True, False)
@@ -329,27 +332,45 @@ async def test_disabled_nlp(self):
tasks = self.ls_client.push_tasks.call_args[0][0]
self.assertGreater(len(tasks), 0)
for task in tasks:
self.assertEqual([], task.matches)

@ddt.data(True, False)
async def test_philter(self, run_philter):
self.assertEqual([], task.ctakes_matches)

@ddt.data(
({}, True), # default args
({"philter": "redact"}, True),
({"philter": "disable"}, False),
({"no_philter": True}, False),
)
@ddt.unpack
async def test_philter_redact(self, upload_args, expect_redacted):
notes = [LabelStudioNote("EncID", "EncAnon", title="My Title", text="John Smith called on 10/13/2010")]
with mock.patch("cumulus_etl.upload_notes.cli.read_notes_from_ndjson", return_value=notes):
await self.run_upload_notes(philter=run_philter)
await self.run_upload_notes(**upload_args)

tasks = self.ls_client.push_tasks.call_args[0][0]
self.assertEqual(1, len(tasks))
task = tasks[0]

# Regardless of philter, we keep the original cTAKES match text
self.assertEqual({"John", "Smith", "called"}, {m.text for m in task.matches})
self.assertEqual({"John", "Smith", "called"}, {m.text for m in task.ctakes_matches})

if run_philter:
if expect_redacted:
expected_text = "**** ***** called on 10/13/2010" # we don't philter dates
else:
expected_text = "John Smith called on 10/13/2010"
self.assertEqual(self.wrap_note("My Title", expected_text), task.text)

async def test_philter_label(self):
notes = [LabelStudioNote("EncID", "EncAnon", title="My Title", text="John Smith called on 10/13/2010")]
with mock.patch("cumulus_etl.upload_notes.cli.read_notes_from_ndjson", return_value=notes):
await self.run_upload_notes(philter="label")

tasks = self.ls_client.push_tasks.call_args[0][0]
self.assertEqual(1, len(tasks))
task = tasks[0]

# High span numbers because we insert some header text
self.assertEqual({93: 97, 98: 103}, task.philter_map)

@respx.mock(assert_all_mocked=False)
async def test_combined_encounter_offsets(self, respx_mock):
# use server notes just for ease of making fake ones
@@ -388,5 +409,5 @@ async def test_combined_encounter_offsets(self, respx_mock):
self.assertEqual("What's", note.text[match2a[0] : match2a[1]])
self.assertEqual("up", note.text[match2b[0] : match2b[1]])
self.assertEqual("doc?", note.text[match2c[0] : match2c[1]])
spans = {x.span().key() for x in note.matches}
spans = {x.span().key() for x in note.ctakes_matches}
self.assertEqual({match1a, match1b, match1c, match2a, match2b, match2c}, spans)