Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(upload-notes): add ability to tag philter spans, not redact them #297

Merged
merged 2 commits into from
Mar 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion cumulus_etl/deid/philter.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,22 @@ def __init__(self):
filter_config = os.path.join(os.path.dirname(__file__), "philter-config.toml")
self.filters = philter_lite.load_filters(filter_config)

def detect_phi(self, text: str) -> philter_lite.CoordinateMap:
    """
    Locate PHI spans in the given text using Philter.

    :param text: the text to examine for PHI
    :returns: a coordinate map describing where the PHI spans are
    """
    phi_map, _, _ = philter_lite.detect_phi(text, self.filters)
    return phi_map

def scrub_text(self, text: str) -> str:
    """
    Scrub text of PHI using Philter.

    :param text: the text to scrub
    :returns: the scrubbed text, with PHI replaced by asterisks ("*")
    """
    # Reuse detect_phi() so detection logic lives in exactly one place
    # (the text previously held a duplicate, immediately-overwritten call).
    include_map = self.detect_phi(text)
    return philter_lite.transform_text_asterisk(text, include_map)
41 changes: 31 additions & 10 deletions cumulus_etl/upload_notes/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
from cumulus_etl.upload_notes import downloader, selector
from cumulus_etl.upload_notes.labelstudio import LabelStudioClient, LabelStudioNote

# Allowed values for the --philter CLI argument:
# skip philter entirely, redact detected PHI with asterisks,
# or leave the text intact and only tag the detected spans.
PHILTER_DISABLE = "disable"
PHILTER_REDACT = "redact"
PHILTER_LABEL = "label"


def init_checks(args: argparse.Namespace):
"""Do any external service checks necessary at the start"""
Expand Down Expand Up @@ -61,8 +65,9 @@ async def read_notes_from_ndjson(
for docref in common.read_resource_ndjson(store.Root(dirname), "DocumentReference"):
encounter_refs = docref.get("context", {}).get("encounter", [])
if not encounter_refs:
# If a note doesn't have an encounter - we're in big trouble
raise SystemExit(f"DocumentReference {docref['id']} is missing encounter information.")
# If a note doesn't have an encounter - we can't group it with other docs
print(f"Skipping DocumentReference {docref['id']} as it lacks a linked encounter.")
continue

encounter_ids.append(fhir.unref_resource(encounter_refs[0])[1]) # just use first encounter
docrefs.append(docref)
Expand Down Expand Up @@ -117,17 +122,20 @@ async def run_nlp(notes: Collection[LabelStudioNote], args: argparse.Namespace)
client=http_client,
model=nlp.TransformerModel.NEGATION,
)
note.matches = [match for i, match in enumerate(matches) if cnlpt_results[i] == Polarity.pos]
note.ctakes_matches = [match for i, match in enumerate(matches) if cnlpt_results[i] == Polarity.pos]


def philter_notes(notes: Collection[LabelStudioNote], args: argparse.Namespace) -> None:
    """
    Run philter over the given notes, according to the --philter CLI argument.

    In "redact" mode, PHI is replaced with asterisks in the note text itself.
    In "label" mode, the text is left alone and the detected PHI spans are
    recorded on the note (for tagging in Label Studio later).
    """
    if args.philter == PHILTER_DISABLE:
        return

    common.print_header("Running philter...")
    philter = deid.Philter()
    for note in notes:
        if args.philter == PHILTER_LABEL:
            # detect_phi() returns the spans to *keep*; the complement is the PHI
            note.philter_map = philter.detect_phi(note.text).get_complement(note.text)
        else:
            note.text = philter.scrub_text(note.text)


def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelStudioNote]:
Expand All @@ -146,7 +154,8 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
# Group up the text into one big note
for enc_id, enc_notes in by_encounter_id.items():
grouped_text = ""
grouped_matches = []
grouped_ctakes_matches = []
grouped_philter_map = {}
grouped_doc_mappings = {}
grouped_doc_spans = {}

Expand All @@ -164,10 +173,13 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
offset_doc_spans = {k: (v[0] + offset, v[1] + offset) for k, v in note.doc_spans.items()}
grouped_doc_spans.update(offset_doc_spans)

for match in note.matches:
for match in note.ctakes_matches:
match.begin += offset
match.end += offset
grouped_matches.append(match)
grouped_ctakes_matches.append(match)

for start, stop in note.philter_map.items():
grouped_philter_map[start + offset] = stop + offset

grouped_notes.append(
LabelStudioNote(
Expand All @@ -176,7 +188,8 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
text=grouped_text,
doc_mappings=grouped_doc_mappings,
doc_spans=grouped_doc_spans,
matches=grouped_matches,
ctakes_matches=grouped_ctakes_matches,
philter_map=grouped_philter_map,
)
)

Expand Down Expand Up @@ -208,8 +221,16 @@ def define_upload_notes_parser(parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"--export-to", metavar="PATH", help="Where to put exported documents (default is to delete after use)"
)

parser.add_argument(
"--philter",
choices=[PHILTER_DISABLE, PHILTER_REDACT, PHILTER_LABEL],
default=PHILTER_REDACT,
help="Whether to use philter to redact/tag PHI",
)
# Old, simpler version of the above (feel free to remove after May 2024)
parser.add_argument(
"--no-philter", action="store_false", dest="philter", default=True, help="Don’t run philter on notes"
"--no-philter", action="store_const", const=PHILTER_DISABLE, dest="philter", help=argparse.SUPPRESS
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

very kind of you, but i think you could make a breaking change now.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Eh, I like to give some amount of grace time, even to ourselves and our own muscle memory. But I've added a note that the arg can be removed after May 2024, to help avoid it staying around forever.

)

cli_utils.add_aws(parser)
Expand Down
90 changes: 72 additions & 18 deletions cumulus_etl/upload_notes/labelstudio.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@ class LabelStudioNote:
doc_spans: dict[str, tuple[int, int]] = dataclasses.field(default_factory=dict)

# Matches found by cTAKES
matches: list[ctakesclient.typesystem.MatchText] = dataclasses.field(default_factory=list)
ctakes_matches: list[ctakesclient.typesystem.MatchText] = dataclasses.field(default_factory=list)

# Matches found by Philter
philter_map: dict[int, int] = dataclasses.field(default_factory=dict)


class LabelStudioClient:
Expand Down Expand Up @@ -99,29 +102,88 @@ def _format_task_for_note(self, note: LabelStudioNote) -> dict:
"docref_mappings": note.doc_mappings,
"docref_spans": {k: list(v) for k, v in note.doc_spans.items()}, # json doesn't natively have tuples
},
"predictions": [],
}

self._format_prediction(task, note)
# Initialize any used labels in case we have a dynamic label config.
# Label Studio needs to see *something* here
self._update_used_labels(task, [])

self._format_ctakes_predictions(task, note)
self._format_philter_predictions(task, note)

return task

def _format_ctakes_predictions(self, task: dict, note: LabelStudioNote) -> None:
    """
    Adds a prediction layer with the note's positive cTAKES matches.

    Also records any labels actually used, so dynamic label configs can see them.
    """
    if not note.ctakes_matches:
        return

    prediction = {
        "model_version": "Cumulus cTAKES",
    }

    used_labels = set()
    results = []
    count = 0
    for match in note.ctakes_matches:
        matched_labels = {self._cui_labels.get(concept.cui) for concept in match.conceptAttributes}
        matched_labels.discard(None)  # drop the result of a concept not being in our bsv label set
        if matched_labels:
            results.append(self._format_ctakes_match(count, match, matched_labels))
            used_labels.update(matched_labels)
        count += 1
    prediction["result"] = results
    task["predictions"].append(prediction)

    self._update_used_labels(task, used_labels)

def _format_ctakes_match(self, count: int, match: MatchText, labels: Iterable[str]) -> dict:
    """Build a single Label Studio result entry for one cTAKES match."""
    span_value = {
        "start": match.begin,
        "end": match.end,
        "score": 1.0,
        "text": match.text,
        "labels": list(labels),
    }
    return {
        "id": f"ctakes{count}",
        "from_name": self._labels_name,
        "to_name": self._labels_config["to_name"][0],
        "type": "labels",
        "value": span_value,
    }

def _format_philter_predictions(self, task: dict, note: LabelStudioNote) -> None:
    """
    Adds a prediction layer with philter spans.

    Note that this does *not* update the running list of used labels.
    This sets a "secret" / non-human-oriented label of "_philter".
    Label Studio will still highlight the spans, and this way we won't
    conflict with any existing labels.
    """
    if not note.philter_map:
        return

    results = [
        self._format_philter_span(index, start, stop, note)
        for index, (start, stop) in enumerate(sorted(note.philter_map.items()))
    ]

    prediction = {
        "model_version": "Cumulus Philter",
        "result": results,
    }
    task["predictions"].append(prediction)

def _format_philter_span(self, count: int, start: int, end: int, note: LabelStudioNote) -> dict:
    """Build a single Label Studio result entry for one philter span."""
    # We hardcode the label "_philter" - Label Studio will still highlight unknown labels,
    # and this is unlikely to collide with existing labels.
    span_value = {
        "start": start,
        "end": end,
        "score": 1.0,
        "text": note.text[start:end],
        "labels": ["_philter"],
    }
    return {
        "id": f"philter{count}",
        "from_name": self._labels_name,
        "to_name": self._labels_config["to_name"][0],
        "type": "labels",
        "value": span_value,
    }

def _update_used_labels(self, task: dict, used_labels: Iterable[str]) -> None:
if self._labels_config.get("dynamic_labels"):
# This path supports configs like <Labels name="label" toName="text" value="$label"/> where the labels
# can be dynamically set by us.
Expand All @@ -130,15 +192,7 @@ def _format_prediction(self, task: dict, note: LabelStudioNote) -> None:
# actually kept in the config, so we have to make some assumptions about how the user set up their project.
#
# The rule that Cumulus uses is that the value= variable must equal the name= of the <Labels> element.
task["data"][self._labels_name] = [{"value": x} for x in sorted(used_labels)]

task["predictions"] = [prediction]

def _format_match(self, count: int, match: MatchText, labels: Iterable[str]) -> dict:
return {
"id": f"match{count}",
"from_name": self._labels_name,
"to_name": self._labels_config["to_name"][0],
"type": "labels",
"value": {"start": match.begin, "end": match.end, "score": 1.0, "text": match.text, "labels": list(labels)},
}
existing_labels = task["data"].get(self._labels_name, [])
existing_labels = {d["value"] for d in existing_labels}
existing_labels.update(used_labels)
task["data"][self._labels_name] = [{"value": x} for x in sorted(existing_labels)]
2 changes: 1 addition & 1 deletion docs/chart-review.md
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ But you may get better results by adding extra terms and variations in your symp
## Disabling Features

You may not need NLP or `philter` processing.
Simply pass `--no-nlp` or `--no-philter` and those steps will be skipped.
Simply pass `--no-nlp` or `--philter=disable` and those steps will be skipped.

## Label Studio

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ tests = [
"coverage",
"ddt",
"freezegun",
"moto[server,s3]",
"moto[server,s3] >= 5.0",
"pytest",
"respx",
]
Expand Down
2 changes: 1 addition & 1 deletion tests/s3mock.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def setUp(self):
self.server = ThreadedMotoServer()
self.server.start()

s3mock = moto.mock_s3()
s3mock = moto.mock_aws()
self.addCleanup(s3mock.stop)
s3mock.start()

Expand Down
43 changes: 32 additions & 11 deletions tests/upload_notes/test_upload_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ async def run_upload_notes(
anon_docrefs=None,
docrefs=None,
nlp=True,
philter=True,
philter=None,
no_philter=None,
overwrite=False,
) -> None:
args = [
Expand All @@ -98,7 +99,9 @@ async def run_upload_notes(
args += ["--docrefs", docrefs]
if not nlp:
args += ["--no-nlp"]
if not philter:
if philter:
args += ["--philter", philter]
if no_philter:
args += ["--no-philter"]
if overwrite:
args += ["--overwrite"]
Expand Down Expand Up @@ -314,7 +317,7 @@ async def test_successful_push_to_label_studio(self):
],
"type": "SignSymptomMention",
},
tasks[0].matches[0].as_json(),
tasks[0].ctakes_matches[0].as_json(),
)

@ddt.data(True, False)
Expand All @@ -329,27 +332,45 @@ async def test_disabled_nlp(self):
tasks = self.ls_client.push_tasks.call_args[0][0]
self.assertGreater(len(tasks), 0)
for task in tasks:
self.assertEqual([], task.matches)

@ddt.data(True, False)
async def test_philter(self, run_philter):
self.assertEqual([], task.ctakes_matches)

@ddt.data(
({}, True), # default args
({"philter": "redact"}, True),
({"philter": "disable"}, False),
({"no_philter": True}, False),
)
@ddt.unpack
async def test_philter_redact(self, upload_args, expect_redacted):
notes = [LabelStudioNote("EncID", "EncAnon", title="My Title", text="John Smith called on 10/13/2010")]
with mock.patch("cumulus_etl.upload_notes.cli.read_notes_from_ndjson", return_value=notes):
await self.run_upload_notes(philter=run_philter)
await self.run_upload_notes(**upload_args)

tasks = self.ls_client.push_tasks.call_args[0][0]
self.assertEqual(1, len(tasks))
task = tasks[0]

# Regardless of philter, we keep the original cTAKES match text
self.assertEqual({"John", "Smith", "called"}, {m.text for m in task.matches})
self.assertEqual({"John", "Smith", "called"}, {m.text for m in task.ctakes_matches})

if run_philter:
if expect_redacted:
expected_text = "**** ***** called on 10/13/2010" # we don't philter dates
else:
expected_text = "John Smith called on 10/13/2010"
self.assertEqual(self.wrap_note("My Title", expected_text), task.text)

async def test_philter_label(self):
    """Verify that --philter=label records PHI spans on the note instead of redacting the text."""
    notes = [LabelStudioNote("EncID", "EncAnon", title="My Title", text="John Smith called on 10/13/2010")]
    with mock.patch("cumulus_etl.upload_notes.cli.read_notes_from_ndjson", return_value=notes):
        await self.run_upload_notes(philter="label")

    # Grab the tasks that would have been pushed to Label Studio
    tasks = self.ls_client.push_tasks.call_args[0][0]
    self.assertEqual(1, len(tasks))
    task = tasks[0]

    # High span numbers because we insert some header text
    # (spans here presumably cover "John" and "Smith" — confirm against wrap_note's header length)
    self.assertEqual({93: 97, 98: 103}, task.philter_map)

@respx.mock(assert_all_mocked=False)
async def test_combined_encounter_offsets(self, respx_mock):
# use server notes just for ease of making fake ones
Expand Down Expand Up @@ -388,5 +409,5 @@ async def test_combined_encounter_offsets(self, respx_mock):
self.assertEqual("What's", note.text[match2a[0] : match2a[1]])
self.assertEqual("up", note.text[match2b[0] : match2b[1]])
self.assertEqual("doc?", note.text[match2c[0] : match2c[1]])
spans = {x.span().key() for x in note.matches}
spans = {x.span().key() for x in note.ctakes_matches}
self.assertEqual({match1a, match1b, match1c, match2a, match2b, match2c}, spans)
Loading
Loading