diff --git a/cumulus_etl/deid/philter.py b/cumulus_etl/deid/philter.py
index bfbda4e7..1dd7accd 100644
--- a/cumulus_etl/deid/philter.py
+++ b/cumulus_etl/deid/philter.py
@@ -22,6 +22,16 @@ def __init__(self):
filter_config = os.path.join(os.path.dirname(__file__), "philter-config.toml")
self.filters = philter_lite.load_filters(filter_config)
+ def detect_phi(self, text: str) -> philter_lite.CoordinateMap:
+ """
+ Find PHI spans using Philter.
+
+ :param text: the text to scan for PHI
+ :returns: a map of where the spans to scrub are
+ """
+ include_map, _, _ = philter_lite.detect_phi(text, self.filters)
+ return include_map
+
def scrub_text(self, text: str) -> str:
"""
Scrub text of PHI using Philter.
@@ -29,5 +39,5 @@ def scrub_text(self, text: str) -> str:
:param text: the text to scrub
:returns: the scrubbed text, with PHI replaced by asterisks ("*")
"""
- include_map, _, _ = philter_lite.detect_phi(text, self.filters)
+ include_map = self.detect_phi(text)
return philter_lite.transform_text_asterisk(text, include_map)
diff --git a/cumulus_etl/upload_notes/cli.py b/cumulus_etl/upload_notes/cli.py
index e179d294..d840d8aa 100644
--- a/cumulus_etl/upload_notes/cli.py
+++ b/cumulus_etl/upload_notes/cli.py
@@ -12,6 +12,10 @@
from cumulus_etl.upload_notes import downloader, selector
from cumulus_etl.upload_notes.labelstudio import LabelStudioClient, LabelStudioNote
+PHILTER_DISABLE = "disable"
+PHILTER_REDACT = "redact"
+PHILTER_LABEL = "label"
+
def init_checks(args: argparse.Namespace):
"""Do any external service checks necessary at the start"""
@@ -61,8 +65,9 @@ async def read_notes_from_ndjson(
for docref in common.read_resource_ndjson(store.Root(dirname), "DocumentReference"):
encounter_refs = docref.get("context", {}).get("encounter", [])
if not encounter_refs:
- # If a note doesn't have an encounter - we're in big trouble
- raise SystemExit(f"DocumentReference {docref['id']} is missing encounter information.")
+ # If a note doesn't have an encounter - we can't group it with other docs
+ print(f"Skipping DocumentReference {docref['id']} as it lacks a linked encounter.")
+ continue
encounter_ids.append(fhir.unref_resource(encounter_refs[0])[1]) # just use first encounter
docrefs.append(docref)
@@ -117,17 +122,20 @@ async def run_nlp(notes: Collection[LabelStudioNote], args: argparse.Namespace)
client=http_client,
model=nlp.TransformerModel.NEGATION,
)
- note.matches = [match for i, match in enumerate(matches) if cnlpt_results[i] == Polarity.pos]
+ note.ctakes_matches = [match for i, match in enumerate(matches) if cnlpt_results[i] == Polarity.pos]
def philter_notes(notes: Collection[LabelStudioNote], args: argparse.Namespace) -> None:
- if not args.philter:
+ if args.philter == PHILTER_DISABLE:
return
common.print_header("Running philter...")
philter = deid.Philter()
for note in notes:
- note.text = philter.scrub_text(note.text)
+ if args.philter == PHILTER_LABEL:
+ note.philter_map = philter.detect_phi(note.text).get_complement(note.text)
+ else:
+ note.text = philter.scrub_text(note.text)
def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelStudioNote]:
@@ -146,7 +154,8 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
# Group up the text into one big note
for enc_id, enc_notes in by_encounter_id.items():
grouped_text = ""
- grouped_matches = []
+ grouped_ctakes_matches = []
+ grouped_philter_map = {}
grouped_doc_mappings = {}
grouped_doc_spans = {}
@@ -164,10 +173,13 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
offset_doc_spans = {k: (v[0] + offset, v[1] + offset) for k, v in note.doc_spans.items()}
grouped_doc_spans.update(offset_doc_spans)
- for match in note.matches:
+ for match in note.ctakes_matches:
match.begin += offset
match.end += offset
- grouped_matches.append(match)
+ grouped_ctakes_matches.append(match)
+
+ for start, stop in note.philter_map.items():
+ grouped_philter_map[start + offset] = stop + offset
grouped_notes.append(
LabelStudioNote(
@@ -176,7 +188,8 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
text=grouped_text,
doc_mappings=grouped_doc_mappings,
doc_spans=grouped_doc_spans,
- matches=grouped_matches,
+ ctakes_matches=grouped_ctakes_matches,
+ philter_map=grouped_philter_map,
)
)
@@ -208,8 +221,16 @@ def define_upload_notes_parser(parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"--export-to", metavar="PATH", help="Where to put exported documents (default is to delete after use)"
)
+
+ parser.add_argument(
+ "--philter",
+ choices=[PHILTER_DISABLE, PHILTER_REDACT, PHILTER_LABEL],
+ default=PHILTER_REDACT,
+ help="Whether to use philter to redact/tag PHI",
+ )
+ # Old, simpler version of the above (feel free to remove after May 2024)
parser.add_argument(
- "--no-philter", action="store_false", dest="philter", default=True, help="Don’t run philter on notes"
+ "--no-philter", action="store_const", const=PHILTER_DISABLE, dest="philter", help=argparse.SUPPRESS
)
cli_utils.add_aws(parser)
diff --git a/cumulus_etl/upload_notes/labelstudio.py b/cumulus_etl/upload_notes/labelstudio.py
index e34686aa..d6439fdb 100644
--- a/cumulus_etl/upload_notes/labelstudio.py
+++ b/cumulus_etl/upload_notes/labelstudio.py
@@ -37,7 +37,10 @@ class LabelStudioNote:
doc_spans: dict[str, tuple[int, int]] = dataclasses.field(default_factory=dict)
# Matches found by cTAKES
- matches: list[ctakesclient.typesystem.MatchText] = dataclasses.field(default_factory=list)
+ ctakes_matches: list[ctakesclient.typesystem.MatchText] = dataclasses.field(default_factory=list)
+
+ # Matches found by Philter
+ philter_map: dict[int, int] = dataclasses.field(default_factory=dict)
class LabelStudioClient:
@@ -99,29 +102,88 @@ def _format_task_for_note(self, note: LabelStudioNote) -> dict:
"docref_mappings": note.doc_mappings,
"docref_spans": {k: list(v) for k, v in note.doc_spans.items()}, # json doesn't natively have tuples
},
+ "predictions": [],
}
- self._format_prediction(task, note)
+ # Initialize any used labels in case we have a dynamic label config.
+ # Label Studio needs to see *something* here
+ self._update_used_labels(task, [])
+
+ self._format_ctakes_predictions(task, note)
+ self._format_philter_predictions(task, note)
return task
- def _format_prediction(self, task: dict, note: LabelStudioNote) -> None:
+ def _format_ctakes_predictions(self, task: dict, note: LabelStudioNote) -> None:
+ if not note.ctakes_matches:
+ return
+
prediction = {
- "model_version": "Cumulus", # TODO: do we want more specificity here?
+ "model_version": "Cumulus cTAKES",
}
used_labels = set()
results = []
count = 0
- for match in note.matches:
+ for match in note.ctakes_matches:
matched_labels = {self._cui_labels.get(concept.cui) for concept in match.conceptAttributes}
matched_labels.discard(None) # drop the result of a concept not being in our bsv label set
if matched_labels:
- results.append(self._format_match(count, match, matched_labels))
+ results.append(self._format_ctakes_match(count, match, matched_labels))
used_labels.update(matched_labels)
count += 1
prediction["result"] = results
+ task["predictions"].append(prediction)
+
+ self._update_used_labels(task, used_labels)
+
+ def _format_ctakes_match(self, count: int, match: MatchText, labels: Iterable[str]) -> dict:
+ return {
+ "id": f"ctakes{count}",
+ "from_name": self._labels_name,
+ "to_name": self._labels_config["to_name"][0],
+ "type": "labels",
+ "value": {"start": match.begin, "end": match.end, "score": 1.0, "text": match.text, "labels": list(labels)},
+ }
+
+ def _format_philter_predictions(self, task: dict, note: LabelStudioNote) -> None:
+ """
+ Adds a prediction layer with philter spans.
+ Note that this does *not* update the running list of used labels.
+ This sets a "secret" / non-human-oriented label of "_philter".
+ Label Studio will still highlight the spans, and this way we won't
+ conflict with any existing labels.
+ """
+ if not note.philter_map:
+ return
+
+ prediction = {
+ "model_version": "Cumulus Philter",
+ }
+
+ results = []
+ count = 0
+ for start, stop in sorted(note.philter_map.items()):
+ results.append(self._format_philter_span(count, start, stop, note))
+ count += 1
+ prediction["result"] = results
+
+ task["predictions"].append(prediction)
+
+ def _format_philter_span(self, count: int, start: int, end: int, note: LabelStudioNote) -> dict:
+ text = note.text[start:end]
+ return {
+ "id": f"philter{count}",
+ "from_name": self._labels_name,
+ "to_name": self._labels_config["to_name"][0],
+ "type": "labels",
+ # We hardcode the label "_philter" - Label Studio will still highlight unknown labels,
+ # and this is unlikely to collide with existing labels.
+ "value": {"start": start, "end": end, "score": 1.0, "text": text, "labels": ["_philter"]},
+ }
+
+ def _update_used_labels(self, task: dict, used_labels: Iterable[str]) -> None:
if self._labels_config.get("dynamic_labels"):
# This path supports configs like where the labels
# can be dynamically set by us.
@@ -130,15 +192,7 @@ def _format_prediction(self, task: dict, note: LabelStudioNote) -> None:
# actually kept in the config, so we have to make some assumptions about how the user set up their project.
#
# The rule that Cumulus uses is that the value= variable must equal the name= of the element.
- task["data"][self._labels_name] = [{"value": x} for x in sorted(used_labels)]
-
- task["predictions"] = [prediction]
-
- def _format_match(self, count: int, match: MatchText, labels: Iterable[str]) -> dict:
- return {
- "id": f"match{count}",
- "from_name": self._labels_name,
- "to_name": self._labels_config["to_name"][0],
- "type": "labels",
- "value": {"start": match.begin, "end": match.end, "score": 1.0, "text": match.text, "labels": list(labels)},
- }
+ existing_labels = task["data"].get(self._labels_name, [])
+ existing_labels = {d["value"] for d in existing_labels}
+ existing_labels.update(used_labels)
+ task["data"][self._labels_name] = [{"value": x} for x in sorted(existing_labels)]
diff --git a/docs/chart-review.md b/docs/chart-review.md
index dccb4e4d..293b3ed4 100644
--- a/docs/chart-review.md
+++ b/docs/chart-review.md
@@ -233,7 +233,7 @@ But you may get better results by adding extra terms and variations in your symp
## Disabling Features
You may not need NLP or `philter` processing.
-Simply pass `--no-nlp` or `--no-philter` and those steps will be skipped.
+Simply pass `--no-nlp` or `--philter=disable` and those steps will be skipped.
## Label Studio
diff --git a/pyproject.toml b/pyproject.toml
index 502a852f..45c45e6c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,7 +70,7 @@ tests = [
"coverage",
"ddt",
"freezegun",
- "moto[server,s3]",
+ "moto[server,s3] >= 5.0",
"pytest",
"respx",
]
diff --git a/tests/s3mock.py b/tests/s3mock.py
index 3ca089f8..befdc934 100644
--- a/tests/s3mock.py
+++ b/tests/s3mock.py
@@ -39,7 +39,7 @@ def setUp(self):
self.server = ThreadedMotoServer()
self.server.start()
- s3mock = moto.mock_s3()
+ s3mock = moto.mock_aws()
self.addCleanup(s3mock.stop)
s3mock.start()
diff --git a/tests/upload_notes/test_upload_cli.py b/tests/upload_notes/test_upload_cli.py
index 365c2258..6468533f 100644
--- a/tests/upload_notes/test_upload_cli.py
+++ b/tests/upload_notes/test_upload_cli.py
@@ -77,7 +77,8 @@ async def run_upload_notes(
anon_docrefs=None,
docrefs=None,
nlp=True,
- philter=True,
+ philter=None,
+ no_philter=None,
overwrite=False,
) -> None:
args = [
@@ -98,7 +99,9 @@ async def run_upload_notes(
args += ["--docrefs", docrefs]
if not nlp:
args += ["--no-nlp"]
- if not philter:
+ if philter:
+ args += ["--philter", philter]
+ if no_philter:
args += ["--no-philter"]
if overwrite:
args += ["--overwrite"]
@@ -314,7 +317,7 @@ async def test_successful_push_to_label_studio(self):
],
"type": "SignSymptomMention",
},
- tasks[0].matches[0].as_json(),
+ tasks[0].ctakes_matches[0].as_json(),
)
@ddt.data(True, False)
@@ -329,27 +332,45 @@ async def test_disabled_nlp(self):
tasks = self.ls_client.push_tasks.call_args[0][0]
self.assertGreater(len(tasks), 0)
for task in tasks:
- self.assertEqual([], task.matches)
-
- @ddt.data(True, False)
- async def test_philter(self, run_philter):
+ self.assertEqual([], task.ctakes_matches)
+
+ @ddt.data(
+ ({}, True), # default args
+ ({"philter": "redact"}, True),
+ ({"philter": "disable"}, False),
+ ({"no_philter": True}, False),
+ )
+ @ddt.unpack
+ async def test_philter_redact(self, upload_args, expect_redacted):
notes = [LabelStudioNote("EncID", "EncAnon", title="My Title", text="John Smith called on 10/13/2010")]
with mock.patch("cumulus_etl.upload_notes.cli.read_notes_from_ndjson", return_value=notes):
- await self.run_upload_notes(philter=run_philter)
+ await self.run_upload_notes(**upload_args)
tasks = self.ls_client.push_tasks.call_args[0][0]
self.assertEqual(1, len(tasks))
task = tasks[0]
# Regardless of philter, we keep the original cTAKES match text
- self.assertEqual({"John", "Smith", "called"}, {m.text for m in task.matches})
+ self.assertEqual({"John", "Smith", "called"}, {m.text for m in task.ctakes_matches})
- if run_philter:
+ if expect_redacted:
expected_text = "**** ***** called on 10/13/2010" # we don't philter dates
else:
expected_text = "John Smith called on 10/13/2010"
self.assertEqual(self.wrap_note("My Title", expected_text), task.text)
+ async def test_philter_label(self):
+ notes = [LabelStudioNote("EncID", "EncAnon", title="My Title", text="John Smith called on 10/13/2010")]
+ with mock.patch("cumulus_etl.upload_notes.cli.read_notes_from_ndjson", return_value=notes):
+ await self.run_upload_notes(philter="label")
+
+ tasks = self.ls_client.push_tasks.call_args[0][0]
+ self.assertEqual(1, len(tasks))
+ task = tasks[0]
+
+ # High span numbers because we insert some header text
+ self.assertEqual({93: 97, 98: 103}, task.philter_map)
+
@respx.mock(assert_all_mocked=False)
async def test_combined_encounter_offsets(self, respx_mock):
# use server notes just for ease of making fake ones
@@ -388,5 +409,5 @@ async def test_combined_encounter_offsets(self, respx_mock):
self.assertEqual("What's", note.text[match2a[0] : match2a[1]])
self.assertEqual("up", note.text[match2b[0] : match2b[1]])
self.assertEqual("doc?", note.text[match2c[0] : match2c[1]])
- spans = {x.span().key() for x in note.matches}
+ spans = {x.span().key() for x in note.ctakes_matches}
self.assertEqual({match1a, match1b, match1c, match2a, match2b, match2c}, spans)
diff --git a/tests/upload_notes/test_upload_labelstudio.py b/tests/upload_notes/test_upload_labelstudio.py
index c1cf1b8e..191e3e80 100644
--- a/tests/upload_notes/test_upload_labelstudio.py
+++ b/tests/upload_notes/test_upload_labelstudio.py
@@ -25,7 +25,7 @@ def setUp(self):
self.ls_project.parsed_label_config = {"mylabel": {"type": "Labels", "to_name": ["mytext"]}}
@staticmethod
- def make_note(*, enc_id: str = "enc", matches: bool = True) -> LabelStudioNote:
+ def make_note(*, enc_id: str = "enc", ctakes: bool = True, philter_label: bool = True) -> LabelStudioNote:
text = "Normal note text"
note = LabelStudioNote(
enc_id,
@@ -34,8 +34,11 @@ def make_note(*, enc_id: str = "enc", matches: bool = True) -> LabelStudioNote:
doc_spans={"doc": (0, len(text))},
text=text,
)
- if matches:
- note.matches = ctakesmock.fake_ctakes_extract(note.text).list_match(polarity=Polarity.pos)
+ if ctakes:
+ note.ctakes_matches = ctakesmock.fake_ctakes_extract(note.text).list_match(polarity=Polarity.pos)
+ if philter_label:
+ matches = ctakesmock.fake_ctakes_extract(note.text).list_match(polarity=Polarity.pos)
+ note.philter_map = {m.begin: m.end for m in matches}
return note
@staticmethod
@@ -72,12 +75,12 @@ def test_basic_push(self):
},
"predictions": [
{
- "model_version": "Cumulus",
+ "model_version": "Cumulus cTAKES",
"result": [
# Note that fever does not show up, as it was not in our initial CUI mapping (in push_tasks)
{
"from_name": "mylabel",
- "id": "match0",
+ "id": "ctakes0",
"to_name": "mytext",
"type": "labels",
"value": {
@@ -90,7 +93,7 @@ def test_basic_push(self):
},
{
"from_name": "mylabel",
- "id": "match1",
+ "id": "ctakes1",
"to_name": "mytext",
"type": "labels",
"value": {
@@ -102,14 +105,58 @@ def test_basic_push(self):
},
},
],
- }
+ },
+ {
+ "model_version": "Cumulus Philter",
+ "result": [
+ {
+ "from_name": "mylabel",
+ "id": "philter0",
+ "to_name": "mytext",
+ "type": "labels",
+ "value": {
+ "end": 6,
+ "labels": ["_philter"],
+ "score": 1.0,
+ "start": 0,
+ "text": "Normal",
+ },
+ },
+ {
+ "from_name": "mylabel",
+ "id": "philter1",
+ "to_name": "mytext",
+ "type": "labels",
+ "value": {
+ "end": 11,
+ "labels": ["_philter"],
+ "score": 1.0,
+ "start": 7,
+ "text": "note",
+ },
+ },
+ {
+ "from_name": "mylabel",
+ "id": "philter2",
+ "to_name": "mytext",
+ "type": "labels",
+ "value": {
+ "end": 16,
+ "labels": ["_philter"],
+ "score": 1.0,
+ "start": 12,
+ "text": "text",
+ },
+ },
+ ],
+ },
],
},
self.get_pushed_task(),
)
- def test_no_matches(self):
- self.push_tasks(self.make_note(matches=False))
+ def test_no_predictions(self):
+ self.push_tasks(self.make_note(ctakes=False, philter_label=False))
self.assertEqual(
{
"data": {
@@ -119,12 +166,7 @@ def test_no_matches(self):
"docref_mappings": {"doc": "doc-anon"},
"docref_spans": {"doc": [0, 16]},
},
- "predictions": [
- {
- "model_version": "Cumulus",
- "result": [],
- }
- ],
+ "predictions": [],
},
self.get_pushed_task(),
)
@@ -150,11 +192,11 @@ def test_dynamic_labels(self, label_type):
self.get_pushed_task()["data"],
)
- def test_dynamic_labels_no_matches(self):
+ def test_dynamic_labels_no_predictions(self):
self.ls_project.parsed_label_config = {
"mylabel": {"type": "Labels", "to_name": ["mytext"], "dynamic_labels": True},
}
- self.push_tasks(self.make_note(matches=False))
+ self.push_tasks(self.make_note(ctakes=False, philter_label=False))
self.assertEqual(
{
"text": "Normal note text",