feat(upload-notes): add ability to tag philter spans, not redact them
This commit deprecates --no-philter in favor of --philter=disable
and adds --philter=label (and the default --philter=redact).

When --philter=label is requested, we send a prediction layer to Label
Studio that highlights all the detected PHI spans for you.

This is helpful when doing manual de-identification, since philter can
do some of the grunt work for you.
mikix committed Mar 4, 2024
1 parent 0d0db81 commit dd9f3cb
Showing 6 changed files with 206 additions and 58 deletions.
12 changes: 11 additions & 1 deletion cumulus_etl/deid/philter.py
@@ -22,12 +22,22 @@ def __init__(self):
filter_config = os.path.join(os.path.dirname(__file__), "philter-config.toml")
self.filters = philter_lite.load_filters(filter_config)

def detect_phi(self, text: str) -> philter_lite.CoordinateMap:
"""
Find PHI spans using Philter.
:param text: the text to check for PHI
:returns: a coordinate map of the text that is safe to keep (its complement marks the PHI spans)
"""
include_map, _, _ = philter_lite.detect_phi(text, self.filters)
return include_map

def scrub_text(self, text: str) -> str:
"""
Scrub text of PHI using Philter.
:param text: the text to scrub
:returns: the scrubbed text, with PHI replaced by asterisks ("*")
"""
include_map, _, _ = philter_lite.detect_phi(text, self.filters)
include_map = self.detect_phi(text)
return philter_lite.transform_text_asterisk(text, include_map)
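As a quick orientation for downstream callers: a minimal sketch of how the new detect_phi() hook pairs with philter_lite's CoordinateMap.get_complement(), the same combination the upload-notes CLI below relies on (the deid import path is an assumption, mirroring how cli.py references deid.Philter):

```python
from cumulus_etl import deid  # assumed import path, mirroring upload_notes/cli.py usage

philter = deid.Philter()
text = "John Smith called on 10/13/2010"

# detect_phi() returns philter_lite's include_map -- the text that is safe to keep.
# Its complement over the original text is therefore the set of PHI spans.
include_map = philter.detect_phi(text)
phi_spans = include_map.get_complement(text)  # e.g. {0: 4, 5: 10} for "John" / "Smith"

for start, stop in phi_spans.items():
    print(f"PHI at [{start}:{stop}]: {text[start:stop]!r}")
```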
41 changes: 31 additions & 10 deletions cumulus_etl/upload_notes/cli.py
@@ -12,6 +12,10 @@
from cumulus_etl.upload_notes import downloader, selector
from cumulus_etl.upload_notes.labelstudio import LabelStudioClient, LabelStudioNote

PHILTER_DISABLE = "disable"
PHILTER_REDACT = "redact"
PHILTER_LABEL = "label"


def init_checks(args: argparse.Namespace):
"""Do any external service checks necessary at the start"""
@@ -61,8 +65,9 @@ async def read_notes_from_ndjson(
for docref in common.read_resource_ndjson(store.Root(dirname), "DocumentReference"):
encounter_refs = docref.get("context", {}).get("encounter", [])
if not encounter_refs:
# If a note doesn't have an encounter - we're in big trouble
raise SystemExit(f"DocumentReference {docref['id']} is missing encounter information.")
# If a note doesn't have an encounter - we can't group it with other docs
print(f"Skipping DocumentReference {docref['id']} as it lacks a linked encounter.")
continue

encounter_ids.append(fhir.unref_resource(encounter_refs[0])[1]) # just use first encounter
docrefs.append(docref)
@@ -117,17 +122,20 @@ async def run_nlp(notes: Collection[LabelStudioNote], args: argparse.Namespace)
client=http_client,
model=nlp.TransformerModel.NEGATION,
)
note.matches = [match for i, match in enumerate(matches) if cnlpt_results[i] == Polarity.pos]
note.ctakes_matches = [match for i, match in enumerate(matches) if cnlpt_results[i] == Polarity.pos]


def philter_notes(notes: Collection[LabelStudioNote], args: argparse.Namespace) -> None:
if not args.philter:
if args.philter == PHILTER_DISABLE:
return

common.print_header("Running philter...")
philter = deid.Philter()
for note in notes:
note.text = philter.scrub_text(note.text)
if args.philter == PHILTER_LABEL:
note.philter_map = philter.detect_phi(note.text).get_complement(note.text)
else:
note.text = philter.scrub_text(note.text)


def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelStudioNote]:
@@ -146,7 +154,8 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
# Group up the text into one big note
for enc_id, enc_notes in by_encounter_id.items():
grouped_text = ""
grouped_matches = []
grouped_ctakes_matches = []
grouped_philter_map = {}
grouped_doc_mappings = {}
grouped_doc_spans = {}

@@ -164,10 +173,13 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
offset_doc_spans = {k: (v[0] + offset, v[1] + offset) for k, v in note.doc_spans.items()}
grouped_doc_spans.update(offset_doc_spans)

for match in note.matches:
for match in note.ctakes_matches:
match.begin += offset
match.end += offset
grouped_matches.append(match)
grouped_ctakes_matches.append(match)

for start, stop in note.philter_map.items():
grouped_philter_map[start + offset] = stop + offset

grouped_notes.append(
LabelStudioNote(
@@ -176,7 +188,8 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
text=grouped_text,
doc_mappings=grouped_doc_mappings,
doc_spans=grouped_doc_spans,
matches=grouped_matches,
ctakes_matches=grouped_ctakes_matches,
philter_map=grouped_philter_map,
)
)
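The offset bookkeeping above just shifts every per-note span by the length of text already accumulated for the encounter. A toy illustration with made-up numbers:

```python
# Hypothetical: 50 characters of earlier notes already sit in grouped_text.
offset = 50
philter_map = {3: 8}      # a PHI span inside this individual note
ctakes_span = (12, 18)    # begin/end of a cTAKES match in the same note

grouped_philter_map = {start + offset: stop + offset for start, stop in philter_map.items()}
grouped_ctakes_span = (ctakes_span[0] + offset, ctakes_span[1] + offset)

assert grouped_philter_map == {53: 58}
assert grouped_ctakes_span == (62, 68)
```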

@@ -208,8 +221,16 @@ def define_upload_notes_parser(parser: argparse.ArgumentParser) -> None:
parser.add_argument(
"--export-to", metavar="PATH", help="Where to put exported documents (default is to delete after use)"
)

parser.add_argument(
"--philter",
choices=[PHILTER_DISABLE, PHILTER_REDACT, PHILTER_LABEL],
default=PHILTER_REDACT,
help="Whether to use philter to redact/tag PHI",
)
# Old, simpler version of the above (feel free to remove after May 2024)
parser.add_argument(
"--no-philter", action="store_false", dest="philter", default=True, help="Don’t run philter on notes"
"--no-philter", action="store_const", const=PHILTER_DISABLE, dest="philter", help=argparse.SUPPRESS
)

cli_utils.add_aws(parser)
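Because --no-philter now simply stores the disable constant into the same dest, the deprecated spelling and the new flag cannot diverge. A standalone sketch of that interaction (trimmed to only these two options, not the full parser):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--philter", choices=["disable", "redact", "label"], default="redact")
parser.add_argument(
    "--no-philter", action="store_const", const="disable", dest="philter", help=argparse.SUPPRESS
)

print(parser.parse_args([]).philter)                      # "redact" (new default)
print(parser.parse_args(["--philter", "label"]).philter)  # "label"
print(parser.parse_args(["--no-philter"]).philter)        # "disable" (deprecated alias)
```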
90 changes: 72 additions & 18 deletions cumulus_etl/upload_notes/labelstudio.py
@@ -37,7 +37,10 @@ class LabelStudioNote:
doc_spans: dict[str, tuple[int, int]] = dataclasses.field(default_factory=dict)

# Matches found by cTAKES
matches: list[ctakesclient.typesystem.MatchText] = dataclasses.field(default_factory=list)
ctakes_matches: list[ctakesclient.typesystem.MatchText] = dataclasses.field(default_factory=list)

# Matches found by Philter
philter_map: dict[int, int] = dataclasses.field(default_factory=dict)


class LabelStudioClient:
@@ -99,29 +102,88 @@ def _format_task_for_note(self, note: LabelStudioNote) -> dict:
"docref_mappings": note.doc_mappings,
"docref_spans": {k: list(v) for k, v in note.doc_spans.items()}, # json doesn't natively have tuples
},
"predictions": [],
}

self._format_prediction(task, note)
# Initialize any used labels in case we have a dynamic label config.
# Label Studio needs to see *something* here
self._update_used_labels(task, [])

self._format_ctakes_predictions(task, note)
self._format_philter_predictions(task, note)

return task

def _format_prediction(self, task: dict, note: LabelStudioNote) -> None:
def _format_ctakes_predictions(self, task: dict, note: LabelStudioNote) -> None:
if not note.ctakes_matches:
return

prediction = {
"model_version": "Cumulus", # TODO: do we want more specificity here?
"model_version": "Cumulus cTAKES",
}

used_labels = set()
results = []
count = 0
for match in note.matches:
for match in note.ctakes_matches:
matched_labels = {self._cui_labels.get(concept.cui) for concept in match.conceptAttributes}
matched_labels.discard(None) # drop the result of a concept not being in our bsv label set
if matched_labels:
results.append(self._format_match(count, match, matched_labels))
results.append(self._format_ctakes_match(count, match, matched_labels))
used_labels.update(matched_labels)
count += 1
prediction["result"] = results
task["predictions"].append(prediction)

self._update_used_labels(task, used_labels)

def _format_ctakes_match(self, count: int, match: MatchText, labels: Iterable[str]) -> dict:
return {
"id": f"ctakes{count}",
"from_name": self._labels_name,
"to_name": self._labels_config["to_name"][0],
"type": "labels",
"value": {"start": match.begin, "end": match.end, "score": 1.0, "text": match.text, "labels": list(labels)},
}

def _format_philter_predictions(self, task: dict, note: LabelStudioNote) -> None:
"""
Adds a prediction layer with philter spans.
Note that this does *not* update the running list of used labels.
This sets a "secret" / non-human-oriented label of "_philter".
Label Studio will still highlight the spans, and this way we won't
conflict with any existing labels.
"""
if not note.philter_map:
return

prediction = {
"model_version": "Cumulus Philter",
}

results = []
count = 0
for start, stop in sorted(note.philter_map.items()):
results.append(self._format_philter_span(count, start, stop, note))
count += 1
prediction["result"] = results

task["predictions"].append(prediction)

def _format_philter_span(self, count: int, start: int, end: int, note: LabelStudioNote) -> dict:
text = note.text[start:end]
return {
"id": f"philter{count}",
"from_name": self._labels_name,
"to_name": self._labels_config["to_name"][0],
"type": "labels",
# We hardcode the label "_philter" - Label Studio will still highlight unknown labels,
# and this is unlikely to collide with existing labels.
"value": {"start": start, "end": end, "score": 1.0, "text": text, "labels": ["_philter"]},
}
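For reference, a single entry in the philter prediction's result list looks roughly like this; the from_name/to_name values depend on the project's labeling config (the names below are illustrative, and the 93-97 offsets match the header-padded note in the tests further down):

```python
# Illustrative output of _format_philter_span(0, 93, 97, note); "label" and "text"
# stand in for whatever the project's <Labels> config actually names.
philter_result = {
    "id": "philter0",
    "from_name": "label",
    "to_name": "text",
    "type": "labels",
    "value": {"start": 93, "end": 97, "score": 1.0, "text": "John", "labels": ["_philter"]},
}
```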

def _update_used_labels(self, task: dict, used_labels: Iterable[str]) -> None:
if self._labels_config.get("dynamic_labels"):
# This path supports configs like <Labels name="label" toName="text" value="$label"/> where the labels
# can be dynamically set by us.
@@ -130,15 +192,7 @@ def _format_prediction(self, task: dict, note: LabelStudioNote) -> None:
# actually kept in the config, so we have to make some assumptions about how the user set up their project.
#
# The rule that Cumulus uses is that the value= variable must equal the name= of the <Labels> element.
task["data"][self._labels_name] = [{"value": x} for x in sorted(used_labels)]

task["predictions"] = [prediction]

def _format_match(self, count: int, match: MatchText, labels: Iterable[str]) -> dict:
return {
"id": f"match{count}",
"from_name": self._labels_name,
"to_name": self._labels_config["to_name"][0],
"type": "labels",
"value": {"start": match.begin, "end": match.end, "score": 1.0, "text": match.text, "labels": list(labels)},
}
existing_labels = task["data"].get(self._labels_name, [])
existing_labels = {d["value"] for d in existing_labels}
existing_labels.update(used_labels)
task["data"][self._labels_name] = [{"value": x} for x in sorted(existing_labels)]
2 changes: 1 addition & 1 deletion docs/chart-review.md
@@ -233,7 +233,7 @@ But you may get better results by adding extra terms and variations in your symp
## Disabling Features

You may not need NLP or `philter` processing.
Simply pass `--no-nlp` or `--no-philter` and those steps will be skipped.
Simply pass `--no-nlp` or `--philter=disable` and those steps will be skipped.

## Label Studio

43 changes: 32 additions & 11 deletions tests/upload_notes/test_upload_cli.py
@@ -77,7 +77,8 @@ async def run_upload_notes(
anon_docrefs=None,
docrefs=None,
nlp=True,
philter=True,
philter=None,
no_philter=None,
overwrite=False,
) -> None:
args = [
@@ -98,7 +99,9 @@
args += ["--docrefs", docrefs]
if not nlp:
args += ["--no-nlp"]
if not philter:
if philter:
args += ["--philter", philter]
if no_philter:
args += ["--no-philter"]
if overwrite:
args += ["--overwrite"]
@@ -314,7 +317,7 @@ async def test_successful_push_to_label_studio(self):
],
"type": "SignSymptomMention",
},
tasks[0].matches[0].as_json(),
tasks[0].ctakes_matches[0].as_json(),
)

@ddt.data(True, False)
@@ -329,27 +332,45 @@ async def test_disabled_nlp(self):
tasks = self.ls_client.push_tasks.call_args[0][0]
self.assertGreater(len(tasks), 0)
for task in tasks:
self.assertEqual([], task.matches)

@ddt.data(True, False)
async def test_philter(self, run_philter):
self.assertEqual([], task.ctakes_matches)

@ddt.data(
({}, True), # default args
({"philter": "redact"}, True),
({"philter": "disable"}, False),
({"no_philter": True}, False),
)
@ddt.unpack
async def test_philter_redact(self, upload_args, expect_redacted):
notes = [LabelStudioNote("EncID", "EncAnon", title="My Title", text="John Smith called on 10/13/2010")]
with mock.patch("cumulus_etl.upload_notes.cli.read_notes_from_ndjson", return_value=notes):
await self.run_upload_notes(philter=run_philter)
await self.run_upload_notes(**upload_args)

tasks = self.ls_client.push_tasks.call_args[0][0]
self.assertEqual(1, len(tasks))
task = tasks[0]

# Regardless of philter, we keep the original cTAKES match text
self.assertEqual({"John", "Smith", "called"}, {m.text for m in task.matches})
self.assertEqual({"John", "Smith", "called"}, {m.text for m in task.ctakes_matches})

if run_philter:
if expect_redacted:
expected_text = "**** ***** called on 10/13/2010" # we don't philter dates
else:
expected_text = "John Smith called on 10/13/2010"
self.assertEqual(self.wrap_note("My Title", expected_text), task.text)

async def test_philter_label(self):
notes = [LabelStudioNote("EncID", "EncAnon", title="My Title", text="John Smith called on 10/13/2010")]
with mock.patch("cumulus_etl.upload_notes.cli.read_notes_from_ndjson", return_value=notes):
await self.run_upload_notes(philter="label")

tasks = self.ls_client.push_tasks.call_args[0][0]
self.assertEqual(1, len(tasks))
task = tasks[0]

# High span numbers because we insert some header text
self.assertEqual({93: 97, 98: 103}, task.philter_map)

@respx.mock(assert_all_mocked=False)
async def test_combined_encounter_offsets(self, respx_mock):
# use server notes just for ease of making fake ones
@@ -388,5 +409,5 @@ async def test_combined_encounter_offsets(self, respx_mock):
self.assertEqual("What's", note.text[match2a[0] : match2a[1]])
self.assertEqual("up", note.text[match2b[0] : match2b[1]])
self.assertEqual("doc?", note.text[match2c[0] : match2c[1]])
spans = {x.span().key() for x in note.matches}
spans = {x.span().key() for x in note.ctakes_matches}
self.assertEqual({match1a, match1b, match1c, match2a, match2b, match2c}, spans)