diff --git a/cumulus_etl/deid/philter.py b/cumulus_etl/deid/philter.py index bfbda4e7..1dd7accd 100644 --- a/cumulus_etl/deid/philter.py +++ b/cumulus_etl/deid/philter.py @@ -22,6 +22,16 @@ def __init__(self): filter_config = os.path.join(os.path.dirname(__file__), "philter-config.toml") self.filters = philter_lite.load_filters(filter_config) + def detect_phi(self, text: str) -> philter_lite.CoordinateMap: + """ + Find PHI spans using Philter. + + :param text: the text to scrub + :returns: a map of where the spans to scrub are + """ + include_map, _, _ = philter_lite.detect_phi(text, self.filters) + return include_map + def scrub_text(self, text: str) -> str: """ Scrub text of PHI using Philter. @@ -29,5 +39,5 @@ def scrub_text(self, text: str) -> str: :param text: the text to scrub :returns: the scrubbed text, with PHI replaced by asterisks ("*") """ - include_map, _, _ = philter_lite.detect_phi(text, self.filters) + include_map = self.detect_phi(text) return philter_lite.transform_text_asterisk(text, include_map) diff --git a/cumulus_etl/upload_notes/cli.py b/cumulus_etl/upload_notes/cli.py index e179d294..d840d8aa 100644 --- a/cumulus_etl/upload_notes/cli.py +++ b/cumulus_etl/upload_notes/cli.py @@ -12,6 +12,10 @@ from cumulus_etl.upload_notes import downloader, selector from cumulus_etl.upload_notes.labelstudio import LabelStudioClient, LabelStudioNote +PHILTER_DISABLE = "disable" +PHILTER_REDACT = "redact" +PHILTER_LABEL = "label" + def init_checks(args: argparse.Namespace): """Do any external service checks necessary at the start""" @@ -61,8 +65,9 @@ async def read_notes_from_ndjson( for docref in common.read_resource_ndjson(store.Root(dirname), "DocumentReference"): encounter_refs = docref.get("context", {}).get("encounter", []) if not encounter_refs: - # If a note doesn't have an encounter - we're in big trouble - raise SystemExit(f"DocumentReference {docref['id']} is missing encounter information.") + # If a note doesn't have an encounter - we can't group 
it with other docs + print(f"Skipping DocumentReference {docref['id']} as it lacks a linked encounter.") + continue encounter_ids.append(fhir.unref_resource(encounter_refs[0])[1]) # just use first encounter docrefs.append(docref) @@ -117,17 +122,20 @@ async def run_nlp(notes: Collection[LabelStudioNote], args: argparse.Namespace) client=http_client, model=nlp.TransformerModel.NEGATION, ) - note.matches = [match for i, match in enumerate(matches) if cnlpt_results[i] == Polarity.pos] + note.ctakes_matches = [match for i, match in enumerate(matches) if cnlpt_results[i] == Polarity.pos] def philter_notes(notes: Collection[LabelStudioNote], args: argparse.Namespace) -> None: - if not args.philter: + if args.philter == PHILTER_DISABLE: return common.print_header("Running philter...") philter = deid.Philter() for note in notes: - note.text = philter.scrub_text(note.text) + if args.philter == PHILTER_LABEL: + note.philter_map = philter.detect_phi(note.text).get_complement(note.text) + else: + note.text = philter.scrub_text(note.text) def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelStudioNote]: @@ -146,7 +154,8 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt # Group up the text into one big note for enc_id, enc_notes in by_encounter_id.items(): grouped_text = "" - grouped_matches = [] + grouped_ctakes_matches = [] + grouped_philter_map = {} grouped_doc_mappings = {} grouped_doc_spans = {} @@ -164,10 +173,13 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt offset_doc_spans = {k: (v[0] + offset, v[1] + offset) for k, v in note.doc_spans.items()} grouped_doc_spans.update(offset_doc_spans) - for match in note.matches: + for match in note.ctakes_matches: match.begin += offset match.end += offset - grouped_matches.append(match) + grouped_ctakes_matches.append(match) + + for start, stop in note.philter_map.items(): + grouped_philter_map[start + offset] = stop + offset 
grouped_notes.append( LabelStudioNote( @@ -176,7 +188,8 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt text=grouped_text, doc_mappings=grouped_doc_mappings, doc_spans=grouped_doc_spans, - matches=grouped_matches, + ctakes_matches=grouped_ctakes_matches, + philter_map=grouped_philter_map, ) ) @@ -208,8 +221,16 @@ def define_upload_notes_parser(parser: argparse.ArgumentParser) -> None: parser.add_argument( "--export-to", metavar="PATH", help="Where to put exported documents (default is to delete after use)" ) + + parser.add_argument( + "--philter", + choices=[PHILTER_DISABLE, PHILTER_REDACT, PHILTER_LABEL], + default=PHILTER_REDACT, + help="Whether to use philter to redact/tag PHI", + ) + # Old, simpler version of the above (feel free to remove after May 2024) parser.add_argument( - "--no-philter", action="store_false", dest="philter", default=True, help="Don’t run philter on notes" + "--no-philter", action="store_const", const=PHILTER_DISABLE, dest="philter", help=argparse.SUPPRESS ) cli_utils.add_aws(parser) diff --git a/cumulus_etl/upload_notes/labelstudio.py b/cumulus_etl/upload_notes/labelstudio.py index e34686aa..d6439fdb 100644 --- a/cumulus_etl/upload_notes/labelstudio.py +++ b/cumulus_etl/upload_notes/labelstudio.py @@ -37,7 +37,10 @@ class LabelStudioNote: doc_spans: dict[str, tuple[int, int]] = dataclasses.field(default_factory=dict) # Matches found by cTAKES - matches: list[ctakesclient.typesystem.MatchText] = dataclasses.field(default_factory=list) + ctakes_matches: list[ctakesclient.typesystem.MatchText] = dataclasses.field(default_factory=list) + + # Matches found by Philter + philter_map: dict[int, int] = dataclasses.field(default_factory=dict) class LabelStudioClient: @@ -99,29 +102,88 @@ def _format_task_for_note(self, note: LabelStudioNote) -> dict: "docref_mappings": note.doc_mappings, "docref_spans": {k: list(v) for k, v in note.doc_spans.items()}, # json doesn't natively have tuples }, + "predictions": [], } 
- self._format_prediction(task, note) + # Initialize any used labels in case we have a dynamic label config. + # Label Studio needs to see *something* here + self._update_used_labels(task, []) + + self._format_ctakes_predictions(task, note) + self._format_philter_predictions(task, note) return task - def _format_prediction(self, task: dict, note: LabelStudioNote) -> None: + def _format_ctakes_predictions(self, task: dict, note: LabelStudioNote) -> None: + if not note.ctakes_matches: + return + prediction = { - "model_version": "Cumulus", # TODO: do we want more specificity here? + "model_version": "Cumulus cTAKES", } used_labels = set() results = [] count = 0 - for match in note.matches: + for match in note.ctakes_matches: matched_labels = {self._cui_labels.get(concept.cui) for concept in match.conceptAttributes} matched_labels.discard(None) # drop the result of a concept not being in our bsv label set if matched_labels: - results.append(self._format_match(count, match, matched_labels)) + results.append(self._format_ctakes_match(count, match, matched_labels)) used_labels.update(matched_labels) count += 1 prediction["result"] = results + task["predictions"].append(prediction) + + self._update_used_labels(task, used_labels) + + def _format_ctakes_match(self, count: int, match: MatchText, labels: Iterable[str]) -> dict: + return { + "id": f"ctakes{count}", + "from_name": self._labels_name, + "to_name": self._labels_config["to_name"][0], + "type": "labels", + "value": {"start": match.begin, "end": match.end, "score": 1.0, "text": match.text, "labels": list(labels)}, + } + + def _format_philter_predictions(self, task: dict, note: LabelStudioNote) -> None: + """ + Adds a prediction layer with philter spans. + Note that this does *not* update the running list of used labels. + This sets a "secret" / non-human-oriented label of "_philter". + Label Studio will still highlight the spans, and this way we won't + conflict with any existing labels. 
+ """ + if not note.philter_map: + return + + prediction = { + "model_version": "Cumulus Philter", + } + + results = [] + count = 0 + for start, stop in sorted(note.philter_map.items()): + results.append(self._format_philter_span(count, start, stop, note)) + count += 1 + prediction["result"] = results + + task["predictions"].append(prediction) + + def _format_philter_span(self, count: int, start: int, end: int, note: LabelStudioNote) -> dict: + text = note.text[start:end] + return { + "id": f"philter{count}", + "from_name": self._labels_name, + "to_name": self._labels_config["to_name"][0], + "type": "labels", + # We hardcode the label "_philter" - Label Studio will still highlight unknown labels, + # and this is unlikely to collide with existing labels. + "value": {"start": start, "end": end, "score": 1.0, "text": text, "labels": ["_philter"]}, + } + + def _update_used_labels(self, task: dict, used_labels: Iterable[str]) -> None: if self._labels_config.get("dynamic_labels"): # This path supports configs like where the labels # can be dynamically set by us. @@ -130,15 +192,7 @@ def _format_prediction(self, task: dict, note: LabelStudioNote) -> None: # actually kept in the config, so we have to make some assumptions about how the user set up their project. # # The rule that Cumulus uses is that the value= variable must equal the name= of the element. 
- task["data"][self._labels_name] = [{"value": x} for x in sorted(used_labels)] - - task["predictions"] = [prediction] - - def _format_match(self, count: int, match: MatchText, labels: Iterable[str]) -> dict: - return { - "id": f"match{count}", - "from_name": self._labels_name, - "to_name": self._labels_config["to_name"][0], - "type": "labels", - "value": {"start": match.begin, "end": match.end, "score": 1.0, "text": match.text, "labels": list(labels)}, - } + existing_labels = task["data"].get(self._labels_name, []) + existing_labels = {d["value"] for d in existing_labels} + existing_labels.update(used_labels) + task["data"][self._labels_name] = [{"value": x} for x in sorted(existing_labels)] diff --git a/docs/chart-review.md b/docs/chart-review.md index dccb4e4d..293b3ed4 100644 --- a/docs/chart-review.md +++ b/docs/chart-review.md @@ -233,7 +233,7 @@ But you may get better results by adding extra terms and variations in your symp ## Disabling Features You may not need NLP or `philter` processing. -Simply pass `--no-nlp` or `--no-philter` and those steps will be skipped. +Simply pass `--no-nlp` or `--philter=disable` and those steps will be skipped. 
## Label Studio diff --git a/pyproject.toml b/pyproject.toml index 502a852f..45c45e6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,7 +70,7 @@ tests = [ "coverage", "ddt", "freezegun", - "moto[server,s3]", + "moto[server,s3] >= 5.0", "pytest", "respx", ] diff --git a/tests/s3mock.py b/tests/s3mock.py index 3ca089f8..befdc934 100644 --- a/tests/s3mock.py +++ b/tests/s3mock.py @@ -39,7 +39,7 @@ def setUp(self): self.server = ThreadedMotoServer() self.server.start() - s3mock = moto.mock_s3() + s3mock = moto.mock_aws() self.addCleanup(s3mock.stop) s3mock.start() diff --git a/tests/upload_notes/test_upload_cli.py b/tests/upload_notes/test_upload_cli.py index 365c2258..6468533f 100644 --- a/tests/upload_notes/test_upload_cli.py +++ b/tests/upload_notes/test_upload_cli.py @@ -77,7 +77,8 @@ async def run_upload_notes( anon_docrefs=None, docrefs=None, nlp=True, - philter=True, + philter=None, + no_philter=None, overwrite=False, ) -> None: args = [ @@ -98,7 +99,9 @@ async def run_upload_notes( args += ["--docrefs", docrefs] if not nlp: args += ["--no-nlp"] - if not philter: + if philter: + args += ["--philter", philter] + if no_philter: args += ["--no-philter"] if overwrite: args += ["--overwrite"] @@ -314,7 +317,7 @@ async def test_successful_push_to_label_studio(self): ], "type": "SignSymptomMention", }, - tasks[0].matches[0].as_json(), + tasks[0].ctakes_matches[0].as_json(), ) @ddt.data(True, False) @@ -329,27 +332,45 @@ async def test_disabled_nlp(self): tasks = self.ls_client.push_tasks.call_args[0][0] self.assertGreater(len(tasks), 0) for task in tasks: - self.assertEqual([], task.matches) - - @ddt.data(True, False) - async def test_philter(self, run_philter): + self.assertEqual([], task.ctakes_matches) + + @ddt.data( + ({}, True), # default args + ({"philter": "redact"}, True), + ({"philter": "disable"}, False), + ({"no_philter": True}, False), + ) + @ddt.unpack + async def test_philter_redact(self, upload_args, expect_redacted): notes = 
[LabelStudioNote("EncID", "EncAnon", title="My Title", text="John Smith called on 10/13/2010")] with mock.patch("cumulus_etl.upload_notes.cli.read_notes_from_ndjson", return_value=notes): - await self.run_upload_notes(philter=run_philter) + await self.run_upload_notes(**upload_args) tasks = self.ls_client.push_tasks.call_args[0][0] self.assertEqual(1, len(tasks)) task = tasks[0] # Regardless of philter, we keep the original cTAKES match text - self.assertEqual({"John", "Smith", "called"}, {m.text for m in task.matches}) + self.assertEqual({"John", "Smith", "called"}, {m.text for m in task.ctakes_matches}) - if run_philter: + if expect_redacted: expected_text = "**** ***** called on 10/13/2010" # we don't philter dates else: expected_text = "John Smith called on 10/13/2010" self.assertEqual(self.wrap_note("My Title", expected_text), task.text) + async def test_philter_label(self): + notes = [LabelStudioNote("EncID", "EncAnon", title="My Title", text="John Smith called on 10/13/2010")] + with mock.patch("cumulus_etl.upload_notes.cli.read_notes_from_ndjson", return_value=notes): + await self.run_upload_notes(philter="label") + + tasks = self.ls_client.push_tasks.call_args[0][0] + self.assertEqual(1, len(tasks)) + task = tasks[0] + + # High span numbers because we insert some header text + self.assertEqual({93: 97, 98: 103}, task.philter_map) + @respx.mock(assert_all_mocked=False) async def test_combined_encounter_offsets(self, respx_mock): # use server notes just for ease of making fake ones @@ -388,5 +409,5 @@ async def test_combined_encounter_offsets(self, respx_mock): self.assertEqual("What's", note.text[match2a[0] : match2a[1]]) self.assertEqual("up", note.text[match2b[0] : match2b[1]]) self.assertEqual("doc?", note.text[match2c[0] : match2c[1]]) - spans = {x.span().key() for x in note.matches} + spans = {x.span().key() for x in note.ctakes_matches} self.assertEqual({match1a, match1b, match1c, match2a, match2b, match2c}, spans) diff --git 
a/tests/upload_notes/test_upload_labelstudio.py b/tests/upload_notes/test_upload_labelstudio.py index c1cf1b8e..191e3e80 100644 --- a/tests/upload_notes/test_upload_labelstudio.py +++ b/tests/upload_notes/test_upload_labelstudio.py @@ -25,7 +25,7 @@ def setUp(self): self.ls_project.parsed_label_config = {"mylabel": {"type": "Labels", "to_name": ["mytext"]}} @staticmethod - def make_note(*, enc_id: str = "enc", matches: bool = True) -> LabelStudioNote: + def make_note(*, enc_id: str = "enc", ctakes: bool = True, philter_label: bool = True) -> LabelStudioNote: text = "Normal note text" note = LabelStudioNote( enc_id, @@ -34,8 +34,11 @@ def make_note(*, enc_id: str = "enc", matches: bool = True) -> LabelStudioNote: doc_spans={"doc": (0, len(text))}, text=text, ) - if matches: - note.matches = ctakesmock.fake_ctakes_extract(note.text).list_match(polarity=Polarity.pos) + if ctakes: + note.ctakes_matches = ctakesmock.fake_ctakes_extract(note.text).list_match(polarity=Polarity.pos) + if philter_label: + matches = ctakesmock.fake_ctakes_extract(note.text).list_match(polarity=Polarity.pos) + note.philter_map = {m.begin: m.end for m in matches} return note @staticmethod @@ -72,12 +75,12 @@ def test_basic_push(self): }, "predictions": [ { - "model_version": "Cumulus", + "model_version": "Cumulus cTAKES", "result": [ # Note that fever does not show up, as it was not in our initial CUI mapping (in push_tasks) { "from_name": "mylabel", - "id": "match0", + "id": "ctakes0", "to_name": "mytext", "type": "labels", "value": { @@ -90,7 +93,7 @@ def test_basic_push(self): }, { "from_name": "mylabel", - "id": "match1", + "id": "ctakes1", "to_name": "mytext", "type": "labels", "value": { @@ -102,14 +105,58 @@ def test_basic_push(self): }, }, ], - } + }, + { + "model_version": "Cumulus Philter", + "result": [ + { + "from_name": "mylabel", + "id": "philter0", + "to_name": "mytext", + "type": "labels", + "value": { + "end": 6, + "labels": ["_philter"], + "score": 1.0, + "start": 0, + 
"text": "Normal", + }, + }, + { + "from_name": "mylabel", + "id": "philter1", + "to_name": "mytext", + "type": "labels", + "value": { + "end": 11, + "labels": ["_philter"], + "score": 1.0, + "start": 7, + "text": "note", + }, + }, + { + "from_name": "mylabel", + "id": "philter2", + "to_name": "mytext", + "type": "labels", + "value": { + "end": 16, + "labels": ["_philter"], + "score": 1.0, + "start": 12, + "text": "text", + }, + }, + ], + }, ], }, self.get_pushed_task(), ) - def test_no_matches(self): - self.push_tasks(self.make_note(matches=False)) + def test_no_predictions(self): + self.push_tasks(self.make_note(ctakes=False, philter_label=False)) self.assertEqual( { "data": { @@ -119,12 +166,7 @@ def test_no_matches(self): "docref_mappings": {"doc": "doc-anon"}, "docref_spans": {"doc": [0, 16]}, }, - "predictions": [ - { - "model_version": "Cumulus", - "result": [], - } - ], + "predictions": [], }, self.get_pushed_task(), ) @@ -150,11 +192,11 @@ def test_dynamic_labels(self, label_type): self.get_pushed_task()["data"], ) - def test_dynamic_labels_no_matches(self): + def test_dynamic_labels_no_predictions(self): self.ls_project.parsed_label_config = { "mylabel": {"type": "Labels", "to_name": ["mytext"], "dynamic_labels": True}, } - self.push_tasks(self.make_note(matches=False)) + self.push_tasks(self.make_note(ctakes=False, philter_label=False)) self.assertEqual( { "text": "Normal note text",