Merge pull request #285 from smart-on-fhir/mikix/chart-spans

feat(chart-review): push internal docref spans to Label Studio
smart-on-fhir · Oct 26, 2023 · c5c513c · c5c513c
2 parents 24b3103 + 91a3112
commit c5c513c
Show file tree

Hide file tree

Showing 10 changed files with 137 additions and 24 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -69,8 +69,11 @@ jobs:
           pip install bandit[toml] pycodestyle pylint black==22.12.0
 
       - name: Run pycodestyle
+        # E203: pycodestyle is a little too rigid about slices & whitespace
+        #  See https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html#slices
+        # W503: a default ignore that we are restoring
         run: |
-          pycodestyle --max-line-length=120 .
+          pycodestyle --max-line-length=120 --ignore=E203,W503 .
 
       - name: Run pylint
         if: success() || failure() # still run pylint if above checks fail

diff --git a/compose.yaml b/compose.yaml
@@ -51,19 +51,23 @@ services:
   ctakes-covid:
     extends: ctakes-covid-base
     profiles:
+      - chart-review
+      - chart-review-gpu
       - covid-symptom
       - covid-symptom-gpu
 
   cnlpt-negation:
     image: smartonfhir/cnlp-transformers:negation-0.6.1-cpu
     profiles:
+      - chart-review
       - covid-symptom
     networks:
       - cumulus-etl
 
   cnlpt-negation-gpu:
     image: smartonfhir/cnlp-transformers:negation-0.6.1-gpu
     profiles:
+      - chart-review-gpu
       - covid-symptom-gpu
     networks:
       - cumulus-etl

diff --git a/cumulus_etl/chart_review/cli.py b/cumulus_etl/chart_review/cli.py
@@ -20,7 +20,7 @@ def init_checks(args: argparse.Namespace):
 
     if args.nlp:
         nlp.check_ctakes()
-        nlp.check_cnlpt()
+        nlp.check_negation_cnlpt()
 
     if not cli_utils.is_url_available(args.label_studio_url, retry=False):
         errors.fatal(
@@ -80,8 +80,18 @@ async def read_notes_from_ndjson(
         anon_enc_id = codebook.fake_id("Encounter", enc_id)
         doc_id = docrefs[i]["id"]
         doc_mappings = {doc_id: codebook.fake_id("DocumentReference", doc_id)}
-
-        notes.append(LabelStudioNote(enc_id, anon_enc_id, doc_mappings, title, text))
+        doc_spans = {doc_id: (0, len(text))}
+
+        notes.append(
+            LabelStudioNote(
+                enc_id,
+                anon_enc_id,
+                doc_mappings=doc_mappings,
+                doc_spans=doc_spans,
+                title=title,
+                text=text,
+            )
+        )
 
     return notes
 
@@ -101,7 +111,12 @@ async def run_nlp(notes: Collection[LabelStudioNote], args: argparse.Namespace)
         ctakes_json = await ctakesclient.client.extract(note.text, client=http_client)
         matches = ctakes_json.list_match(polarity=Polarity.pos)
         spans = ctakes_json.list_spans(matches)
-        cnlpt_results = await ctakesclient.transformer.list_polarity(note.text, spans, client=http_client)
+        cnlpt_results = await ctakesclient.transformer.list_polarity(
+            note.text,
+            spans,
+            client=http_client,
+            model=nlp.TransformerModel.NEGATION,
+        )
         note.matches = [match for i, match in enumerate(matches) if cnlpt_results[i] == Polarity.pos]
 
 
@@ -133,6 +148,7 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
         grouped_text = ""
         grouped_matches = []
         grouped_doc_mappings = {}
+        grouped_doc_spans = {}
 
         for note in enc_notes:
             grouped_doc_mappings.update(note.doc_mappings)
@@ -145,14 +161,24 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
             offset = len(grouped_text)
             grouped_text += note.text
 
+            offset_doc_spans = {k: (v[0] + offset, v[1] + offset) for k, v in note.doc_spans.items()}
+            grouped_doc_spans.update(offset_doc_spans)
+
             for match in note.matches:
                 match.begin += offset
                 match.end += offset
                 grouped_matches.append(match)
 
-        grouped_note = LabelStudioNote(enc_id, enc_notes[0].anon_id, grouped_doc_mappings, "", grouped_text)
-        grouped_note.matches = grouped_matches
-        grouped_notes.append(grouped_note)
+        grouped_notes.append(
+            LabelStudioNote(
+                enc_id,
+                enc_notes[0].anon_id,
+                text=grouped_text,
+                doc_mappings=grouped_doc_mappings,
+                doc_spans=grouped_doc_spans,
+                matches=grouped_matches,
+            )
+        )
 
     return grouped_notes
 

diff --git a/cumulus_etl/chart_review/downloader.py b/cumulus_etl/chart_review/downloader.py
@@ -68,7 +68,7 @@ async def _download_docrefs_from_real_ids(
 
     # Grab identifiers for which specific docrefs we need
     with common.read_csv(docref_csv) as reader:
-        docref_ids = {row["docref_id"] for row in reader}
+        docref_ids = sorted({row["docref_id"] for row in reader})
 
     # Kick off a bunch of requests to the FHIR server for these documents
     coroutines = [_request_docref(client, docref_id) for docref_id in docref_ids]

diff --git a/cumulus_etl/chart_review/labelstudio.py b/cumulus_etl/chart_review/labelstudio.py
@@ -1,5 +1,6 @@
 """LabelStudio document annotation"""
 
+import dataclasses
 from collections.abc import Collection, Iterable
 
 import ctakesclient.typesystem
@@ -17,14 +18,26 @@
 ###############################################################################
 
 
+@dataclasses.dataclass
 class LabelStudioNote:
-    def __init__(self, enc_id: str, anon_id: str, doc_mappings: dict[str, str], title: str, text: str):
-        self.enc_id = enc_id
-        self.anon_id = anon_id
-        self.doc_mappings = doc_mappings
-        self.title = title
-        self.text = text
-        self.matches: list[ctakesclient.typesystem.MatchText] = []
+    """Holds all the data that Label Studio will need for a single note (or a single grouped encounter note)"""
+
+    enc_id: str  # real Encounter ID
+    anon_id: str  # anonymized Encounter ID
+    text: str = ""  # text of the note, sent to Label Studio
+
+    # A title is only used when combining notes into one big encounter note. It's not sent to Label Studio.
+    title: str = ""
+
+    # Doc mappings is a dict of real DocRef ID -> anonymized DocRef ID of all contained notes, in order
+    doc_mappings: dict[str, str] = dataclasses.field(default_factory=dict)
+
+    # Doc spans indicate which bits of the text come from which DocRef - it will map real DocRef ID to a pair of
+    # "first character" (0-based) and "last character" (0-based, exclusive) - just like cTAKES match text spans.
+    doc_spans: dict[str, tuple[int, int]] = dataclasses.field(default_factory=dict)
+
+    # Matches found by cTAKES
+    matches: list[ctakesclient.typesystem.MatchText] = dataclasses.field(default_factory=list)
 
 
 class LabelStudioClient:
@@ -84,6 +97,7 @@ def _format_task_for_note(self, note: LabelStudioNote) -> dict:
                 "enc_id": note.enc_id,
                 "anon_id": note.anon_id,
                 "docref_mappings": note.doc_mappings,
+                "docref_spans": {k: list(v) for k, v in note.doc_spans.items()},  # json doesn't natively have tuples
             },
         }
 

diff --git a/cumulus_etl/nlp/__init__.py b/cumulus_etl/nlp/__init__.py
@@ -1,6 +1,6 @@
 """Support code for NLP servers"""
 
-from .extract import ctakes_extract, ctakes_httpx_client, list_polarity
+from .extract import TransformerModel, ctakes_extract, ctakes_httpx_client, list_polarity
 from .huggingface import hf_prompt, hf_info, llama2_prompt
 from .utils import cache_wrapper, is_docref_valid
 from .watcher import check_negation_cnlpt, check_term_exists_cnlpt, check_ctakes, restart_ctakes_with_bsv
diff --git a/docs/chart-review.md b/docs/chart-review.md
@@ -17,7 +17,9 @@ Along the way, it can mark the note with NLP results and/or anonymize the note w
 This is useful for not just actual chart reviews, but also for developing a custom NLP dictionary.
 You can feed Cumulus ETL a custom NLP dictionary, review how it performs, and iterate upon it.
 
-## Preliminary Label Studio Setup
+## Preliminaries
+
+### Label Studio Setup
 
 This guide assumes you already have a local instance of Label Studio running.
 They offer Docker images and reasonable
@@ -27,6 +29,19 @@ If you haven't set that up yet, go do that and come back.
 The Cumulus team can help you with setting it up if you come talk to us,
 but the rest of this guide will mostly deal with chart review mode itself.
 
+### Dependent Services
+
+Some features of chart review mode need external services (like cTAKES to run NLP).
+Launch those before you begin using chart review:
+
+```shell
+export UMLS_API_KEY=your-umls-api-key
+docker compose --profile chart-review up -d
+```
+
+Or if you have access to a GPU,
+you can speed up the NLP by launching the GPU profile instead with `--profile chart-review-gpu`.
+
 ## Basic Operation
 
 At its core, chart review mode is just another ETL (extract, transform, load) operation.

diff --git a/tests/chart_review/__init__.py b/tests/chart_review/__init__.py
diff --git a/tests/test_chart_cli.py → tests/chart_review/test_chart_cli.py b/tests/test_chart_cli.py → tests/chart_review/test_chart_cli.py
@@ -105,7 +105,7 @@ async def run_chart_review(
         await cli.main(args)
 
     @staticmethod
-    def make_docref(doc_id: str, text: str = None, content: list[dict] = None) -> dict:
+    def make_docref(doc_id: str, text: str = None, content: list[dict] = None, enc_id: str = None) -> dict:
         if content is None:
             text = text or "What's up doc?"
             content = [
@@ -117,11 +117,12 @@ def make_docref(doc_id: str, text: str = None, content: list[dict] = None) -> di
                 }
             ]
 
+        enc_id = enc_id or f"enc-{doc_id}"
         return {
             "resourceType": "DocumentReference",
             "id": doc_id,
             "content": content,
-            "context": {"encounter": [{"reference": f"Encounter/enc-{doc_id}"}]},
+            "context": {"encounter": [{"reference": f"Encounter/{enc_id}"}]},
         }
 
     @staticmethod
@@ -332,9 +333,7 @@ async def test_disabled_nlp(self):
 
     @ddt.data(True, False)
     async def test_philter(self, run_philter):
-        notes = [
-            LabelStudioNote("EncID", "EncAnon", {"DocID": "DocAnon"}, "My Title", "John Smith called on 10/13/2010")
-        ]
+        notes = [LabelStudioNote("EncID", "EncAnon", title="My Title", text="John Smith called on 10/13/2010")]
         with mock.patch("cumulus_etl.chart_review.cli.read_notes_from_ndjson", return_value=notes):
             await self.run_chart_review(philter=run_philter)
 
@@ -350,3 +349,44 @@ async def test_philter(self, run_philter):
         else:
             expected_text = "John Smith called on 10/13/2010"
         self.assertEqual(self.wrap_note("My Title", expected_text), task.text)
+
+    @respx.mock(assert_all_mocked=False)
+    async def test_combined_encounter_offsets(self, respx_mock):
+        # use server notes just for ease of making fake ones
+        self.mock_read_url(respx_mock, "D1", enc_id="43")
+        self.mock_read_url(respx_mock, "D2", enc_id="43")
+        respx_mock.post(os.environ["URL_CTAKES_REST"]).pass_through()  # ignore cTAKES
+
+        with tempfile.NamedTemporaryFile() as file:
+            self.write_real_docrefs(file.name, ["D1", "D2"])
+            await self.run_chart_review(input_path="https://localhost", docrefs=file.name)
+
+        notes = self.ls_client.push_tasks.call_args[0][0]
+        self.assertEqual(1, len(notes))
+        note = notes[0]
+
+        # Did we mark that both IDs occur in one note correctly?
+        self.assertEqual({"D1": ANON_D1, "D2": ANON_D2}, note.doc_mappings)
+
+        # Did we mark the internal docref spans correctly?
+        first_span = (93, 107)
+        second_span = (285, 299)
+        self.assertEqual("What's up doc?", note.text[first_span[0] : first_span[1]])
+        self.assertEqual("What's up doc?", note.text[second_span[0] : second_span[1]])
+        self.assertEqual({"D1": first_span, "D2": second_span}, note.doc_spans)
+
+        # Did we edit cTAKES results correctly?
+        match1a = (93, 99)
+        match1b = (100, 102)
+        match1c = (103, 107)
+        match2a = (285, 291)
+        match2b = (292, 294)
+        match2c = (295, 299)
+        self.assertEqual("What's", note.text[match1a[0] : match1a[1]])
+        self.assertEqual("up", note.text[match1b[0] : match1b[1]])
+        self.assertEqual("doc?", note.text[match1c[0] : match1c[1]])
+        self.assertEqual("What's", note.text[match2a[0] : match2a[1]])
+        self.assertEqual("up", note.text[match2b[0] : match2b[1]])
+        self.assertEqual("doc?", note.text[match2c[0] : match2c[1]])
+        spans = {x.span().key() for x in note.matches}
+        self.assertEqual({match1a, match1b, match1c, match2a, match2b, match2c}, spans)
diff --git a/tests/test_chart_labelstudio.py → tests/chart_review/test_chart_labelstudio.py b/tests/test_chart_labelstudio.py → tests/chart_review/test_chart_labelstudio.py
@@ -26,7 +26,14 @@ def setUp(self):
 
     @staticmethod
     def make_note(*, enc_id: str = "enc", matches: bool = True) -> LabelStudioNote:
-        note = LabelStudioNote(enc_id, "enc-anon", {"doc": "doc-anon"}, "Ignored Title", "Normal note text")
+        text = "Normal note text"
+        note = LabelStudioNote(
+            enc_id,
+            "enc-anon",
+            doc_mappings={"doc": "doc-anon"},
+            doc_spans={"doc": (0, len(text))},
+            text=text,
+        )
         if matches:
             note.matches = ctakesmock.fake_ctakes_extract(note.text).list_match(polarity=Polarity.pos)
         return note
@@ -61,6 +68,7 @@ def test_basic_push(self):
                     "enc_id": "enc",
                     "anon_id": "enc-anon",
                     "docref_mappings": {"doc": "doc-anon"},
+                    "docref_spans": {"doc": [0, 16]},
                 },
                 "predictions": [
                     {
@@ -109,6 +117,7 @@ def test_no_matches(self):
                     "enc_id": "enc",
                     "anon_id": "enc-anon",
                     "docref_mappings": {"doc": "doc-anon"},
+                    "docref_spans": {"doc": [0, 16]},
                 },
                 "predictions": [
                     {
@@ -132,6 +141,7 @@ def test_dynamic_labels(self, label_type):
                 "enc_id": "enc",
                 "anon_id": "enc-anon",
                 "docref_mappings": {"doc": "doc-anon"},
+                "docref_spans": {"doc": [0, 16]},
                 "mylabel": [
                     {"value": "Itch"},
                     {"value": "Nausea"},
@@ -151,6 +161,7 @@ def test_dynamic_labels_no_matches(self):
                 "enc_id": "enc",
                 "anon_id": "enc-anon",
                 "docref_mappings": {"doc": "doc-anon"},
+                "docref_spans": {"doc": [0, 16]},
                 "mylabel": [],  # this needs to be sent, or the server will complain
             },
             self.get_pushed_task()["data"],