Skip to content

Commit

Permalink
Merge pull request #285 from smart-on-fhir/mikix/chart-spans
Browse files Browse the repository at this point in the history
feat(chart-review): push internal docref spans to Label Studio
  • Loading branch information
mikix authored Oct 26, 2023
2 parents 24b3103 + 91a3112 commit c5c513c
Show file tree
Hide file tree
Showing 10 changed files with 137 additions and 24 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,11 @@ jobs:
pip install bandit[toml] pycodestyle pylint black==22.12.0
- name: Run pycodestyle
# E203: pycodestyle is a little too rigid about slices & whitespace
# See https://black.readthedocs.io/en/stable/the_black_code_style/current_style.html#slices
# W503: ignored by pycodestyle by default, but passing --ignore replaces the default list, so we re-add it
run: |
pycodestyle --max-line-length=120 .
pycodestyle --max-line-length=120 --ignore=E203,W503 .
- name: Run pylint
if: success() || failure() # still run pylint if above checks fail
Expand Down
4 changes: 4 additions & 0 deletions compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,19 +51,23 @@ services:
ctakes-covid:
extends: ctakes-covid-base
profiles:
- chart-review
- chart-review-gpu
- covid-symptom
- covid-symptom-gpu

cnlpt-negation:
image: smartonfhir/cnlp-transformers:negation-0.6.1-cpu
profiles:
- chart-review
- covid-symptom
networks:
- cumulus-etl

cnlpt-negation-gpu:
image: smartonfhir/cnlp-transformers:negation-0.6.1-gpu
profiles:
- chart-review-gpu
- covid-symptom-gpu
networks:
- cumulus-etl
Expand Down
40 changes: 33 additions & 7 deletions cumulus_etl/chart_review/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ def init_checks(args: argparse.Namespace):

if args.nlp:
nlp.check_ctakes()
nlp.check_cnlpt()
nlp.check_negation_cnlpt()

if not cli_utils.is_url_available(args.label_studio_url, retry=False):
errors.fatal(
Expand Down Expand Up @@ -80,8 +80,18 @@ async def read_notes_from_ndjson(
anon_enc_id = codebook.fake_id("Encounter", enc_id)
doc_id = docrefs[i]["id"]
doc_mappings = {doc_id: codebook.fake_id("DocumentReference", doc_id)}

notes.append(LabelStudioNote(enc_id, anon_enc_id, doc_mappings, title, text))
doc_spans = {doc_id: (0, len(text))}

notes.append(
LabelStudioNote(
enc_id,
anon_enc_id,
doc_mappings=doc_mappings,
doc_spans=doc_spans,
title=title,
text=text,
)
)

return notes

Expand All @@ -101,7 +111,12 @@ async def run_nlp(notes: Collection[LabelStudioNote], args: argparse.Namespace)
ctakes_json = await ctakesclient.client.extract(note.text, client=http_client)
matches = ctakes_json.list_match(polarity=Polarity.pos)
spans = ctakes_json.list_spans(matches)
cnlpt_results = await ctakesclient.transformer.list_polarity(note.text, spans, client=http_client)
cnlpt_results = await ctakesclient.transformer.list_polarity(
note.text,
spans,
client=http_client,
model=nlp.TransformerModel.NEGATION,
)
note.matches = [match for i, match in enumerate(matches) if cnlpt_results[i] == Polarity.pos]


Expand Down Expand Up @@ -133,6 +148,7 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
grouped_text = ""
grouped_matches = []
grouped_doc_mappings = {}
grouped_doc_spans = {}

for note in enc_notes:
grouped_doc_mappings.update(note.doc_mappings)
Expand All @@ -145,14 +161,24 @@ def group_notes_by_encounter(notes: Collection[LabelStudioNote]) -> list[LabelSt
offset = len(grouped_text)
grouped_text += note.text

offset_doc_spans = {k: (v[0] + offset, v[1] + offset) for k, v in note.doc_spans.items()}
grouped_doc_spans.update(offset_doc_spans)

for match in note.matches:
match.begin += offset
match.end += offset
grouped_matches.append(match)

grouped_note = LabelStudioNote(enc_id, enc_notes[0].anon_id, grouped_doc_mappings, "", grouped_text)
grouped_note.matches = grouped_matches
grouped_notes.append(grouped_note)
grouped_notes.append(
LabelStudioNote(
enc_id,
enc_notes[0].anon_id,
text=grouped_text,
doc_mappings=grouped_doc_mappings,
doc_spans=grouped_doc_spans,
matches=grouped_matches,
)
)

return grouped_notes

Expand Down
2 changes: 1 addition & 1 deletion cumulus_etl/chart_review/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ async def _download_docrefs_from_real_ids(

# Grab identifiers for which specific docrefs we need
with common.read_csv(docref_csv) as reader:
docref_ids = {row["docref_id"] for row in reader}
docref_ids = sorted({row["docref_id"] for row in reader})

# Kick off a bunch of requests to the FHIR server for these documents
coroutines = [_request_docref(client, docref_id) for docref_id in docref_ids]
Expand Down
28 changes: 21 additions & 7 deletions cumulus_etl/chart_review/labelstudio.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""LabelStudio document annotation"""

import dataclasses
from collections.abc import Collection, Iterable

import ctakesclient.typesystem
Expand All @@ -17,14 +18,26 @@
###############################################################################


@dataclasses.dataclass
class LabelStudioNote:
def __init__(self, enc_id: str, anon_id: str, doc_mappings: dict[str, str], title: str, text: str):
self.enc_id = enc_id
self.anon_id = anon_id
self.doc_mappings = doc_mappings
self.title = title
self.text = text
self.matches: list[ctakesclient.typesystem.MatchText] = []
"""Holds all the data that Label Studio will need for a single note (or a single grouped encounter note)"""

enc_id: str # real Encounter ID
anon_id: str # anonymized Encounter ID
text: str = "" # text of the note, sent to Label Studio

# A title is only used when combining notes into one big encounter note. It's not sent to Label Studio.
title: str = ""

# Doc mappings is a dict of real DocRef ID -> anonymized DocRef ID of all contained notes, in order
doc_mappings: dict[str, str] = dataclasses.field(default_factory=dict)

# Doc spans indicate which bits of the text come from which DocRef - it will map real DocRef ID to a pair of
# "begin" (0-based, inclusive) and "end" (0-based, exclusive) offsets - just like cTAKES match text spans.
doc_spans: dict[str, tuple[int, int]] = dataclasses.field(default_factory=dict)

# Matches found by cTAKES
matches: list[ctakesclient.typesystem.MatchText] = dataclasses.field(default_factory=list)


class LabelStudioClient:
Expand Down Expand Up @@ -84,6 +97,7 @@ def _format_task_for_note(self, note: LabelStudioNote) -> dict:
"enc_id": note.enc_id,
"anon_id": note.anon_id,
"docref_mappings": note.doc_mappings,
"docref_spans": {k: list(v) for k, v in note.doc_spans.items()}, # json doesn't natively have tuples
},
}

Expand Down
2 changes: 1 addition & 1 deletion cumulus_etl/nlp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Support code for NLP servers"""

from .extract import ctakes_extract, ctakes_httpx_client, list_polarity
from .extract import TransformerModel, ctakes_extract, ctakes_httpx_client, list_polarity
from .huggingface import hf_prompt, hf_info, llama2_prompt
from .utils import cache_wrapper, is_docref_valid
from .watcher import check_negation_cnlpt, check_term_exists_cnlpt, check_ctakes, restart_ctakes_with_bsv
17 changes: 16 additions & 1 deletion docs/chart-review.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@ Along the way, it can mark the note with NLP results and/or anonymize the note w
This is useful not just for actual chart reviews, but also for developing a custom NLP dictionary.
You can feed Cumulus ETL a custom NLP dictionary, review how it performs, and iterate upon it.

## Preliminary Label Studio Setup
## Preliminaries

### Label Studio Setup

This guide assumes you already have a local instance of Label Studio running.
They offer Docker images and reasonable
Expand All @@ -27,6 +29,19 @@ If you haven't set that up yet, go do that and come back.
The Cumulus team can help you with setting it up if you come talk to us,
but the rest of this guide will mostly deal with chart review mode itself.

### Dependent Services

Some features of chart review mode need external services (like cTAKES to run NLP).
Launch those before you begin using chart review:

```shell
export UMLS_API_KEY=your-umls-api-key
docker compose --profile chart-review up -d
```

Or if you have access to a GPU,
you can speed up the NLP by launching the GPU profile instead with `--profile chart-review-gpu`.

## Basic Operation

At its core, chart review mode is just another ETL (extract, transform, load) operation.
Expand Down
Empty file added tests/chart_review/__init__.py
Empty file.
50 changes: 45 additions & 5 deletions tests/test_chart_cli.py → tests/chart_review/test_chart_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ async def run_chart_review(
await cli.main(args)

@staticmethod
def make_docref(doc_id: str, text: str = None, content: list[dict] = None) -> dict:
def make_docref(doc_id: str, text: str = None, content: list[dict] = None, enc_id: str = None) -> dict:
if content is None:
text = text or "What's up doc?"
content = [
Expand All @@ -117,11 +117,12 @@ def make_docref(doc_id: str, text: str = None, content: list[dict] = None) -> di
}
]

enc_id = enc_id or f"enc-{doc_id}"
return {
"resourceType": "DocumentReference",
"id": doc_id,
"content": content,
"context": {"encounter": [{"reference": f"Encounter/enc-{doc_id}"}]},
"context": {"encounter": [{"reference": f"Encounter/{enc_id}"}]},
}

@staticmethod
Expand Down Expand Up @@ -332,9 +333,7 @@ async def test_disabled_nlp(self):

@ddt.data(True, False)
async def test_philter(self, run_philter):
notes = [
LabelStudioNote("EncID", "EncAnon", {"DocID": "DocAnon"}, "My Title", "John Smith called on 10/13/2010")
]
notes = [LabelStudioNote("EncID", "EncAnon", title="My Title", text="John Smith called on 10/13/2010")]
with mock.patch("cumulus_etl.chart_review.cli.read_notes_from_ndjson", return_value=notes):
await self.run_chart_review(philter=run_philter)

Expand All @@ -350,3 +349,44 @@ async def test_philter(self, run_philter):
else:
expected_text = "John Smith called on 10/13/2010"
self.assertEqual(self.wrap_note("My Title", expected_text), task.text)

@respx.mock(assert_all_mocked=False)
async def test_combined_encounter_offsets(self, respx_mock):
    """Check the doc mappings, doc spans, and match offsets of a note built from two docrefs of one encounter."""
    # Fake server notes are simply the easiest kind to fabricate; both point at the same encounter
    for docref_id in ("D1", "D2"):
        self.mock_read_url(respx_mock, docref_id, enc_id="43")
    ctakes_route = respx_mock.post(os.environ["URL_CTAKES_REST"])
    ctakes_route.pass_through()  # ignore cTAKES

    with tempfile.NamedTemporaryFile() as docref_file:
        self.write_real_docrefs(docref_file.name, ["D1", "D2"])
        await self.run_chart_review(input_path="https://localhost", docrefs=docref_file.name)

    pushed_notes = self.ls_client.push_tasks.call_args[0][0]
    self.assertEqual(1, len(pushed_notes))
    combined = pushed_notes[0]

    # Both docref IDs should be recorded against the single pushed note
    self.assertEqual({"D1": ANON_D1, "D2": ANON_D2}, combined.doc_mappings)

    # Each docref's own text should sit at its recorded (begin, end) span inside the combined text
    expected_doc_spans = {"D1": (93, 107), "D2": (285, 299)}
    for begin, end in expected_doc_spans.values():
        self.assertEqual("What's up doc?", combined.text[begin:end])
    self.assertEqual(expected_doc_spans, combined.doc_spans)

    # The cTAKES matches should point at the right words of the combined text
    expected_matches = {
        (93, 99): "What's",
        (100, 102): "up",
        (103, 107): "doc?",
        (285, 291): "What's",
        (292, 294): "up",
        (295, 299): "doc?",
    }
    for (begin, end), word in expected_matches.items():
        self.assertEqual(word, combined.text[begin:end])
    actual_matches = {match.span().key() for match in combined.matches}
    self.assertEqual(set(expected_matches), actual_matches)
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,14 @@ def setUp(self):

@staticmethod
def make_note(*, enc_id: str = "enc", matches: bool = True) -> LabelStudioNote:
note = LabelStudioNote(enc_id, "enc-anon", {"doc": "doc-anon"}, "Ignored Title", "Normal note text")
text = "Normal note text"
note = LabelStudioNote(
enc_id,
"enc-anon",
doc_mappings={"doc": "doc-anon"},
doc_spans={"doc": (0, len(text))},
text=text,
)
if matches:
note.matches = ctakesmock.fake_ctakes_extract(note.text).list_match(polarity=Polarity.pos)
return note
Expand Down Expand Up @@ -61,6 +68,7 @@ def test_basic_push(self):
"enc_id": "enc",
"anon_id": "enc-anon",
"docref_mappings": {"doc": "doc-anon"},
"docref_spans": {"doc": [0, 16]},
},
"predictions": [
{
Expand Down Expand Up @@ -109,6 +117,7 @@ def test_no_matches(self):
"enc_id": "enc",
"anon_id": "enc-anon",
"docref_mappings": {"doc": "doc-anon"},
"docref_spans": {"doc": [0, 16]},
},
"predictions": [
{
Expand All @@ -132,6 +141,7 @@ def test_dynamic_labels(self, label_type):
"enc_id": "enc",
"anon_id": "enc-anon",
"docref_mappings": {"doc": "doc-anon"},
"docref_spans": {"doc": [0, 16]},
"mylabel": [
{"value": "Itch"},
{"value": "Nausea"},
Expand All @@ -151,6 +161,7 @@ def test_dynamic_labels_no_matches(self):
"enc_id": "enc",
"anon_id": "enc-anon",
"docref_mappings": {"doc": "doc-anon"},
"docref_spans": {"doc": [0, 16]},
"mylabel": [], # this needs to be sent, or the server will complain
},
self.get_pushed_task()["data"],
Expand Down

0 comments on commit c5c513c

Please sign in to comment.