diff --git a/compose.yaml b/compose.yaml
index 72018470..819f678e 100644
--- a/compose.yaml
+++ b/compose.yaml
@@ -13,7 +13,8 @@ services:
       - AWS_DEFAULT_PROFILE
       - CUMULUS_HUGGING_FACE_URL=http://llama2:8086/
       - URL_CTAKES_REST=http://ctakes-covid:8080/ctakes-web-rest/service/analyze
-      - URL_CNLP_NEGATION=http://cnlp-transformers:8000/negation/process
+      - URL_CNLP_NEGATION=http://cnlpt-negation:8000/negation/process
+      - URL_CNLP_TERM_EXISTS=http://cnlpt-term-exists:8000/termexists/process
     volumes:
       - $HOME/.aws/:/root/.aws/:ro
       - ctakes-overrides:/ctakes-overrides
@@ -30,12 +31,13 @@ services:
   cumulus-etl-gpu:
     extends: cumulus-etl-base
     environment:
-      - URL_CNLP_NEGATION=http://cnlp-transformers-gpu:8000/negation/process
+      - URL_CNLP_NEGATION=http://cnlpt-negation-gpu:8000/negation/process
+      - URL_CNLP_TERM_EXISTS=http://cnlpt-term-exists-gpu:8000/termexists/process
     profiles:
       - etl-gpu
 
   ctakes-covid-base:
-    image: smartonfhir/ctakes-covid:1.1
+    image: smartonfhir/ctakes-covid:1.1.0
     environment:
       - ctakes_umlsuser=umls_api_key 
       - ctakes_umlspw=$UMLS_API_KEY
@@ -49,20 +51,39 @@ services:
   ctakes-covid:
     extends: ctakes-covid-base
     profiles:
-      - etl-support
-      - etl-support-gpu
+      - covid-symptom
+      - covid-symptom-gpu
 
-  cnlp-transformers:
-    image: smartonfhir/cnlp-transformers:negation-0.4-cpu
+  cnlpt-negation:
+    image: smartonfhir/cnlp-transformers:negation-0.6.1-cpu
     profiles:
-      - etl-support
+      - covid-symptom
     networks:
       - cumulus-etl
 
-  cnlp-transformers-gpu:
-    image: smartonfhir/cnlp-transformers:negation-0.4-gpu
+  cnlpt-negation-gpu:
+    image: smartonfhir/cnlp-transformers:negation-0.6.1-gpu
     profiles:
-      - etl-support-gpu
+      - covid-symptom-gpu
+    networks:
+      - cumulus-etl
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - capabilities: [gpu]
+
+  cnlpt-term-exists:
+    image: smartonfhir/cnlp-transformers:termexists-0.6.1-cpu
+    profiles:
+      - covid-symptom
+    networks:
+      - cumulus-etl
+
+  cnlpt-term-exists-gpu:
+    image: smartonfhir/cnlp-transformers:termexists-0.6.1-gpu
+    profiles:
+      - covid-symptom-gpu
     networks:
       - cumulus-etl
     deploy:
@@ -111,7 +132,7 @@ services:
     volumes: 
       - ./:/cumulus-etl/
     working_dir: /cumulus-etl
-    command: 
+    command:
       - /cumulus-etl/tests/data/simple/ndjson-input 
       - /cumulus-etl/example-output 
       - /cumulus-etl/example-phi-build 
diff --git a/cumulus_etl/etl/studies/covid_symptom/__init__.py b/cumulus_etl/etl/studies/covid_symptom/__init__.py
index 6e5b7948..ec307d2c 100644
--- a/cumulus_etl/etl/studies/covid_symptom/__init__.py
+++ b/cumulus_etl/etl/studies/covid_symptom/__init__.py
@@ -1,3 +1,3 @@
 """The covid_symptom study"""
 
-from .covid_tasks import CovidSymptomNlpResultsTask
+from .covid_tasks import CovidSymptomNlpResultsTask, CovidSymptomNlpResultsTermExistsTask
diff --git a/cumulus_etl/etl/studies/covid_symptom/covid_ctakes.py b/cumulus_etl/etl/studies/covid_symptom/covid_ctakes.py
index 377ef088..d21ff2b9 100644
--- a/cumulus_etl/etl/studies/covid_symptom/covid_ctakes.py
+++ b/cumulus_etl/etl/studies/covid_symptom/covid_ctakes.py
@@ -4,15 +4,17 @@
 
 import ctakesclient
 import httpx
+from ctakesclient.transformer import TransformerModel
 
 from cumulus_etl import common, fhir, nlp, store
 
 
 async def covid_symptoms_extract(
-    client: fhir.FhirClient,
     cache: store.Root,
     docref: dict,
+    clinical_note: str,
     *,
+    polarity_model: TransformerModel,
     task_version: int,
     ctakes_http_client: httpx.AsyncClient = None,
     cnlp_http_client: httpx.AsyncClient = None,
@@ -20,9 +22,10 @@ async def covid_symptoms_extract(
     """
     Extract a list of Observations from NLP-detected symptoms in clinical notes
 
-    :param client: a client ready to talk to a FHIR server
     :param cache: Where to cache NLP results
-    :param docref: Clinical Note
+    :param docref: DocumentReference resource (scrubbed)
+    :param clinical_note: the clinical note already extracted from the docref
+    :param polarity_model: how to test the polarity of cTAKES responses
     :param task_version: version of task to inject into results
     :param ctakes_http_client: HTTPX client to use for the cTAKES server
     :param cnlp_http_client: HTTPX client to use for the cNLP transformer server
@@ -37,22 +40,22 @@ async def covid_symptoms_extract(
         return None
     _, encounter_id = fhir.unref_resource(encounters[0])
 
-    # Find the clinical note among the attachments
-    try:
-        clinical_note = await fhir.get_docref_note(client, docref)
-    except Exception as exc:  # pylint: disable=broad-except
-        logging.warning("Error getting text for docref %s: %s", docref_id, exc)
-        return None
-
     # cTAKES cache namespace history (and thus, cache invalidation history):
     #   v1: original cTAKES processing
     #   v2+: see CovidSymptomNlpResultsTask's version history
     ctakes_namespace = f"covid_symptom_v{task_version}"
 
-    # cNLP cache namespace history (and thus, cache invalidation history):
-    #   v1: original addition of cNLP filtering
-    #   v2: we started dropping non-covid symptoms, which changes the span ordering
-    cnlp_namespace = f"{ctakes_namespace}-cnlp_v2"
+    match polarity_model:
+        case TransformerModel.NEGATION:  # original
+            # cNLP cache namespace history (and thus, cache invalidation history):
+            #   v1: original addition of cNLP filtering
+            #   v2: we started dropping non-covid symptoms, which changes the span ordering
+            cnlp_namespace = f"{ctakes_namespace}-cnlp_v2"
+        case TransformerModel.TERM_EXISTS:
+            cnlp_namespace = f"{ctakes_namespace}-cnlp_term_exists_v1"
+        case _:
+            logging.warning("Unknown polarity method: %s", polarity_model.value)
+            return None
 
     timestamp = common.datetime_now().isoformat()
 
@@ -76,9 +79,11 @@ def is_covid_match(m: ctakesclient.typesystem.MatchText):
     # there too. We have found this to yield better results than cTAKES alone.
     try:
         spans = ctakes_json.list_spans(matches)
-        polarities_cnlp = await nlp.list_polarity(cache, cnlp_namespace, clinical_note, spans, client=cnlp_http_client)
+        polarities_cnlp = await nlp.list_polarity(
+            cache, cnlp_namespace, clinical_note, spans, model=polarity_model, client=cnlp_http_client
+        )
     except Exception as exc:  # pylint: disable=broad-except
-        logging.warning("Could not check negation for docref %s (%s): %s", docref_id, type(exc).__name__, exc)
+        logging.warning("Could not check polarity for docref %s (%s): %s", docref_id, type(exc).__name__, exc)
         return None
 
     # Now filter out any non-positive matches
diff --git a/cumulus_etl/etl/studies/covid_symptom/covid_tasks.py b/cumulus_etl/etl/studies/covid_symptom/covid_tasks.py
index 95db4791..b4081934 100644
--- a/cumulus_etl/etl/studies/covid_symptom/covid_tasks.py
+++ b/cumulus_etl/etl/studies/covid_symptom/covid_tasks.py
@@ -1,14 +1,13 @@
 """Define tasks for the covid_symptom study"""
 
-import copy
 import itertools
-import os
 
 import ctakesclient
 import pyarrow
 import rich.progress
+from ctakesclient.transformer import TransformerModel
 
-from cumulus_etl import common, formats, nlp, store
+from cumulus_etl import formats, nlp, store
 from cumulus_etl.etl import tasks
 from cumulus_etl.etl.studies.covid_symptom import covid_ctakes
 
@@ -58,42 +57,13 @@ def is_ed_docref(docref):
     return any(is_ed_coding(x) for x in codings)
 
 
-class CovidSymptomNlpResultsTask(tasks.EtlTask):
-    """Covid Symptom study task, to generate symptom lists from ED notes using NLP"""
+class BaseCovidSymptomNlpResultsTask(tasks.BaseNlpTask):
+    """Covid Symptom study task, to generate symptom lists from ED notes using cTAKES + a polarity check"""
 
-    name = "covid_symptom__nlp_results"
-    resource = "DocumentReference"
-    tags = {"covid_symptom", "gpu"}
-    needs_bulk_deid = False
-    outputs = [tasks.OutputTable(schema=None, group_field="docref_id")]
-
-    # Task Version
-    # The "task_version" field is a simple integer that gets incremented any time an NLP-relevant parameter is changed.
-    # This is a reference to a bundle of metadata (cTAKES version, cNLP version, ICD10 code list).
-    # We could combine all that info into a field we save with the results. But it's more human-friendly to have a
-    # simple version to refer to. So anytime these properties get changed, bump the version and record the old bundle
-    # of metadata too.
-    task_version = 2
-
-    # Task Version History:
-    # ** 2 (2023-08): Corrected the cache location (version 1 results might be using stale cache) **
-    #   cTAKES: smartonfhir/ctakes-covid:1.1
-    #   cNLP: smartonfhir/cnlp-transformers:negation-0.4
-    #   ctakesclient: 5.0
-    #
-    # ** 1 (2023-08): Updated ICD10 codes from ctakesclient **
-    #   cTAKES: smartonfhir/ctakes-covid:1.1
-    #   cNLP: smartonfhir/cnlp-transformers:negation-0.4
-    #   ctakesclient: 5.0
-    #
-    # ** null (before we added a task version) **
-    #   cTAKES: smartonfhir/ctakes-covid:1.1
-    #   cNLP: smartonfhir/cnlp-transformers:negation-0.4
-    #   ctakesclient: 3.0
+    # Subclasses: set name, tags, task_version, and polarity_model yourself
+    polarity_model = None
 
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.seen_docrefs = set()
+    outputs = [tasks.OutputTable(schema=None, group_field="docref_id")]
 
     async def prepare_task(self) -> bool:
         bsv_path = ctakesclient.filesystem.covid_symptoms_path()
@@ -103,15 +73,6 @@ async def prepare_task(self) -> bool:
             self.summaries[0].had_errors = True
         return success
 
-    def add_error(self, docref: dict) -> None:
-        if not self.task_config.dir_errors:
-            return
-
-        error_root = store.Root(os.path.join(self.task_config.dir_errors, self.name), create=True)
-        error_path = error_root.joinpath("nlp-errors.ndjson")
-        with common.NdjsonWriter(error_path, "a") as writer:
-            writer.write(docref)
-
     async def read_entries(self, *, progress: rich.progress.Progress = None) -> tasks.EntryIterator:
         """Passes clinical notes through NLP and returns any symptoms found"""
         phi_root = store.Root(self.task_config.dir_phi, create=True)
@@ -119,29 +80,17 @@ async def read_entries(self, *, progress: rich.progress.Progress = None) -> task
         # one client for both NLP services for now -- no parallel requests yet, so no need to be fancy
         http_client = nlp.ctakes_httpx_client()
 
-        for docref in self.read_ndjson(progress=progress):
-            if not nlp.is_docref_valid(docref):
-                continue
-
-            # Check that the note is one of our special allow-listed types (we do this here rather than on the output
-            # side to save needing to run everything through NLP).
-            if not is_ed_docref(docref):
-                continue
-
-            orig_docref = copy.deepcopy(docref)
-            if not self.scrubber.scrub_resource(docref, scrub_attachments=False):
-                continue
-
+        async for orig_docref, docref, clinical_note in self.read_notes(progress=progress, doc_check=is_ed_docref):
             symptoms = await covid_ctakes.covid_symptoms_extract(
-                self.task_config.client,
                 phi_root,
                 docref,
+                clinical_note,
+                polarity_model=self.polarity_model,
                 task_version=self.task_version,
                 ctakes_http_client=http_client,
                 cnlp_http_client=http_client,
             )
             if symptoms is None:
-                self.summaries[0].had_errors = True
                 self.add_error(orig_docref)
                 continue
 
@@ -159,11 +108,6 @@ async def read_entries(self, *, progress: rich.progress.Progress = None) -> task
             # The Format class will replace all existing symptoms from this note at once (because we set group_field).
             yield symptoms
 
-    def pop_current_group_values(self, table_index: int) -> set[str]:
-        values = self.seen_docrefs
-        self.seen_docrefs = set()
-        return values
-
     @classmethod
     def get_schema(cls, formatter: formats.Format, rows: list[dict]) -> pyarrow.Schema:
         return pyarrow.schema(
@@ -201,3 +145,51 @@ def get_schema(cls, formatter: formats.Format, rows: list[dict]) -> pyarrow.Sche
                 ),
             ]
         )
+
+
+class CovidSymptomNlpResultsTask(BaseCovidSymptomNlpResultsTask):
+    """Covid Symptom study task, to generate symptom lists from ED notes using cTAKES and cnlpt negation"""
+
+    name = "covid_symptom__nlp_results"
+    tags = {"covid_symptom", "gpu"}
+    polarity_model = TransformerModel.NEGATION  # cTAKES matches get polarity-checked with the cnlpt negation model
+
+    task_version = 3  # bumping this also starts a fresh NLP cache namespace (covid_symptom_v{task_version})
+    # Task Version History (newest first):
+    # ** 3 (2023-09): Updated to cnlpt version 0.6.1 **
+    #   cTAKES: smartonfhir/ctakes-covid:1.1.0
+    #   cNLP: smartonfhir/cnlp-transformers:negation-0.6.1
+    #   ctakesclient: 5.0
+    #
+    # ** 2 (2023-08): Corrected the cache location (version 1 results might be using stale cache) **
+    #   cTAKES: smartonfhir/ctakes-covid:1.1.0
+    #   cNLP: smartonfhir/cnlp-transformers:negation-0.4.0
+    #   ctakesclient: 5.0
+    #
+    # ** 1 (2023-08): Updated ICD10 codes from ctakesclient **
+    #   cTAKES: smartonfhir/ctakes-covid:1.1.0
+    #   cNLP: smartonfhir/cnlp-transformers:negation-0.4.0
+    #   ctakesclient: 5.0
+    #
+    # ** null (before we added a task version) **
+    #   cTAKES: smartonfhir/ctakes-covid:1.1.0
+    #   cNLP: smartonfhir/cnlp-transformers:negation-0.4.0
+    #   ctakesclient: 3.0
+
+
+class CovidSymptomNlpResultsTermExistsTask(BaseCovidSymptomNlpResultsTask):
+    """Covid Symptom study task, to generate symptom lists from ED notes using cTAKES and cnlpt termexists"""
+
+    name = "covid_symptom__nlp_results_term_exists"
+    polarity_model = TransformerModel.TERM_EXISTS
+
+    # Explicitly don't use any tags because this is really a "hidden" task that is mostly for comparing
+    # polarity model performance more than running a study. So we don't want it to be accidentally run.
+    tags = set()  # not {}, which is an empty dict rather than an empty set like every other task's tags
+
+    task_version = 1
+    # Task Version History:
+    # ** 1 (2023-09): First version **
+    #   cTAKES: smartonfhir/ctakes-covid:1.1.0
+    #   cNLP: smartonfhir/cnlp-transformers:termexists-0.6.1
+    #   ctakesclient: 5.0
diff --git a/cumulus_etl/etl/studies/hftest/hf_tasks.py b/cumulus_etl/etl/studies/hftest/hf_tasks.py
index a64f1386..588879f0 100644
--- a/cumulus_etl/etl/studies/hftest/hf_tasks.py
+++ b/cumulus_etl/etl/studies/hftest/hf_tasks.py
@@ -6,26 +6,16 @@
 import pyarrow
 import rich.progress
 
-from cumulus_etl import common, fhir, formats, nlp
+from cumulus_etl import common, formats, nlp
 from cumulus_etl.etl import tasks
 
 
-class HuggingFaceTestTask(tasks.EtlTask):
+class HuggingFaceTestTask(tasks.BaseNlpTask):
     """Hugging Face Test study task, to generate a summary from text"""
 
     name = "hftest__summary"
-    resource = "DocumentReference"
-    needs_bulk_deid = False
-    outputs = [tasks.OutputTable(schema=None)]
-
-    # Task Version
-    # The "task_version" field is a simple integer that gets incremented any time an NLP-relevant parameter is changed.
-    # This is a reference to a bundle of metadata (model revision, container revision, prompt string).
-    # We could combine all that info into a field we save with the results. But it's more human-friendly to have a
-    # simple version to refer to. So anytime these properties get changed, bump the version and record the old bundle
-    # of metadata too. Also update the safety checks in prepare_task()
-    task_version = 0
 
+    task_version = 0
     # Task Version History:
     # ** 0 **
     #   This is fluid until we actually promote this to a real task - feel free to update without bumping the version.
@@ -66,18 +56,7 @@ async def read_entries(self, *, progress: rich.progress.Progress = None) -> task
         """Passes clinical notes through HF and returns any symptoms found"""
         http_client = httpx.AsyncClient(timeout=300)
 
-        for docref in self.read_ndjson(progress=progress):
-            can_process = nlp.is_docref_valid(docref) and self.scrubber.scrub_resource(docref, scrub_attachments=False)
-            if not can_process:
-                continue
-
-            try:
-                clinical_note = await fhir.get_docref_note(self.task_config.client, docref)
-            except Exception as exc:  # pylint: disable=broad-except
-                logging.warning("Error getting text for docref %s: %s", docref["id"], exc)
-                self.summaries[0].had_errors = True
-                continue
-
+        async for _, docref, clinical_note in self.read_notes(progress=progress):
             timestamp = common.datetime_now().isoformat()
 
             # If you change this prompt, consider updating task_version.
diff --git a/cumulus_etl/etl/tasks/__init__.py b/cumulus_etl/etl/tasks/__init__.py
index acbb894f..601a39fa 100644
--- a/cumulus_etl/etl/tasks/__init__.py
+++ b/cumulus_etl/etl/tasks/__init__.py
@@ -1,4 +1,7 @@
 """Task support for the ETL workflow"""
 
 from .base import EntryIterator, EtlTask, OutputTable
+from .nlp_task import BaseNlpTask
+
+# Import this last because importing specific tasks will want the above classes to be available
 from .factory import get_all_tasks, get_default_tasks, get_selected_tasks
diff --git a/cumulus_etl/etl/tasks/factory.py b/cumulus_etl/etl/tasks/factory.py
index 13cab199..97d28a43 100644
--- a/cumulus_etl/etl/tasks/factory.py
+++ b/cumulus_etl/etl/tasks/factory.py
@@ -29,6 +29,8 @@ def get_all_tasks() -> list[type[AnyTask]]:
     # Right now, just hard-code these. One day we might allow plugins or something similarly dynamic.
     # Note: tasks will be run in the order listed here.
     return get_default_tasks() + [
+        covid_symptom.CovidSymptomNlpResultsTask,
+        covid_symptom.CovidSymptomNlpResultsTermExistsTask,
         hftest.HuggingFaceTestTask,
     ]
 
@@ -52,7 +54,6 @@ def get_default_tasks() -> list[type[AnyTask]]:
         ObservationTask,
         ProcedureTask,
         ServiceRequestTask,
-        covid_symptom.CovidSymptomNlpResultsTask,  # TODO: remove from default list at some point
     ]
 
 
diff --git a/cumulus_etl/etl/tasks/nlp_task.py b/cumulus_etl/etl/tasks/nlp_task.py
new file mode 100644
index 00000000..f340db05
--- /dev/null
+++ b/cumulus_etl/etl/tasks/nlp_task.py
@@ -0,0 +1,97 @@
+"""Base NLP task support"""
+
+import copy
+import logging
+import os
+from typing import AsyncIterator, Callable
+
+import rich.progress
+
+from cumulus_etl import common, fhir, nlp, store
+from cumulus_etl.etl.tasks.base import EtlTask, OutputTable
+
+
+class BaseNlpTask(EtlTask):
+    """Base class for any clinical-notes-based NLP task."""
+
+    resource = "DocumentReference"
+    needs_bulk_deid = False
+
+    # You may want to override these in your subclass
+    outputs = [OutputTable(schema=None)]  # maybe a group_field? (remember to call self.seen_docrefs.add() if so)
+    tags = {"gpu"}  # maybe a study identifier?
+
+    # Task Version
+    # The "task_version" field is a simple integer that gets incremented any time an NLP-relevant parameter is changed.
+    # This is a reference to a bundle of metadata (model revision, container revision, prompt string).
+    # We could combine all that info into a field we save with the results. But it's more human-friendly to have a
+    # simple version to refer to.
+    #
+    # CONSIDERATIONS WHEN CHANGING THIS:
+    # - Record the new bundle of metadata in your class documentation
+    # - Update any safety checks in prepare_task() or elsewhere that check the NLP versioning
+    # - Be aware that your caching will be reset
+    task_version = 1
+    # Task Version History:
+    # ** 1 (20xx-xx): First version **
+    #   CHANGE ME
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Group values that subclasses noted (via self.seen_docrefs.add()) since the last pop_current_group_values()
+        self.seen_docrefs = set()
+
+    def pop_current_group_values(self, table_index: int) -> set[str]:
+        """Returns the group values collected in self.seen_docrefs so far, and starts over with a fresh empty set."""
+        values = self.seen_docrefs
+        self.seen_docrefs = set()
+        return values
+
+    def add_error(self, docref: dict) -> None:
+        """
+        Flags this task's first summary as having had errors and records the offending docref.
+
+        The docref is appended to {dir_errors}/{task name}/nlp-errors.ndjson, but only if an error dir is configured.
+
+        :param docref: the DocumentReference to write to the error file
+        """
+        self.summaries[0].had_errors = True
+
+        if not self.task_config.dir_errors:
+            return
+        error_root = store.Root(os.path.join(self.task_config.dir_errors, self.name), create=True)
+        error_path = error_root.joinpath("nlp-errors.ndjson")
+        with common.NdjsonWriter(error_path, "a") as writer:
+            writer.write(docref)
+
+    async def read_notes(
+        self, *, doc_check: Callable[[dict], bool] | None = None, progress: rich.progress.Progress | None = None
+    ) -> AsyncIterator[tuple[dict, dict, str]]:
+        """
+        Iterate through clinical notes.
+
+        Docrefs that fail validation, the optional doc_check, or scrubbing are silently skipped.
+        Docrefs whose note text cannot be retrieved are logged and recorded via add_error(), then skipped.
+
+        :param doc_check: optional extra filter; docrefs for which it returns a falsy value are skipped
+        :param progress: passed through to read_ndjson()
+        :returns: an async iterator of (original-docref, scrubbed-docref, clinical note) tuples
+        """
+        for docref in self.read_ndjson(progress=progress):
+            # Filter first (these checks are expected to only inspect the docref), so skipped docrefs are never copied
+            if not nlp.is_docref_valid(docref) or (doc_check is not None and not doc_check(docref)):
+                continue
+
+            # Scrubbing alters the docref, so grab a pristine copy beforehand (it is what we report on errors)
+            orig_docref = copy.deepcopy(docref)
+            if not self.scrubber.scrub_resource(docref, scrub_attachments=False):
+                continue
+
+            try:
+                clinical_note = await fhir.get_docref_note(self.task_config.client, docref)
+            except Exception as exc:  # pylint: disable=broad-except
+                logging.warning("Error getting text for docref %s: %s", docref["id"], exc)
+                self.add_error(orig_docref)
+                continue
+
+            yield orig_docref, docref, clinical_note
diff --git a/cumulus_etl/nlp/extract.py b/cumulus_etl/nlp/extract.py
index e267af72..08ae9ab8 100644
--- a/cumulus_etl/nlp/extract.py
+++ b/cumulus_etl/nlp/extract.py
@@ -5,6 +5,7 @@
 
 import ctakesclient
 import httpx
+from ctakesclient.transformer import TransformerModel
 
 from cumulus_etl import common, store
 
@@ -42,6 +43,7 @@ async def list_polarity(
     sentence: str,
     spans: list[tuple],
     client: httpx.AsyncClient = None,
+    model: TransformerModel = TransformerModel.NEGATION,
 ) -> list[ctakesclient.typesystem.Polarity]:
     """
     This is a version of ctakesclient.transformer.list_polarity() that also uses a cache
@@ -57,7 +59,7 @@ async def list_polarity(
     try:
         result = [ctakesclient.typesystem.Polarity(x) for x in common.read_json(full_path)]
     except Exception:  # pylint: disable=broad-except
-        result = await ctakesclient.transformer.list_polarity(sentence, spans, client=client)
+        result = await ctakesclient.transformer.list_polarity(sentence, spans, client=client, model=model)
         cache.makedirs(os.path.dirname(full_path))
         common.write_json(full_path, [x.value for x in result])
 
diff --git a/docs/performance.md b/docs/performance.md
index 6fe422b0..ce5dd05d 100644
--- a/docs/performance.md
+++ b/docs/performance.md
@@ -91,14 +91,14 @@ Cumulus ETL is deployed with Docker images.
 And because of the way Docker interacts with the GPU, we define a whole second set of profiles for GPU usage.
 
 Normally, you specify profile & image names in a couple places:
-1. When starting the support tools (cTAKES etc):<br>
-`docker compose --profile etl-support up`
+1. When starting a study's support tools (cTAKES etc):<br>
+`docker compose --profile covid-symptom up`
 1. When running the ETL tool:<br>
 `docker compose run cumulus-etl`
 
 To work with the GPU version of Cumulus ETL, just add `-gpu` to each of those names
 wherever they appear in [instructions](sample-runs.md):
-1. <code>docker compose --profile etl-support<b>-gpu</b> up</code>
+1. <code>docker compose --profile covid-symptom<b>-gpu</b> up</code>
 1. <code>docker compose run cumulus-etl<b>-gpu</b></code>
 
 ### Cloud Access to a GPU
diff --git a/docs/setup/sample-runs.md b/docs/setup/sample-runs.md
index 2915a46a..67dcfdbd 100644
--- a/docs/setup/sample-runs.md
+++ b/docs/setup/sample-runs.md
@@ -112,7 +112,6 @@ Once you've done that, you'll need the UMLS key mentioned at the top of this doc
 to start the network (here we're setting the UMLS_API_KEY, which cTAKES requires):
 ```sh
 export UMLS_API_KEY=your-umls-api-key
-docker compose -f $CUMULUS_REPO_PATH/compose.yaml --profile etl-support up -d
 ```
 
 The compose file will handle the environment variable mapping and volume mounts for you.
diff --git a/docs/studies/covid-symptom.md b/docs/studies/covid-symptom.md
new file mode 100644
index 00000000..3de8d0c2
--- /dev/null
+++ b/docs/studies/covid-symptom.md
@@ -0,0 +1,50 @@
+---
+title: Covid Symptom
+parent: Studies
+grand_parent: ETL
+nav_order: 1
+# audience: engineer familiar with the project
+# type: howto
+---
+
+# The Covid Symptom Study
+
+This study uses NLP to identify symptoms of COVID-19 in clinical notes.
+Specifically, it uses [cTAKES](https://ctakes.apache.org/) and
+[cNLP transformers](https://github.com/Machine-Learning-for-Medical-Language/cnlp_transformers)
+to identify clinical terms.
+
+## Preparation
+
+Because it uses external services like cTAKES, you will want to make sure those are ready.
+From your git clone of the `cumulus-etl` repo, you can run the following to start those services:
+```shell
+docker compose --profile covid-symptom-gpu up -d
+```
+
+You'll notice the `-gpu` suffix there.
+Running NLP is much, much faster with access to a GPU,
+so we strongly recommend you run this on GPU-enabled hardware.
+
+And since we _are_ running the GPU profile, when you do run the ETL,
+you'll want to launch the GPU mode instead of the default `cumulus-etl` CPU mode:
+```shell
+docker compose run cumulus-etl-gpu …
+```
+
+But if you can't use a GPU or you just want to test things out,
+you can use `--profile covid-symptom` above and the normal `cumulus-etl` run line to use the CPU.
+
+## Tasks
+
+There is one main task, run with `--task covid_symptom__nlp_results`.
+
+This will need access to clinical notes,
+which are pulled fresh from your EHR (since the ETL doesn't store clinical notes).
+This means you will likely have to provide some other FHIR authentication arguments like
+`--smart-client-id` and `--fhir-url`.
+See `--help` for more authentication options.
+
+There is also a second optional task `--task covid_symptom__nlp_results_term_exists`,
+which just uses a different polarity cNLP transformer (`termexists` rather than `negation`).
+You likely don't need both, but they may be interesting to compare.
diff --git a/docs/studies/index.md b/docs/studies/index.md
new file mode 100644
index 00000000..c0126875
--- /dev/null
+++ b/docs/studies/index.md
@@ -0,0 +1,21 @@
+---
+title: Studies
+parent: ETL
+nav_order: 3
+has_children: true
+# audience: engineer familiar with the project
+# type: howto
+---
+
+# Study-specific Tasks
+
+In addition to the default basic-FHIR-oriented Cumulus ETL tasks like `condition`,
+which simply strip identifying information and largely leave the FHIR alone,
+there are also more interesting study-oriented tasks.
+
+These tend to be NLP tasks that extract information from clinical notes.
+
+They aren't run by default,
+but you can provide the ones you are interested in with the `--task` parameter.
+
+In this folder, you can read further explanations of how to run each built-in study.
diff --git a/pyproject.toml b/pyproject.toml
index 6294fe0c..827f44c8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,7 @@ requires-python = ">= 3.10"
 # open-pinned dependencies so that we're more likely to notice new releases (we'll still have time
 # to fix any breakages since users won't immediately see the problem).
 dependencies = [
-    "ctakesclient >= 5, < 6",
+    "ctakesclient >= 5.1, < 6",
     "delta-spark >= 2.3, < 3",
     "fhirclient < 5",
     "httpx < 1",
@@ -20,7 +20,7 @@ dependencies = [
     "label-studio-sdk < 1",
     "oracledb < 2",
     "philter-lite < 1",
-    "pyarrow < 13",
+    "pyarrow < 14",
     "rich < 14",
     "s3fs",
 ]
diff --git a/tests/convert/test_convert_cli.py b/tests/convert/test_convert_cli.py
index b00f20e2..a55f1f20 100644
--- a/tests/convert/test_convert_cli.py
+++ b/tests/convert/test_convert_cli.py
@@ -29,8 +29,9 @@ def setUp(self):
 
     def prepare_original_dir(self) -> str:
         """Returns the job timestamp used, for easier inspection"""
-        # Fill in original dir
+        # Fill in original dir, including a non-default output folder
         shutil.copytree(f"{self.datadir}/simple/output", self.original_path)
+        shutil.copytree(f"{self.datadir}/covid/term-exists", self.original_path, dirs_exist_ok=True)
         os.makedirs(f"{self.original_path}/ignored")  # just to confirm we only copy what we understand
 
         job_timestamp = "2023-02-28__19.53.08"
@@ -73,6 +74,7 @@ async def test_happy_path(self):
 
         # Test first conversion results
         expected_tables = {output.get_name(t) for t in tasks.get_default_tasks() for output in t.outputs}
+        expected_tables.add("covid_symptom__nlp_results_term_exists")  # this was our non-default added table
         self.assertEqual(expected_tables | {"JobConfig"}, set(os.listdir(self.target_path)))
         self.assertEqual(
             {"test": True}, common.read_json(f"{self.target_path}/JobConfig/{job_timestamp}/job_config.json")
@@ -85,8 +87,8 @@ async def test_happy_path(self):
         conditions = utils.read_delta_lake(f"{self.target_path}/condition")  # and conditions
         self.assertEqual(2, len(conditions))
         self.assertEqual("2010-03-02", conditions[0]["recordedDate"])
-        symptoms = utils.read_delta_lake(f"{self.target_path}/covid_symptom__nlp_results")  # and covid symptoms
-        self.assertEqual(4, len(symptoms))
+        symptoms = utils.read_delta_lake(f"{self.target_path}/covid_symptom__nlp_results_term_exists")  # and covid
+        self.assertEqual(2, len(symptoms))
         self.assertEqual("for", symptoms[0]["match"]["text"])
 
         # Now make a second small, partial output folder to layer into the existing Delta Lake
@@ -123,14 +125,21 @@ async def test_batch_metadata(self, mock_write):
             f"{self.datadir}/simple/output/patient",
             f"{self.original_path}/patient",
         )
-        shutil.copytree(  # Then, one that does (from batched-output, to confirm we read each batch in turn)
-            f"{self.datadir}/simple/batched-output/covid_symptom__nlp_results",
+        shutil.copytree(  # Then, one that does
+            f"{self.datadir}/covid/output/covid_symptom__nlp_results",
             f"{self.original_path}/covid_symptom__nlp_results",
         )
-        common.write_json(  # change metadata to reference nonexistent group, to confirm we do read from this file
+        # And make a second batch, to confirm we read each meta file
+        common.write_json(
             f"{self.original_path}/covid_symptom__nlp_results/covid_symptom__nlp_results.001.meta",
+            # Reference a group that doesn't exist to prove we are reading this file and not just pooling group_fields
+            # that we see in the data.
             {"groups": ["nonexistent"]},
         )
+        common.write_json(
+            f"{self.original_path}/covid_symptom__nlp_results/covid_symptom__nlp_results.001.ndjson",
+            {"id": "D1.0", "docref_id": "D1"},
+        )
         os.makedirs(f"{self.original_path}/JobConfig")
 
         # Run conversion
@@ -138,11 +147,12 @@ async def test_batch_metadata(self, mock_write):
 
         # Test results
         self.assertEqual(3, mock_write.call_count)
-        self.assertEqual(set(), mock_write.call_args_list[0][0][0].groups)
+        self.assertEqual(set(), mock_write.call_args_list[0][0][0].groups)  # patients
         self.assertEqual(
             {
-                "f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd",
+                "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d",
+                "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e",
             },
-            mock_write.call_args_list[1][0][0].groups,
+            mock_write.call_args_list[1][0][0].groups,  # first (actual) covid batch
         )
-        self.assertEqual({"nonexistent"}, mock_write.call_args_list[2][0][0].groups)
+        self.assertEqual({"nonexistent"}, mock_write.call_args_list[2][0][0].groups)  # second (faked) covid batch
diff --git a/tests/covid_symptom/test_nlp_results.py b/tests/covid_symptom/test_nlp_results.py
index 309c7af0..2aecef57 100644
--- a/tests/covid_symptom/test_nlp_results.py
+++ b/tests/covid_symptom/test_nlp_results.py
@@ -10,7 +10,7 @@
 
 from tests.ctakesmock import CtakesMixin
 from tests import i2b2_mock_data
-from tests.etl.test_tasks import TaskTestCase
+from tests.etl import BaseEtlSimple, TaskTestCase
 
 
 @ddt.ddt
@@ -186,3 +186,18 @@ async def test_group_values_noted(self):
             self.codebook.db.resource_hash("zero-symptoms"),  # even without rows, it shows up in group list
         }
         self.assertEqual(expected_groups, second_batch.groups)
+
+
+class TestCovidSymptomEtl(BaseEtlSimple):
+    """Tests the end-to-end ETL of covid symptom tasks."""
+
+    DATA_ROOT = "covid"
+
+    async def test_basic_run(self):
+        await self.run_etl(tags=["covid_symptom"])
+        self.assert_output_equal()
+
+    async def test_term_exists_task(self):
+        # This one isn't even tagged for the study - we only want this upon request
+        await self.run_etl(tasks=["covid_symptom__nlp_results_term_exists"])
+        self.assert_output_equal("term-exists")
diff --git a/tests/ctakesmock.py b/tests/ctakesmock.py
index e830a60b..b68cdbb5 100644
--- a/tests/ctakesmock.py
+++ b/tests/ctakesmock.py
@@ -12,6 +12,7 @@
 from unittest import mock
 
 from ctakesclient import typesystem
+from ctakesclient.transformer import TransformerModel
 
 
 class CtakesMixin(unittest.TestCase):
@@ -233,7 +234,15 @@ def fake_ctakes_extract(sentence: str) -> typesystem.CtakesJSON:
     return typesystem.CtakesJSON(response)
 
 
-async def fake_transformer_list_polarity(sentence: str, spans: list[tuple], client=None) -> list[typesystem.Polarity]:
+async def fake_transformer_list_polarity(
+    sentence: str, spans: list[tuple], client=None, model=TransformerModel.NEGATION
+) -> list[typesystem.Polarity]:
     """Simple always-positive fake response from cNLP."""
     del sentence, client
-    return [typesystem.Polarity.pos] * len(spans)
+
+    # To better detect which model is in use, ensure a small difference between them
+    if model == TransformerModel.TERM_EXISTS:
+        # First span (if there is one) is negative; the slice keeps one polarity per span, even for no spans
+        return ([typesystem.Polarity.neg] + [typesystem.Polarity.pos] * len(spans))[: len(spans)]
+    else:
+        return [typesystem.Polarity.pos] * len(spans)
diff --git a/tests/data/covid/codebook.json b/tests/data/covid/codebook.json
new file mode 100644
index 00000000..92b91a01
--- /dev/null
+++ b/tests/data/covid/codebook.json
@@ -0,0 +1 @@
+{"version": 1, "id_salt": "4688a4853dafc6a3d6934f0dd02205be0700d2ca64b636127a4436494dcaf88e"}
\ No newline at end of file
diff --git a/tests/data/covid/input/DocumentReference.ndjson b/tests/data/covid/input/DocumentReference.ndjson
new file mode 100644
index 00000000..2fbbb29a
--- /dev/null
+++ b/tests/data/covid/input/DocumentReference.ndjson
@@ -0,0 +1,2 @@
+{"id":"43","content":[{"attachment":{"contentType":"text\/plain","data":"Tm90ZXMgZm9yIGZldmVy"}}],"context":{"encounter":[{"reference":"Encounter\/23"}],"period":{"end":"2021-06-24","start":"2021-06-23"}},"status":"current","subject":{"reference":"Patient\/334567"},"type":{"coding":[{"code":"NOTE:149798455","display":"Admission MD","system":"http://cumulus.smarthealthit.org/i2b2"}]},"resourceType":"DocumentReference"}
+{"id":"44","content":[{"attachment":{"contentType":"text\/plain","data":"Tm90ZXMhIGZvciBmZXZlcg=="}}],"context":{"encounter":[{"reference":"Encounter\/25"}],"period":{"end":"2021-06-25","start":"2021-06-24"}},"status":"current","subject":{"reference":"Patient\/323456"},"type":{"coding":[{"code":"NOTE:149798455","display":"Admission MD","system":"http://cumulus.smarthealthit.org/i2b2"}]},"resourceType":"DocumentReference"}
diff --git a/tests/data/covid/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.meta b/tests/data/covid/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.meta
new file mode 100644
index 00000000..7d1797dd
--- /dev/null
+++ b/tests/data/covid/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.meta
@@ -0,0 +1,6 @@
+{
+  "groups": [
+    "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d",
+    "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e"
+  ]
+}
\ No newline at end of file
diff --git a/tests/data/covid/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.ndjson b/tests/data/covid/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.ndjson
new file mode 100644
index 00000000..e2f5598b
--- /dev/null
+++ b/tests/data/covid/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.ndjson
@@ -0,0 +1,4 @@
+{"id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d.0", "docref_id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d", "encounter_id": "b3d0707624491d8b71a808bd20b63625981af48f526b95214146de2a15f7dd43", "subject_id": "00680c7c0e2e1712e9c4a01eb5c6dfb8949871faef6337c5db204d19e1d9ca58", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 3, "match": {"begin": 6, "end": 9, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "386661006", "cui": "C0015967", "codingScheme": "SNOMEDCT_US", "tui": "T184"}, {"code": "50177009", "cui": "C0015967", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
+{"id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d.1", "docref_id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d", "encounter_id": "b3d0707624491d8b71a808bd20b63625981af48f526b95214146de2a15f7dd43", "subject_id": "00680c7c0e2e1712e9c4a01eb5c6dfb8949871faef6337c5db204d19e1d9ca58", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 3, "match": {"begin": 6, "end": 9, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "422587007", "cui": "C0027497", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
+{"id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e.0", "docref_id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e", "encounter_id": "58a65c6cc5693a507af44f25f062171898aa6bc469766956b2c802d39fc6d4a7", "subject_id": "84cc1e7381070fda74a80df28a29323101be3b2c26b4d604abf43946ab1759f6", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 3, "match": {"begin": 7, "end": 10, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "386661006", "cui": "C0015967", "codingScheme": "SNOMEDCT_US", "tui": "T184"}, {"code": "50177009", "cui": "C0015967", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
+{"id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e.1", "docref_id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e", "encounter_id": "58a65c6cc5693a507af44f25f062171898aa6bc469766956b2c802d39fc6d4a7", "subject_id": "84cc1e7381070fda74a80df28a29323101be3b2c26b4d604abf43946ab1759f6", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 3, "match": {"begin": 7, "end": 10, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "422587007", "cui": "C0027497", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
diff --git a/tests/data/covid/term-exists/covid_symptom__nlp_results_term_exists/covid_symptom__nlp_results_term_exists.000.meta b/tests/data/covid/term-exists/covid_symptom__nlp_results_term_exists/covid_symptom__nlp_results_term_exists.000.meta
new file mode 100644
index 00000000..7d1797dd
--- /dev/null
+++ b/tests/data/covid/term-exists/covid_symptom__nlp_results_term_exists/covid_symptom__nlp_results_term_exists.000.meta
@@ -0,0 +1,6 @@
+{
+  "groups": [
+    "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d",
+    "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e"
+  ]
+}
\ No newline at end of file
diff --git a/tests/data/covid/term-exists/covid_symptom__nlp_results_term_exists/covid_symptom__nlp_results_term_exists.000.ndjson b/tests/data/covid/term-exists/covid_symptom__nlp_results_term_exists/covid_symptom__nlp_results_term_exists.000.ndjson
new file mode 100644
index 00000000..08e63d32
--- /dev/null
+++ b/tests/data/covid/term-exists/covid_symptom__nlp_results_term_exists/covid_symptom__nlp_results_term_exists.000.ndjson
@@ -0,0 +1,2 @@
+{"id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d.1", "docref_id": "c31a3dbf188ed241b2c06b2475cd56159017fa1df1ea882d3fc4beab860fc24d", "encounter_id": "b3d0707624491d8b71a808bd20b63625981af48f526b95214146de2a15f7dd43", "subject_id": "00680c7c0e2e1712e9c4a01eb5c6dfb8949871faef6337c5db204d19e1d9ca58", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 1, "match": {"begin": 6, "end": 9, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "422587007", "cui": "C0027497", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
+{"id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e.1", "docref_id": "eb30741bbb9395fc3da72d02fd29b96e2e4c0c2592c3ae997d80bf522c80070e", "encounter_id": "58a65c6cc5693a507af44f25f062171898aa6bc469766956b2c802d39fc6d4a7", "subject_id": "84cc1e7381070fda74a80df28a29323101be3b2c26b4d604abf43946ab1759f6", "generated_on": "2021-09-14T21:23:45+00:00", "task_version": 1, "match": {"begin": 7, "end": 10, "text": "for", "polarity": 0, "conceptAttributes": [{"code": "422587007", "cui": "C0027497", "codingScheme": "SNOMEDCT_US", "tui": "T184"}], "type": "SignSymptomMention"}}
diff --git a/tests/data/i2b2/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.meta b/tests/data/i2b2/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.meta
deleted file mode 100644
index 7c452c8e..00000000
--- a/tests/data/i2b2/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.meta
+++ /dev/null
@@ -1,6 +0,0 @@
-{
-  "groups": [
-    "228b982ddae20b8da26a212666995acde914b941a4ff7c314adf89d02c3831f0",
-    "dfc45702900136d5fb09b8737853f5c727132882bd6ba0871942685c0b1df588"
-  ]
-}
diff --git a/tests/data/i2b2/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.ndjson b/tests/data/i2b2/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.ndjson
deleted file mode 100644
index 289d2a0c..00000000
--- a/tests/data/i2b2/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.ndjson
+++ /dev/null
@@ -1,4 +0,0 @@
-{"id":"228b982ddae20b8da26a212666995acde914b941a4ff7c314adf89d02c3831f0.0","docref_id":"228b982ddae20b8da26a212666995acde914b941a4ff7c314adf89d02c3831f0","encounter_id":"5388b42b262276bfbcb659b1ff937b0e3e5b0ec8901ed3ad53fa387fd6f2589f","subject_id":"26f4d6d38eaa3347b8bd22bb4bc66ecbff5384926152738d282e841a247bfefb","generated_on":"2021-09-14T21:23:45+00:00","task_version":2,"match":{"begin":6,"end":9,"text":"for","polarity":0,"conceptAttributes":[{"code":"386661006","cui":"C0015967","codingScheme":"SNOMEDCT_US","tui":"T184"},{"code":"50177009","cui":"C0015967","codingScheme":"SNOMEDCT_US","tui":"T184"}],"type":"SignSymptomMention"}}
-{"id":"228b982ddae20b8da26a212666995acde914b941a4ff7c314adf89d02c3831f0.1","docref_id":"228b982ddae20b8da26a212666995acde914b941a4ff7c314adf89d02c3831f0","encounter_id":"5388b42b262276bfbcb659b1ff937b0e3e5b0ec8901ed3ad53fa387fd6f2589f","subject_id":"26f4d6d38eaa3347b8bd22bb4bc66ecbff5384926152738d282e841a247bfefb","generated_on":"2021-09-14T21:23:45+00:00","task_version":2,"match":{"begin":6,"end":9,"text":"for","polarity":0,"conceptAttributes":[{"code":"422587007","cui":"C0027497","codingScheme":"SNOMEDCT_US","tui":"T184"}],"type":"SignSymptomMention"}}
-{"id":"dfc45702900136d5fb09b8737853f5c727132882bd6ba0871942685c0b1df588.0","docref_id":"dfc45702900136d5fb09b8737853f5c727132882bd6ba0871942685c0b1df588","encounter_id":"fb29ea2a68ca2e1e4bbe22bdeedf021d94ec89f7e3d38ecbe908a8f2b3d89687","subject_id":"49fbb06b4b49eb49a096cf2a96674fb84a4d52ee74ec25c8f6f26023cb4764a7","generated_on":"2021-09-14T21:23:45+00:00","task_version":2,"match":{"begin":7,"end":10,"text":"for","polarity":0,"conceptAttributes":[{"code":"386661006","cui":"C0015967","codingScheme":"SNOMEDCT_US","tui":"T184"},{"code":"50177009","cui":"C0015967","codingScheme":"SNOMEDCT_US","tui":"T184"}],"type":"SignSymptomMention"}}
-{"id":"dfc45702900136d5fb09b8737853f5c727132882bd6ba0871942685c0b1df588.1","docref_id":"dfc45702900136d5fb09b8737853f5c727132882bd6ba0871942685c0b1df588","encounter_id":"fb29ea2a68ca2e1e4bbe22bdeedf021d94ec89f7e3d38ecbe908a8f2b3d89687","subject_id":"49fbb06b4b49eb49a096cf2a96674fb84a4d52ee74ec25c8f6f26023cb4764a7","generated_on":"2021-09-14T21:23:45+00:00","task_version":2,"match":{"begin":7,"end":10,"text":"for","polarity":0,"conceptAttributes":[{"code":"422587007","cui":"C0027497","codingScheme":"SNOMEDCT_US","tui":"T184"}],"type":"SignSymptomMention"}}
diff --git a/tests/data/simple/batched-output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.meta b/tests/data/simple/batched-output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.meta
deleted file mode 100644
index beafb7b5..00000000
--- a/tests/data/simple/batched-output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.meta
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-  "groups": [
-    "f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd"
-  ]
-}
diff --git a/tests/data/simple/batched-output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.ndjson b/tests/data/simple/batched-output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.ndjson
deleted file mode 100644
index ce565cd8..00000000
--- a/tests/data/simple/batched-output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.ndjson
+++ /dev/null
@@ -1,2 +0,0 @@
-{"id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd.0","docref_id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd","encounter_id":"d30aad4b-4503-8e22-0bc4-621b94398520","subject_id":"118dc10e-7745-20d7-e98d-7c358a84c15c","generated_on":"2021-09-14T21:23:45+00:00","task_version":2,"match":{"begin":6,"end":9,"text":"for","polarity":0,"conceptAttributes":[{"code":"386661006","cui":"C0015967","codingScheme":"SNOMEDCT_US","tui":"T184"},{"code":"50177009","cui":"C0015967","codingScheme":"SNOMEDCT_US","tui":"T184"}],"type":"SignSymptomMention"}}
-{"id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd.1","docref_id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd","encounter_id":"d30aad4b-4503-8e22-0bc4-621b94398520","subject_id":"118dc10e-7745-20d7-e98d-7c358a84c15c","generated_on":"2021-09-14T21:23:45+00:00","task_version":2,"match":{"begin":6,"end":9,"text":"for","polarity":0,"conceptAttributes":[{"code":"422587007","cui":"C0027497","codingScheme":"SNOMEDCT_US","tui":"T184"}],"type":"SignSymptomMention"}}
diff --git a/tests/data/simple/batched-output/covid_symptom__nlp_results/covid_symptom__nlp_results.001.meta b/tests/data/simple/batched-output/covid_symptom__nlp_results/covid_symptom__nlp_results.001.meta
deleted file mode 100644
index 7a55e17a..00000000
--- a/tests/data/simple/batched-output/covid_symptom__nlp_results/covid_symptom__nlp_results.001.meta
+++ /dev/null
@@ -1,5 +0,0 @@
-{
-  "groups": [
-    "c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971"
-  ]
-}
diff --git a/tests/data/simple/batched-output/covid_symptom__nlp_results/covid_symptom__nlp_results.001.ndjson b/tests/data/simple/batched-output/covid_symptom__nlp_results/covid_symptom__nlp_results.001.ndjson
deleted file mode 100644
index 3b014925..00000000
--- a/tests/data/simple/batched-output/covid_symptom__nlp_results/covid_symptom__nlp_results.001.ndjson
+++ /dev/null
@@ -1,2 +0,0 @@
-{"id":"c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971.0","docref_id":"c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971","encounter_id":"af1e6186-3f9a-1fa9-3c73-cfa56c84a056","subject_id":"1de9ea66-70d3-da1f-c735-df5ef7697fb9","generated_on":"2021-09-14T21:23:45+00:00","task_version":2,"match":{"begin":7,"end":10,"text":"for","polarity":0,"conceptAttributes":[{"code":"386661006","cui":"C0015967","codingScheme":"SNOMEDCT_US","tui":"T184"},{"code":"50177009","cui":"C0015967","codingScheme":"SNOMEDCT_US","tui":"T184"}],"type":"SignSymptomMention"}}
-{"id":"c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971.1","docref_id":"c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971","encounter_id":"af1e6186-3f9a-1fa9-3c73-cfa56c84a056","subject_id":"1de9ea66-70d3-da1f-c735-df5ef7697fb9","generated_on":"2021-09-14T21:23:45+00:00","task_version":2,"match":{"begin":7,"end":10,"text":"for","polarity":0,"conceptAttributes":[{"code":"422587007","cui":"C0027497","codingScheme":"SNOMEDCT_US","tui":"T184"}],"type":"SignSymptomMention"}}
diff --git a/tests/data/simple/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.meta b/tests/data/simple/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.meta
deleted file mode 100644
index bb7a31b0..00000000
--- a/tests/data/simple/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.meta
+++ /dev/null
@@ -1,6 +0,0 @@
-{
-  "groups": [
-    "c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971",
-    "f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd"
-  ]
-}
diff --git a/tests/data/simple/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.ndjson b/tests/data/simple/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.ndjson
deleted file mode 100644
index 0d9514a0..00000000
--- a/tests/data/simple/output/covid_symptom__nlp_results/covid_symptom__nlp_results.000.ndjson
+++ /dev/null
@@ -1,4 +0,0 @@
-{"id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd.0","docref_id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd","encounter_id":"d30aad4b-4503-8e22-0bc4-621b94398520","subject_id":"118dc10e-7745-20d7-e98d-7c358a84c15c","generated_on":"2021-09-14T21:23:45+00:00","task_version":2,"match":{"begin":6,"end":9,"text":"for","polarity":0,"conceptAttributes":[{"code":"386661006","cui":"C0015967","codingScheme":"SNOMEDCT_US","tui":"T184"},{"code":"50177009","cui":"C0015967","codingScheme":"SNOMEDCT_US","tui":"T184"}],"type":"SignSymptomMention"}}
-{"id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd.1","docref_id":"f29736c29af5b962b3947fd40bed6b8c3e97c642b72aaa08e082fec05148e7dd","encounter_id":"d30aad4b-4503-8e22-0bc4-621b94398520","subject_id":"118dc10e-7745-20d7-e98d-7c358a84c15c","generated_on":"2021-09-14T21:23:45+00:00","task_version":2,"match":{"begin":6,"end":9,"text":"for","polarity":0,"conceptAttributes":[{"code":"422587007","cui":"C0027497","codingScheme":"SNOMEDCT_US","tui":"T184"}],"type":"SignSymptomMention"}}
-{"id":"c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971.0","docref_id":"c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971","encounter_id":"af1e6186-3f9a-1fa9-3c73-cfa56c84a056","subject_id":"1de9ea66-70d3-da1f-c735-df5ef7697fb9","generated_on":"2021-09-14T21:23:45+00:00","task_version":2,"match":{"begin":7,"end":10,"text":"for","polarity":0,"conceptAttributes":[{"code":"386661006","cui":"C0015967","codingScheme":"SNOMEDCT_US","tui":"T184"},{"code":"50177009","cui":"C0015967","codingScheme":"SNOMEDCT_US","tui":"T184"}],"type":"SignSymptomMention"}}
-{"id":"c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971.1","docref_id":"c601849ceffe49dba22ee952533ac87928cd7a472dee6d0390d53c9130519971","encounter_id":"af1e6186-3f9a-1fa9-3c73-cfa56c84a056","subject_id":"1de9ea66-70d3-da1f-c735-df5ef7697fb9","generated_on":"2021-09-14T21:23:45+00:00","task_version":2,"match":{"begin":7,"end":10,"text":"for","polarity":0,"conceptAttributes":[{"code":"422587007","cui":"C0027497","codingScheme":"SNOMEDCT_US","tui":"T184"}],"type":"SignSymptomMention"}}
diff --git a/tests/etl/__init__.py b/tests/etl/__init__.py
index e69de29b..e75f0d26 100644
--- a/tests/etl/__init__.py
+++ b/tests/etl/__init__.py
@@ -0,0 +1,3 @@
+"""Support code for ETL-based test cases."""
+
+from .base import BaseEtlSimple, TaskTestCase
diff --git a/tests/etl/base.py b/tests/etl/base.py
new file mode 100644
index 00000000..71171f88
--- /dev/null
+++ b/tests/etl/base.py
@@ -0,0 +1,154 @@
+"""Base classes for ETL-oriented tests"""
+
+import os
+import shutil
+import tempfile
+from unittest import mock
+
+import pytest
+
+from cumulus_etl import cli, common, deid, fhir
+from cumulus_etl.etl.config import JobConfig
+from tests import ctakesmock, utils
+
+
+@pytest.mark.skipif(not shutil.which(deid.MSTOOL_CMD), reason="MS tool not installed")
+class BaseEtlSimple(ctakesmock.CtakesMixin, utils.TreeCompareMixin, utils.AsyncTestCase):
+    """
+    Base test case for basic runs of etl methods
+
+    Subclasses may want to override DATA_ROOT (or just self.input_path) to point at their own input data.
+
+    Don't put actual tests in here, but rather in subclasses.
+    """
+
+    # Subclasses may want to override this with a folder that has input/, output/, and a codebook.json
+    DATA_ROOT = "simple"
+
+    def setUp(self):
+        super().setUp()
+
+        self.root_path = os.path.join(self.datadir, self.DATA_ROOT)
+        self.input_path = os.path.join(self.root_path, "input")
+
+        tmpdir = tempfile.mkdtemp()
+        # Comment out this next line when debugging, to persist directory
+        self.addCleanup(shutil.rmtree, tmpdir)
+
+        self.output_path = os.path.join(tmpdir, "output")
+        self.phi_path = os.path.join(tmpdir, "phi")
+
+        self.enforce_consistent_uuids()
+
+    async def run_etl(
+        self,
+        input_path=None,
+        output_path=None,
+        phi_path=None,
+        output_format: str | None = "ndjson",
+        comment=None,
+        batch_size=None,
+        tasks=None,
+        tags: list[str] = None,
+        philter=True,
+        errors_to=None,
+        export_to: str = None,
+        input_format: str = "ndjson",
+    ) -> None:
+        args = [
+            input_path or self.input_path,
+            output_path or self.output_path,
+            phi_path or self.phi_path,
+            "--skip-init-checks",
+            f"--input-format={input_format}",
+            f"--ctakes-overrides={self.ctakes_overrides.name}",
+        ]
+        if output_format:
+            args.append(f"--output-format={output_format}")
+        if comment:
+            args.append(f"--comment={comment}")
+        if batch_size:
+            args.append(f"--batch-size={batch_size}")
+        if tasks:
+            args.append(f'--task={",".join(tasks)}')
+        if tags:
+            args.append(f'--task-filter={",".join(tags)}')
+        if philter:
+            args.append("--philter")
+        if export_to:
+            args.append(f"--export-to={export_to}")
+        if errors_to:
+            args.append(f"--errors-to={errors_to}")
+        await cli.main(args)
+
+    def enforce_consistent_uuids(self):
+        """Make sure that UUIDs will be the same from run to run"""
+        # First, copy codebook over. This will help ensure that the order of
+        # calls doesn't matter as much. If *every* UUID were recorded in the
+        # codebook, this is all we'd need to do.
+        os.makedirs(self.phi_path)
+        shutil.copy(os.path.join(self.root_path, "codebook.json"), self.phi_path)
+
+    def assert_output_equal(self, folder: str = "output"):
+        """Compares the etl output with the expected json structure"""
+        self.assert_etl_output_equal(os.path.join(self.root_path, folder), self.output_path)
+
+
+class TaskTestCase(utils.AsyncTestCase):
+    """Base class for task-focused test suites"""
+
+    def setUp(self) -> None:
+        super().setUp()
+
+        client = fhir.FhirClient("http://localhost/", [])
+        self.tmpdir = tempfile.TemporaryDirectory()  # pylint: disable=consider-using-with
+        self.input_dir = os.path.join(self.tmpdir.name, "input")
+        self.phi_dir = os.path.join(self.tmpdir.name, "phi")
+        self.errors_dir = os.path.join(self.tmpdir.name, "errors")
+        os.makedirs(self.input_dir)
+        os.makedirs(self.phi_dir)
+
+        self.job_config = JobConfig(
+            self.input_dir,
+            self.input_dir,
+            self.tmpdir.name,
+            self.phi_dir,
+            "ndjson",
+            "ndjson",
+            client,
+            batch_size=5,
+            dir_errors=self.errors_dir,
+        )
+
+        def make_formatter(dbname: str, group_field: str = None, resource_type: str = None):
+            formatter = mock.MagicMock(dbname=dbname, group_field=group_field, resource_type=resource_type)
+            self.format_count += 1
+            if self.format_count == 1:
+                self.format = self.format or formatter
+                return self.format
+            elif self.format_count == 2:
+                self.format2 = self.format2 or formatter
+                return self.format2
+            else:
+                return formatter  # stop keeping track
+
+        self.format = None
+        self.format2 = None  # for tasks that have multiple output streams
+        self.format_count = 0
+        self.create_formatter_mock = mock.MagicMock(side_effect=make_formatter)
+        self.job_config.create_formatter = self.create_formatter_mock
+
+        self.scrubber = deid.Scrubber()
+        self.codebook = self.scrubber.codebook
+
+        # Keeps consistent IDs
+        shutil.copy(os.path.join(self.datadir, "simple/codebook.json"), self.phi_dir)
+
+    def tearDown(self) -> None:
+        super().tearDown()
+        self.tmpdir.cleanup()  # delete explicitly; a dropped reference is only cleaned up implicitly (ResourceWarning)
+
+    def make_json(self, filename, resource_id, **kwargs):
+        common.write_json(
+            os.path.join(self.input_dir, f"{filename}.ndjson"), {"resourceType": "Test", **kwargs, "id": resource_id}
+        )
diff --git a/tests/etl/test_etl_cli.py b/tests/etl/test_etl_cli.py
index ef601330..8946461e 100644
--- a/tests/etl/test_etl_cli.py
+++ b/tests/etl/test_etl_cli.py
@@ -4,89 +4,18 @@
 import json
 import os
 import shutil
-import tempfile
 from unittest import mock
 
 import ddt
-import pytest
 from ctakesclient.typesystem import Polarity
 
-from cumulus_etl import cli, common, deid, errors, loaders, store
+from cumulus_etl import common, errors, loaders, store
 from cumulus_etl.etl import context
 
-from tests.ctakesmock import CtakesMixin, fake_ctakes_extract
+from tests.ctakesmock import fake_ctakes_extract
+from tests.etl import BaseEtlSimple
 from tests.s3mock import S3Mixin
-from tests.utils import FROZEN_TIME_UTC, AsyncTestCase, TreeCompareMixin, read_delta_lake
-
-
-@pytest.mark.skipif(not shutil.which(deid.MSTOOL_CMD), reason="MS tool not installed")
-class BaseEtlSimple(CtakesMixin, TreeCompareMixin, AsyncTestCase):
-    """
-    Base test case for basic runs of etl methods
-
-    Don't put actual tests in here, but rather in subclasses below.
-    """
-
-    def setUp(self):
-        super().setUp()
-
-        self.data_dir = os.path.join(self.datadir, "simple")
-        self.input_path = os.path.join(self.data_dir, "input")
-
-        tmpdir = tempfile.mkdtemp()
-        # Comment out this next line when debugging, to persist directory
-        self.addCleanup(shutil.rmtree, tmpdir)
-
-        self.output_path = os.path.join(tmpdir, "output")
-        self.phi_path = os.path.join(tmpdir, "phi")
-
-        self.enforce_consistent_uuids()
-
-    async def run_etl(
-        self,
-        input_path=None,
-        output_path=None,
-        phi_path=None,
-        output_format: str | None = "ndjson",
-        comment=None,
-        batch_size=None,
-        tasks=None,
-        philter=True,
-        errors_to=None,
-    ) -> None:
-        args = [
-            input_path or self.input_path,
-            output_path or self.output_path,
-            phi_path or self.phi_path,
-            "--skip-init-checks",
-            "--input-format=ndjson",
-            f"--ctakes-overrides={self.ctakes_overrides.name}",
-        ]
-        if output_format:
-            args.append(f"--output-format={output_format}")
-        if comment:
-            args.append(f"--comment={comment}")
-        if batch_size:
-            args.append(f"--batch-size={batch_size}")
-        if tasks:
-            args.append(f'--task={",".join(tasks)}')
-        if philter:
-            args.append("--philter")
-        if errors_to:
-            args.append(f"--errors-to={errors_to}")
-        await cli.main(args)
-
-    def enforce_consistent_uuids(self):
-        """Make sure that UUIDs will be the same from run to run"""
-        # First, copy codebook over. This will help ensure that the order of
-        # calls doesn't matter as much. If *every* UUID were recorded in the
-        # codebook, this is all we'd need to do.
-        os.makedirs(self.phi_path)
-        shutil.copy(os.path.join(self.data_dir, "codebook.json"), self.phi_path)
-
-    def assert_output_equal(self, folder: str):
-        """Compares the etl output with the expected json structure"""
-        self.assert_etl_output_equal(os.path.join(self.data_dir, folder), self.output_path)
+from tests.utils import FROZEN_TIME_UTC, read_delta_lake
 
 
 @ddt.ddt
@@ -274,7 +203,7 @@ class TestEtlFormats(BaseEtlSimple):
 
     async def test_etl_job_ndjson(self):
         await self.run_etl()
-        self.assert_output_equal("output")
+        self.assert_output_equal()
 
     async def test_etl_job_deltalake(self):
         await self.run_etl(output_format=None)  # deltalake should be default output format
@@ -335,8 +264,6 @@ async def test_etl_job_s3(self):
                 "mockbucket/root/patient/patient.000.ndjson",
                 "mockbucket/root/procedure/procedure.000.ndjson",
                 "mockbucket/root/servicerequest/servicerequest.000.ndjson",
-                "mockbucket/root/covid_symptom__nlp_results/covid_symptom__nlp_results.000.ndjson",
-                "mockbucket/root/covid_symptom__nlp_results/covid_symptom__nlp_results.000.meta",
             },
             all_files,
         )
@@ -349,6 +276,8 @@ async def test_etl_job_s3(self):
 class TestEtlNlp(BaseEtlSimple):
     """Test case for the cTAKES/cNLP responses"""
 
+    CACHE_FOLDER = "covid_symptom_v3"
+
     def setUp(self):
         super().setUp()
         # sha256 checksums of the two test patient notes
@@ -379,13 +308,13 @@ async def test_stores_cached_json(self):
 
         for index, checksum in enumerate(self.expected_checksums):
             ner = fake_ctakes_extract(facts[index])
-            self.assertEqual(ner.as_json(), common.read_json(self.path_for_checksum("covid_symptom_v2", checksum)))
-            self.assertEqual([0, 0], common.read_json(self.path_for_checksum("covid_symptom_v2-cnlp_v2", checksum)))
+            self.assertEqual(ner.as_json(), common.read_json(self.path_for_checksum(self.CACHE_FOLDER, checksum)))
+            self.assertEqual([0, 0], common.read_json(self.path_for_checksum(f"{self.CACHE_FOLDER}-cnlp_v2", checksum)))
 
     async def test_does_not_hit_server_if_cache_exists(self):
         for index, checksum in enumerate(self.expected_checksums):
             # Write out some fake results to the cache location
-            filename = self.path_for_checksum("covid_symptom_v2", checksum)
+            filename = self.path_for_checksum(self.CACHE_FOLDER, checksum)
             os.makedirs(os.path.dirname(filename))
             common.write_json(
                 filename,
@@ -405,7 +334,7 @@ async def test_does_not_hit_server_if_cache_exists(self):
                 },
             )
 
-            cnlp_filename = self.path_for_checksum("covid_symptom_v2-cnlp_v2", checksum)
+            cnlp_filename = self.path_for_checksum(f"{self.CACHE_FOLDER}-cnlp_v2", checksum)
             os.makedirs(os.path.dirname(cnlp_filename))
             common.write_json(cnlp_filename, [0])
 
diff --git a/tests/etl/test_tasks.py b/tests/etl/test_tasks.py
index 75700f06..aed3f9a7 100644
--- a/tests/etl/test_tasks.py
+++ b/tests/etl/test_tasks.py
@@ -1,78 +1,15 @@
 """Tests for etl/tasks/"""
 
 import os
-import shutil
-import tempfile
 from unittest import mock
 
 import ddt
 import pyarrow
 
-from cumulus_etl import common, deid, errors, fhir
-from cumulus_etl.etl import config, tasks
+from cumulus_etl import common, errors
+from cumulus_etl.etl import tasks
 from cumulus_etl.etl.tasks import basic_tasks
-
-from tests.utils import AsyncTestCase
-
-
-class TaskTestCase(AsyncTestCase):
-    """Base class for task-focused test suites"""
-
-    def setUp(self) -> None:
-        super().setUp()
-
-        client = fhir.FhirClient("http://localhost/", [])
-        self.tmpdir = tempfile.TemporaryDirectory()  # pylint: disable=consider-using-with
-        self.input_dir = os.path.join(self.tmpdir.name, "input")
-        self.phi_dir = os.path.join(self.tmpdir.name, "phi")
-        self.errors_dir = os.path.join(self.tmpdir.name, "errors")
-        os.makedirs(self.input_dir)
-        os.makedirs(self.phi_dir)
-
-        self.job_config = config.JobConfig(
-            self.input_dir,
-            self.input_dir,
-            self.tmpdir.name,
-            self.phi_dir,
-            "ndjson",
-            "ndjson",
-            client,
-            batch_size=5,
-            dir_errors=self.errors_dir,
-        )
-
-        def make_formatter(dbname: str, group_field: str = None, resource_type: str = None):
-            formatter = mock.MagicMock(dbname=dbname, group_field=group_field, resource_type=resource_type)
-            self.format_count += 1
-            if self.format_count == 1:
-                self.format = self.format or formatter
-                return self.format
-            elif self.format_count == 2:
-                self.format2 = self.format2 or formatter
-                return self.format2
-            else:
-                return formatter  # stop keeping track
-
-        self.format = None
-        self.format2 = None  # for tasks that have multiple output streams
-        self.format_count = 0
-        self.create_formatter_mock = mock.MagicMock(side_effect=make_formatter)
-        self.job_config.create_formatter = self.create_formatter_mock
-
-        self.scrubber = deid.Scrubber()
-        self.codebook = self.scrubber.codebook
-
-        # Keeps consistent IDs
-        shutil.copy(os.path.join(self.datadir, "simple/codebook.json"), self.phi_dir)
-
-    def tearDown(self) -> None:
-        super().tearDown()
-        self.tmpdir = None
-
-    def make_json(self, filename, resource_id, **kwargs):
-        common.write_json(
-            os.path.join(self.input_dir, f"{filename}.ndjson"), {"resourceType": "Test", **kwargs, "id": resource_id}
-        )
+from tests.etl import TaskTestCase
 
 
 @ddt.ddt
diff --git a/tests/i2b2/test_i2b2_etl.py b/tests/i2b2/test_i2b2_etl.py
index fdd85b12..49b9710e 100644
--- a/tests/i2b2/test_i2b2_etl.py
+++ b/tests/i2b2/test_i2b2_etl.py
@@ -2,70 +2,27 @@
 
 import filecmp
 import os
-import shutil
 import tempfile
 
-import pytest
+from tests.etl import BaseEtlSimple
 
-from cumulus_etl import cli, deid
 
-from tests.ctakesmock import CtakesMixin
-from tests.utils import AsyncTestCase, TreeCompareMixin
-
-
-@pytest.mark.skipif(not shutil.which(deid.MSTOOL_CMD), reason="MS tool not installed")
-class TestI2b2Etl(CtakesMixin, TreeCompareMixin, AsyncTestCase):
+class TestI2b2Etl(BaseEtlSimple):
     """
     Base test case for basic runs of etl methods against i2b2 data
     """
 
-    def setUp(self):
-        super().setUp()
-
-        i2b2_dir = os.path.join(self.datadir, "i2b2")
-        self.input_path = os.path.join(i2b2_dir, "input")
-        self.expected_output_path = os.path.join(i2b2_dir, "output")
-        self.expected_export_path = os.path.join(i2b2_dir, "export")
-
-        tmpdir = tempfile.mkdtemp()
-        # Comment out this next line when debugging, to persist directory
-        self.addCleanup(shutil.rmtree, tmpdir)
-
-        self.output_path = os.path.join(tmpdir, "output")
-        self.phi_path = os.path.join(tmpdir, "phi")
-
-        # Copy the codebook over, to guarantee the same ID mappings run-to-run
-        os.makedirs(self.phi_path)
-        shutil.copy(os.path.join(i2b2_dir, "codebook.json"), self.phi_path)
+    DATA_ROOT = "i2b2"
 
     async def test_full_etl(self):
-        await cli.main(
-            [
-                self.input_path,
-                self.output_path,
-                self.phi_path,
-                "--skip-init-checks",
-                "--input-format=i2b2",
-                "--output-format=ndjson",
-                f"--ctakes-overrides={self.ctakes_overrides.name}",
-            ]
-        )
-        self.assert_etl_output_equal(self.expected_output_path, self.output_path)
+        await self.run_etl(input_format="i2b2", philter=False)
+        self.assert_output_equal()
 
     async def test_export(self):
         with tempfile.TemporaryDirectory() as export_path:
-            await cli.main(
-                [
-                    self.input_path,
-                    self.output_path,
-                    self.phi_path,
-                    "--skip-init-checks",
-                    "--input-format=i2b2",
-                    "--output-format=ndjson",
-                    f"--export-to={export_path}",
-                    "--task=patient",  # just to make the test faster and confirm we don't export unnecessary files
-                ]
-            )
+            # Only run patient task to make the test faster and confirm we don't export unnecessary files
+            await self.run_etl(input_format="i2b2", export_to=export_path, tasks=["patient"], philter=False)
 
-            dircmp = filecmp.dircmp(export_path, self.expected_export_path)
+            expected_export_path = os.path.join(self.datadir, self.DATA_ROOT, "export")
+            dircmp = filecmp.dircmp(export_path, expected_export_path)
             self.assert_file_tree_equal(dircmp)