From ddf5d4fd7e29d36fe7dcb133fef79fbd066d2a81 Mon Sep 17 00:00:00 2001
From: Michael Terry
Date: Fri, 5 Jul 2024 14:50:25 -0400
Subject: [PATCH] tests: add some CLI tests to improve coverage

---
 .github/workflows/ci.yaml             |  2 +-
 cumulus_etl/cli.py                    |  4 +--
 tests/etl/base.py                     |  4 ++-
 tests/etl/test_etl_cli.py             | 14 ++++++++
 tests/test_cli.py                     | 29 ++++++++++++++++++++++++++----
 tests/upload_notes/test_upload_cli.py | 49 +++++++++++++++++++++++++--
 6 files changed, 93 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 81645751..8b5a186a 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -78,7 +78,7 @@ jobs:
         uses: docker/setup-buildx-action@v3
 
       - name: Build ETL image
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v6
         with:
           load: true  # just build, no push
          tags: smartonfhir/cumulus-etl:latest
diff --git a/cumulus_etl/cli.py b/cumulus_etl/cli.py
index 865e6dd5..c5a1654b 100644
--- a/cumulus_etl/cli.py
+++ b/cumulus_etl/cli.py
@@ -80,8 +80,8 @@ async def main(argv: list[str]) -> None:
 
 
 def main_cli():
-    asyncio.run(main(sys.argv[1:]))
+    asyncio.run(main(sys.argv[1:]))  # pragma: no cover
 
 
 if __name__ == "__main__":
-    main_cli()
+    main_cli()  # pragma: no cover
diff --git a/tests/etl/base.py b/tests/etl/base.py
index 85647ac2..c19addac 100644
--- a/tests/etl/base.py
+++ b/tests/etl/base.py
@@ -58,15 +58,17 @@ async def run_etl(
         export_group: str = "test-group",
         export_timestamp: str = "2020-10-13T12:00:20-05:00",
         write_completion: bool = True,
+        skip_init_checks: bool = True,
     ) -> None:
         args = [
             input_path or self.input_path,
             output_path or self.output_path,
             phi_path or self.phi_path,
-            "--skip-init-checks",
             f"--input-format={input_format}",
             f"--ctakes-overrides={self.ctakes_overrides.name}",
         ]
+        if skip_init_checks:
+            args.append("--skip-init-checks")
         if export_group is not None:
             args.append(f"--export-group={export_group}")
         if export_timestamp:
diff --git a/tests/etl/test_etl_cli.py b/tests/etl/test_etl_cli.py
index 7927e153..da5149cf 100644
--- a/tests/etl/test_etl_cli.py
+++ b/tests/etl/test_etl_cli.py
@@ -174,6 +174,20 @@ async def test_bulk_no_auth(self):
             await self.run_etl(input_path="https://localhost:12345/", tasks=["patient"])
         self.assertEqual(errors.BULK_EXPORT_FAILED, cm.exception.code)
 
+    async def test_no_ms_tool(self):
+        """Verify that we require the MS tool to be in PATH."""
+        self.patch_dict(os.environ, {"PATH": "/nothing-here"})
+        with self.assertRaises(SystemExit) as cm:
+            await self.run_etl(skip_init_checks=False)
+        self.assertEqual(cm.exception.code, errors.MSTOOL_MISSING)
+
+    @mock.patch("cumulus_etl.etl.tasks.basic_tasks.ProcedureTask.init_check")
+    async def test_task_init_checks(self, mock_check):
+        """Verify that we check task requirements."""
+        mock_check.side_effect = ZeroDivisionError
+        with self.assertRaises(ZeroDivisionError):
+            await self.run_etl(skip_init_checks=False)
+
     @ddt.data(
         # First line is CLI args
         # Second line is what loader will represent as the group/time
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 90c9b08f..b6a41f9b 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -1,12 +1,13 @@
-"""Tests for cli.py"""
+"""Tests for cli.py and cli_utils.py"""
 
 import contextlib
 import io
+import socket
 from unittest import mock
 
 import ddt
 
-from cumulus_etl import cli
+from cumulus_etl import cli, cli_utils
 
 from tests.utils import AsyncTestCase
 
@@ -47,3 +48,27 @@ async def test_routing(self, argv, main_method):
             await cli.main(argv)
 
         self.assertTrue(mock_main.called)
+
+
+class TestCumulusCLIUtils(AsyncTestCase):
+    """
+    Unit tests for our CLI helpers.
+    """
+
+    async def test_url_is_unresolvable(self):
+        """Verify that a hostname that doesn't exist fails immediately."""
+        # First test that the plumbing is real and works
+        self.assertFalse(cli_utils.is_url_available("http://nope.invalid"))
+
+        # Now mock the plumbing to confirm we don't retry
+        with mock.patch("socket.create_connection") as mock_conn:
+            mock_conn.side_effect = socket.gaierror
+            self.assertFalse(cli_utils.is_url_available("http://nope.invalid"))
+            self.assertEqual(mock_conn.call_count, 1)
+
+    @mock.patch("socket.create_connection", side_effect=ConnectionRefusedError)
+    @mock.patch("time.sleep", new=lambda x: None)  # don't sleep during retries
+    async def test_url_is_not_ready(self, mock_conn):
+        """Verify that a refused connection is retried."""
+        self.assertFalse(cli_utils.is_url_available("http://nope.invalid"))
+        self.assertEqual(mock_conn.call_count, 6)
diff --git a/tests/upload_notes/test_upload_cli.py b/tests/upload_notes/test_upload_cli.py
index 1ec949c1..e638e3d8 100644
--- a/tests/upload_notes/test_upload_cli.py
+++ b/tests/upload_notes/test_upload_cli.py
@@ -81,6 +81,7 @@ async def run_upload_notes(
         philter=None,
         no_philter=None,
         overwrite=False,
+        skip_init_checks=True,
     ) -> None:
         args = [
             "upload-notes",
@@ -92,8 +93,9 @@
             f"--export-to={self.export_path}",
             "--ls-project=21",
             f"--ls-token={self.token_path}",
-            "--skip-init-checks",
         ]
+        if skip_init_checks:
+            args += ["--skip-init-checks"]
         if anon_docrefs:
             args += ["--anon-docrefs", anon_docrefs]
         if docrefs:
@@ -160,8 +162,10 @@ def mock_search_url(respx_mock: respx.MockRouter, patient: str, doc_ids: Iterabl
         respx_mock.get(f"https://localhost/DocumentReference?patient={patient}&_elements=content").respond(json=bundle)
 
     @staticmethod
-    def mock_read_url(respx_mock: respx.MockRouter, doc_id: str, code: int = 200, **kwargs) -> None:
-        docref = TestUploadNotes.make_docref(doc_id, **kwargs)
+    def mock_read_url(
+        respx_mock: respx.MockRouter, doc_id: str, code: int = 200, docref: dict = None, **kwargs
+    ) -> None:
+        docref = docref or TestUploadNotes.make_docref(doc_id, **kwargs)
         respx_mock.get(f"https://localhost/DocumentReference/{doc_id}").respond(status_code=code, json=docref)
 
     @staticmethod
@@ -451,3 +455,42 @@ async def test_grouped_encounter_offsets(self):
         self.assertEqual("doc?", note.text[match2c[0] : match2c[1]])
         spans = {x.span().key() for x in note.ctakes_matches}
         self.assertEqual({match1a, match1b, match1c, match2a, match2b, match2c}, spans)
+
+    async def test_docrefs_without_encounters_are_skipped(self):
+        with tempfile.TemporaryDirectory() as tmpdir:
+            with common.NdjsonWriter(f"{tmpdir}/DocumentReference.ndjson") as writer:
+                # One docref without any context/encounter info
+                docref1 = self.make_docref("D1")
+                del docref1["context"]
+                writer.write(docref1)
+
+                # And one with all the normal goodies
+                writer.write(self.make_docref("D2"))
+
+            await self.run_upload_notes(input_path=tmpdir)
+
+        self.assertEqual({"D1", "D2"}, self.get_exported_ids())
+        self.assertEqual({"D2"}, self.get_pushed_ids())
+
+    @mock.patch("cumulus_etl.nlp.restart_ctakes_with_bsv", new=lambda *_: False)
+    async def test_nlp_restart_failure_is_noticed(self):
+        with self.assertRaises(SystemExit) as cm:
+            await self.run_upload_notes()
+        self.assertEqual(cm.exception.code, errors.CTAKES_OVERRIDES_INVALID)
+
+    @mock.patch("cumulus_etl.nlp.check_ctakes")
+    @mock.patch("cumulus_etl.nlp.check_negation_cnlpt")
@mock.patch("cumulus_etl.cli_utils.is_url_available") + async def test_init_checks(self, mock_url, mock_cnlpt, mock_ctakes): + # Start with error case for our URL check (against label studio) + mock_url.return_value = False + with self.assertRaises(SystemExit) as cm: + await self.run_upload_notes(skip_init_checks=False) + self.assertEqual(mock_ctakes.call_count, 1) + self.assertEqual(mock_cnlpt.call_count, 1) + self.assertEqual(mock_url.call_count, 1) + self.assertEqual(cm.exception.code, errors.LABEL_STUDIO_MISSING) + + # Let the URL pass and confirm we now run successfully + mock_url.return_value = True + await self.run_upload_notes(skip_init_checks=False)