diff --git a/cwltool/cuda.py b/cwltool/cuda.py
index 719bfd867..2fd884aed 100644
--- a/cwltool/cuda.py
+++ b/cwltool/cuda.py
@@ -8,23 +8,48 @@
 from .utils import CWLObjectType
 
 
+def cuda_device_count() -> str:
+    """Determine the number of attached CUDA GPUs."""
+    # For the number of GPUs, we can use the following query
+    cmd = ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader"]
+    try:
+        # This is equivalent to subprocess.check_output, but use
+        # subprocess.run so we can use separate MagicMocks in test_cuda.py
+        proc = subprocess.run(cmd, stdout=subprocess.PIPE, check=True)  # nosec
+    except Exception as e:
+        _logger.warning("Error checking number of GPUs with nvidia-smi: %s", e)
+        return "0"
+    # NOTE: On a machine with N GPUs the query return N lines, each containing N.
+    return proc.stdout.decode("utf-8").split("\n")[0]
+
+
 def cuda_version_and_device_count() -> Tuple[str, int]:
     """Determine the CUDA version and number of attached CUDA GPUs."""
+    count = int(cuda_device_count())
+
+    # Since there is no specific query for the cuda version, we have to use
+    # `nvidia-smi -q -x`
+    # However, apparently nvidia-smi is not safe to call concurrently.
+    # With --parallel, sometimes the returned XML will contain
+    # <process_name>\xff...\xff</process_name>
+    # (or other arbitrary bytes) and xml.dom.minidom.parseString will raise
+    # "xml.parsers.expat.ExpatError: not well-formed (invalid token)"
+    # So we either need to use `grep -v process_name` to blacklist that tag,
+    # (and hope that no other tags cause problems in the future)
+    # or better yet use `grep cuda_version` to only grab the tags we will use.
+    cmd = "nvidia-smi -q -x | grep cuda_version"
     try:
-        out = subprocess.check_output(["nvidia-smi", "-q", "-x"])  # nosec
+        out = subprocess.check_output(cmd, shell=True)  # nosec
     except Exception as e:
         _logger.warning("Error checking CUDA version with nvidia-smi: %s", e)
         return ("", 0)
-    dm = xml.dom.minidom.parseString(out)  # nosec
 
-    ag = dm.getElementsByTagName("attached_gpus")
-    if len(ag) < 1 or ag[0].firstChild is None:
-        _logger.warning(
-            "Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty.: %s",
-            out,
-        )
+    try:
+        dm = xml.dom.minidom.parseString(out)  # nosec
+    except xml.parsers.expat.ExpatError as e:
+        _logger.warning("Error parsing XML stdout of nvidia-smi: %s", e)
+        _logger.warning("stdout: %s", out)
         return ("", 0)
-    ag_element = ag[0].firstChild
 
     cv = dm.getElementsByTagName("cuda_version")
     if len(cv) < 1 or cv[0].firstChild is None:
@@ -35,13 +60,10 @@ def cuda_version_and_device_count() -> Tuple[str, int]:
         return ("", 0)
     cv_element = cv[0].firstChild
 
-    if isinstance(cv_element, xml.dom.minidom.Text) and isinstance(
-        ag_element, xml.dom.minidom.Text
-    ):
-        return (cv_element.data, int(ag_element.data))
+    if isinstance(cv_element, xml.dom.minidom.Text):
+        return (cv_element.data, count)
     _logger.warning(
-        "Error checking CUDA version with nvidia-smi. "
-        "Either 'attached_gpus' or 'cuda_version' was not a text node: %s",
+        "Error checking CUDA version with nvidia-smi. 'cuda_version' was not a text node: %s",
         out,
     )
     return ("", 0)
diff --git a/tests/test_cuda.py b/tests/test_cuda.py
index e8de7cd63..b737538a1 100644
--- a/tests/test_cuda.py
+++ b/tests/test_cuda.py
@@ -90,8 +90,11 @@ def _makebuilder(cudaReq: CWLObjectType) -> Builder:
 
 
 @mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
-def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> None:
+def test_cuda_job_setup_check(
+    makedirs: MagicMock, count: MagicMock, check_output: MagicMock
+) -> None:
     runtime_context = RuntimeContext({})
 
     cudaReq: CWLObjectType = {
@@ -101,11 +104,9 @@ def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> N
     }
     builder = _makebuilder(cudaReq)
 
+    count.return_value = "1"
     check_output.return_value = """
-<nvidia>
-<attached_gpus>1</attached_gpus>
 <cuda_version>1.0</cuda_version>
-</nvidia>
 """
 
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
@@ -113,62 +114,33 @@ def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> N
 
 
 @mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
-def test_cuda_job_setup_check_err(makedirs: MagicMock, check_output: MagicMock) -> None:
-    runtime_context = RuntimeContext({})
-
-    cudaReq: CWLObjectType = {
-        "class": "http://commonwl.org/cwltool#CUDARequirement",
-        "cudaVersionMin": "2.0",
-        "cudaComputeCapability": "1.0",
-    }
-    builder = _makebuilder(cudaReq)
-
-    check_output.return_value = """
-<nvidia>
-<attached_gpus>1</attached_gpus>
-<cuda_version>1.0</cuda_version>
-</nvidia>
-"""
-    jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
-    with pytest.raises(WorkflowException):
-        jb._setup(runtime_context)
-
-
-@mock.patch("subprocess.check_output")
-@mock.patch("os.makedirs")
-def test_cuda_job_setup_check_err_empty_attached_gpus(
-    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+def test_cuda_job_setup_check_err(
+    makedirs: MagicMock, count: MagicMock, check_output: MagicMock
 ) -> None:
     runtime_context = RuntimeContext({})
 
     cudaReq: CWLObjectType = {
         "class": "http://commonwl.org/cwltool#CUDARequirement",
-        "cudaVersionMin": "1.0",
+        "cudaVersionMin": "2.0",
         "cudaComputeCapability": "1.0",
     }
     builder = _makebuilder(cudaReq)
 
+    count.return_value = "1"
     check_output.return_value = """
-<nvidia>
-<attached_gpus></attached_gpus>
 <cuda_version>1.0</cuda_version>
-</nvidia>
 """
-
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
     with pytest.raises(WorkflowException):
         jb._setup(runtime_context)
-    assert (
-        "Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty."
-        in caplog.text
-    )
 
 
-@mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
-def test_cuda_job_setup_check_err_empty_missing_attached_gpus(
-    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+def test_cuda_job_setup_check_err_missing_query_gpu_count(
+    makedirs: MagicMock, count: MagicMock, caplog: pytest.LogCaptureFixture
 ) -> None:
     runtime_context = RuntimeContext({})
 
@@ -179,25 +151,19 @@ def test_cuda_job_setup_check_err_empty_missing_attached_gpus(
     }
     builder = _makebuilder(cudaReq)
 
-    check_output.return_value = """
-<nvidia>
-<cuda_version>1.0</cuda_version>
-</nvidia>
-"""
+    count.return_value = ""
 
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
     with pytest.raises(WorkflowException):
         jb._setup(runtime_context)
-    assert (
-        "Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty."
-        in caplog.text
-    )
+    assert "invalid literal for int() with base 10" in caplog.text
 
 
 @mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
 def test_cuda_job_setup_check_err_empty_cuda_version(
-    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+    makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
 ) -> None:
     runtime_context = RuntimeContext({})
 
@@ -208,11 +174,9 @@ def test_cuda_job_setup_check_err_empty_cuda_version(
     }
     builder = _makebuilder(cudaReq)
 
+    count.return_value = "1"
     check_output.return_value = """
-<nvidia>
-<attached_gpus>1</attached_gpus>
 <cuda_version></cuda_version>
-</nvidia>
 """
 
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
@@ -225,9 +189,10 @@ def test_cuda_job_setup_check_err_empty_cuda_version(
 
 
 @mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
 def test_cuda_job_setup_check_err_missing_cuda_version(
-    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+    makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
 ) -> None:
     runtime_context = RuntimeContext({})
 
@@ -238,25 +203,20 @@ def test_cuda_job_setup_check_err_missing_cuda_version(
     }
     builder = _makebuilder(cudaReq)
 
-    check_output.return_value = """
-<nvidia>
-<attached_gpus>1</attached_gpus>
-</nvidia>
-"""
+    count.return_value = "1"
+    check_output.return_value = ""
 
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
     with pytest.raises(WorkflowException):
         jb._setup(runtime_context)
-    assert (
-        "Error checking CUDA version with nvidia-smi. Missing 'cuda_version' or it is empty."
-        in caplog.text
-    )
+    assert "Error parsing XML stdout of nvidia-smi" in caplog.text
 
 
 @mock.patch("subprocess.check_output")
+@mock.patch("cwltool.cuda.cuda_device_count")
 @mock.patch("os.makedirs")
 def test_cuda_job_setup_check_err_wrong_type_cuda_version(
-    makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
+    makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture
 ) -> None:
     runtime_context = RuntimeContext({})
 
@@ -267,19 +227,17 @@ def test_cuda_job_setup_check_err_wrong_type_cuda_version(
     }
     builder = _makebuilder(cudaReq)
 
+    count.return_value = "1"
     check_output.return_value = """
-<nvidia>
-<attached_gpus>1</attached_gpus>
 <cuda_version><subelement /></cuda_version>
-</nvidia>
 """
 
     jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "")
     with pytest.raises(WorkflowException):
         jb._setup(runtime_context)
     assert (
-        "Error checking CUDA version with nvidia-smi. "
-        "Either 'attached_gpus' or 'cuda_version' was not a text node" in caplog.text
+        "Error checking CUDA version with nvidia-smi. 'cuda_version' was not a text node"
+        in caplog.text
     )
 
 
diff --git a/tests/wf/nvidia-smi-container.cwl b/tests/wf/nvidia-smi-container.cwl
index 84fd72d83..40d6fba98 100644
--- a/tests/wf/nvidia-smi-container.cwl
+++ b/tests/wf/nvidia-smi-container.cwl
@@ -7,7 +7,7 @@ requirements:
     cudaVersionMin: "1.0"
     cudaComputeCapability: "1.0"
   DockerRequirement:
-    dockerPull: "nvidia/cuda:11.4.2-runtime-ubuntu20.04"
+    dockerPull: "nvidia/cuda:11.4.3-runtime-ubuntu20.04"
 inputs: []
 outputs: []
 # Assume this will exit non-zero (resulting in a failing test case) if