From d3f89d3d90454aafb95c6bafeddcc1e4292ba2b2 Mon Sep 17 00:00:00 2001 From: Jake Fennick Date: Tue, 19 Sep 2023 11:23:24 -0600 Subject: [PATCH 1/2] Filter out malformed nvidia-smi process_name XML tag --- cwltool/cuda.py | 51 +++++++++++----- tests/test_cuda.py | 98 +++++++++---------------------- tests/wf/nvidia-smi-container.cwl | 2 +- 3 files changed, 65 insertions(+), 86 deletions(-) diff --git a/cwltool/cuda.py b/cwltool/cuda.py index 719bfd867..7217434e6 100644 --- a/cwltool/cuda.py +++ b/cwltool/cuda.py @@ -8,23 +8,47 @@ from .utils import CWLObjectType +def cuda_device_count() -> str: + """Determine the number of attached CUDA GPUs.""" + # For the number of GPUs, we can use the following query + cmd = ["nvidia-smi", "--query-gpu=count", "--format=csv,noheader"] + try: + # This is equivalent to subprocess.check_output, but use + # subprocess.run so we can use separate MagicMocks in test_cuda.py + proc = subprocess.run(cmd, stdout=subprocess.PIPE, check=True) # nosec + except Exception as e: + _logger.warning("Error checking number of GPUs with nvidia-smi: %s", e) + return "0" + return proc.stdout.decode("utf-8") + + def cuda_version_and_device_count() -> Tuple[str, int]: """Determine the CUDA version and number of attached CUDA GPUs.""" + count = int(cuda_device_count()) + + # Since there is no specific query for the cuda version, we have to use + # `nvidia-smi -q -x` + # However, apparently nvidia-smi is not safe to call concurrently. + # With --parallel, sometimes the returned XML will contain + # \xff...\xff + # (or other arbitrary bytes) and xml.dom.minidom.parseString will raise + # "xml.parsers.expat.ExpatError: not well-formed (invalid token)" + # So we either need to use `grep -v process_name` to blacklist that tag, + # (and hope that no other tags cause problems in the future) + # or better yet use `grep cuda_version` to only grab the tags we will use. + cmd = "nvidia-smi -q -x | grep cuda_version" try: - out = subprocess.check_output(["nvidia-smi", "-q", "-x"]) # nosec + out = subprocess.check_output(cmd, shell=True) # nosec except Exception as e: _logger.warning("Error checking CUDA version with nvidia-smi: %s", e) return ("", 0) - dm = xml.dom.minidom.parseString(out) # nosec - ag = dm.getElementsByTagName("attached_gpus") - if len(ag) < 1 or ag[0].firstChild is None: - _logger.warning( - "Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty.: %s", - out, - ) + try: + dm = xml.dom.minidom.parseString(out) # nosec + except xml.parsers.expat.ExpatError as e: + _logger.warning("Error parsing XML stdout of nvidia-smi: %s", e) + _logger.warning("stdout: %s", out) return ("", 0) - ag_element = ag[0].firstChild cv = dm.getElementsByTagName("cuda_version") if len(cv) < 1 or cv[0].firstChild is None: @@ -35,13 +59,10 @@ def cuda_version_and_device_count() -> Tuple[str, int]: return ("", 0) cv_element = cv[0].firstChild - if isinstance(cv_element, xml.dom.minidom.Text) and isinstance( - ag_element, xml.dom.minidom.Text - ): - return (cv_element.data, int(ag_element.data)) + if isinstance(cv_element, xml.dom.minidom.Text): + return (cv_element.data, count) _logger.warning( - "Error checking CUDA version with nvidia-smi. " - "Either 'attached_gpus' or 'cuda_version' was not a text node: %s", + "Error checking CUDA version with nvidia-smi. 'cuda_version' was not a text node: %s", out, ) return ("", 0) diff --git a/tests/test_cuda.py b/tests/test_cuda.py index e8de7cd63..b737538a1 100644 --- a/tests/test_cuda.py +++ b/tests/test_cuda.py @@ -90,8 +90,11 @@ def _makebuilder(cudaReq: CWLObjectType) -> Builder: @mock.patch("subprocess.check_output") +@mock.patch("cwltool.cuda.cuda_device_count") @mock.patch("os.makedirs") -def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> None: +def test_cuda_job_setup_check( + makedirs: MagicMock, count: MagicMock, check_output: MagicMock +) -> None: runtime_context = RuntimeContext({}) cudaReq: CWLObjectType = { @@ -101,11 +104,9 @@ def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> N } builder = _makebuilder(cudaReq) + count.return_value = "1" check_output.return_value = """ - -1 1.0 - """ jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "") @@ -113,62 +114,33 @@ def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> N @mock.patch("subprocess.check_output") +@mock.patch("cwltool.cuda.cuda_device_count") @mock.patch("os.makedirs") -def test_cuda_job_setup_check_err(makedirs: MagicMock, check_output: MagicMock) -> None: - runtime_context = RuntimeContext({}) - - cudaReq: CWLObjectType = { - "class": "http://commonwl.org/cwltool#CUDARequirement", - "cudaVersionMin": "2.0", - "cudaComputeCapability": "1.0", - } - builder = _makebuilder(cudaReq) - - check_output.return_value = """ - -1 -1.0 - -""" - jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "") - with pytest.raises(WorkflowException): - jb._setup(runtime_context) - - -@mock.patch("subprocess.check_output") -@mock.patch("os.makedirs") -def test_cuda_job_setup_check_err_empty_attached_gpus( - makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture +def test_cuda_job_setup_check_err( + makedirs: MagicMock, count: MagicMock, check_output: MagicMock ) -> None: runtime_context = RuntimeContext({}) cudaReq: CWLObjectType = { "class": "http://commonwl.org/cwltool#CUDARequirement", - "cudaVersionMin": "1.0", + "cudaVersionMin": "2.0", "cudaComputeCapability": "1.0", } builder = _makebuilder(cudaReq) + count.return_value = "1" check_output.return_value = """ - - 1.0 - """ - jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "") with pytest.raises(WorkflowException): jb._setup(runtime_context) - assert ( - "Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty." - in caplog.text - ) -@mock.patch("subprocess.check_output") +@mock.patch("cwltool.cuda.cuda_device_count") @mock.patch("os.makedirs") -def test_cuda_job_setup_check_err_empty_missing_attached_gpus( - makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture +def test_cuda_job_setup_check_err_missing_query_gpu_count( + makedirs: MagicMock, count: MagicMock, caplog: pytest.LogCaptureFixture ) -> None: runtime_context = RuntimeContext({}) @@ -179,25 +151,19 @@ def test_cuda_job_setup_check_err_empty_missing_attached_gpus( } builder = _makebuilder(cudaReq) - check_output.return_value = """ - -1.0 - -""" + count.return_value = "" jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "") with pytest.raises(WorkflowException): jb._setup(runtime_context) - assert ( - "Error checking CUDA version with nvidia-smi. Missing 'attached_gpus' or it is empty." - in caplog.text - ) + assert "invalid literal for int() with base 10" in caplog.text @mock.patch("subprocess.check_output") +@mock.patch("cwltool.cuda.cuda_device_count") @mock.patch("os.makedirs") def test_cuda_job_setup_check_err_empty_cuda_version( - makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture + makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture ) -> None: runtime_context = RuntimeContext({}) @@ -208,11 +174,9 @@ def test_cuda_job_setup_check_err_empty_cuda_version( } builder = _makebuilder(cudaReq) + count.return_value = "1" check_output.return_value = """ - -1 - """ jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "") @@ -225,9 +189,10 @@ def test_cuda_job_setup_check_err_empty_cuda_version( @mock.patch("subprocess.check_output") +@mock.patch("cwltool.cuda.cuda_device_count") @mock.patch("os.makedirs") def test_cuda_job_setup_check_err_missing_cuda_version( - makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture + makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture ) -> None: runtime_context = RuntimeContext({}) @@ -238,25 +203,20 @@ def test_cuda_job_setup_check_err_missing_cuda_version( } builder = _makebuilder(cudaReq) - check_output.return_value = """ - -1 - -""" + count.return_value = "1" + check_output.return_value = "" jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "") with pytest.raises(WorkflowException): jb._setup(runtime_context) - assert ( - "Error checking CUDA version with nvidia-smi. Missing 'cuda_version' or it is empty." - in caplog.text - ) + assert "Error parsing XML stdout of nvidia-smi" in caplog.text @mock.patch("subprocess.check_output") +@mock.patch("cwltool.cuda.cuda_device_count") @mock.patch("os.makedirs") def test_cuda_job_setup_check_err_wrong_type_cuda_version( - makedirs: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture + makedirs: MagicMock, count: MagicMock, check_output: MagicMock, caplog: pytest.LogCaptureFixture ) -> None: runtime_context = RuntimeContext({}) @@ -267,19 +227,17 @@ def test_cuda_job_setup_check_err_wrong_type_cuda_version( } builder = _makebuilder(cudaReq) + count.return_value = "1" check_output.return_value = """ - -1 - """ jb = CommandLineJob(builder, {}, CommandLineTool.make_path_mapper, [], [], "") with pytest.raises(WorkflowException): jb._setup(runtime_context) assert ( - "Error checking CUDA version with nvidia-smi. " - "Either 'attached_gpus' or 'cuda_version' was not a text node" in caplog.text + "Error checking CUDA version with nvidia-smi. 'cuda_version' was not a text node" + in caplog.text ) diff --git a/tests/wf/nvidia-smi-container.cwl b/tests/wf/nvidia-smi-container.cwl index 84fd72d83..40d6fba98 100644 --- a/tests/wf/nvidia-smi-container.cwl +++ b/tests/wf/nvidia-smi-container.cwl @@ -7,7 +7,7 @@ requirements: cudaVersionMin: "1.0" cudaComputeCapability: "1.0" DockerRequirement: - dockerPull: "nvidia/cuda:11.4.2-runtime-ubuntu20.04" + dockerPull: "nvidia/cuda:11.4.3-runtime-ubuntu20.04" inputs: [] outputs: [] # Assume this will exit non-zero (resulting in a failing test case) if From 239ce5a5c9085748d486e886231508c9568492dc Mon Sep 17 00:00:00 2001 From: Jake Fennick Date: Tue, 3 Oct 2023 13:18:49 -0600 Subject: [PATCH 2/2] Only need first line of --query-gpu=count --- cwltool/cuda.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cwltool/cuda.py b/cwltool/cuda.py index 7217434e6..2fd884aed 100644 --- a/cwltool/cuda.py +++ b/cwltool/cuda.py @@ -19,7 +19,8 @@ def cuda_device_count() -> str: except Exception as e: _logger.warning("Error checking number of GPUs with nvidia-smi: %s", e) return "0" - return proc.stdout.decode("utf-8") + # NOTE: On a machine with N GPUs the query return N lines, each containing N. + return proc.stdout.decode("utf-8").split("\n")[0] def cuda_version_and_device_count() -> Tuple[str, int]: