diff --git a/gcm/health_checks/checks/check_dcgmi.py b/gcm/health_checks/checks/check_dcgmi.py index ac28743..4a05883 100644 --- a/gcm/health_checks/checks/check_dcgmi.py +++ b/gcm/health_checks/checks/check_dcgmi.py @@ -196,6 +196,20 @@ def get_nvlink_status_report( return piped_shell_command(cmd, timeout_secs) +def _get_test_status(test: dict) -> str: + """Get the overall status from a test result. + + DCGM 4.x uses test_summary.status for aggregated results. + DCGM 3.x uses results[0].status. + """ + if "test_summary" in test and "status" in test["test_summary"]: + return test["test_summary"]["status"] + # Fallback for DCGM 3.x format + if test.get("results") and len(test["results"]) > 0: + return test["results"][0].get("status", "") + return "" + + def process_dcgmi_diag_output( output: str, error_code: int, exclude_category: List[str] ) -> Tuple[ExitCode, str]: @@ -208,25 +222,33 @@ def process_dcgmi_diag_output( return ExitCode.WARN, "dcgmi diag FAILED to execute.\n" output_dict = json.loads(output) + # DCGM 3.x uses "DCGM GPU Diagnostic", DCGM 4.x uses "DCGM Diagnostic" + diag_key = None + for key in ["DCGM GPU Diagnostic", "DCGM Diagnostic"]: + if key in output_dict: + diag_key = key + break + if ( len(output_dict) == 0 - or "DCGM GPU Diagnostic" not in output_dict - or "test_categories" not in output_dict["DCGM GPU Diagnostic"] + or diag_key is None + or "test_categories" not in output_dict[diag_key] ): return ExitCode.WARN, "dcgmi diag FAILED to execute.\n" msg: str = "" exit_code: ExitCode = ExitCode.OK - for category in output_dict["DCGM GPU Diagnostic"]["test_categories"]: + for category in output_dict[diag_key]["test_categories"]: for test in category["tests"]: if test["name"] in exclude_category: continue - if test["results"][0]["status"] == "Fail": + status = _get_test_status(test) + if status == "Fail": msg += f"{test['name']} failed.\n" exit_code = ExitCode.CRITICAL - elif test["results"][0]["status"] == "Warn": + elif status == "Warn": msg += f"{test['name']} warning.\n" if exit_code < ExitCode.WARN: exit_code = ExitCode.WARN diff --git a/gcm/tests/health_checks_tests/test_check_dcgmi.py b/gcm/tests/health_checks_tests/test_check_dcgmi.py index 51d8d87..97e59ca 100644 --- a/gcm/tests/health_checks_tests/test_check_dcgmi.py +++ b/gcm/tests/health_checks_tests/test_check_dcgmi.py @@ -54,6 +54,17 @@ def dcgmi_shell_command_tester( 0, '{"DCGM GPU Diagnostic": {"test_categories": [{"category": "Deployment", "tests": [{"name": "Blacklist", "results": [{"status": "Pass"}]}, {"name": "NVML Library", "results": [{"status": "Pass"}]}, {"name": "CUDA Main Library", "results": [{"status": "Pass"}]}, {"name": "Permissions and OS Blocks", "results": [{"status": "Pass"}]}, {"name": "Persistence Mode", "results": [{"status": "Pass"}]}, {"name": "Environment Variables", "results": [{"status": "Pass"}]}, {"name": "Page Retirement/Row Remap", "results": [{"status": "Pass"}]}, {"name": "Graphics Processes", "results": [{"status": "Pass"}]}, {"name": "Inforom", "results": [{"status": "Pass"}]}]}]}}', ) +# DCGM 4.x uses "DCGM Diagnostic" instead of "DCGM GPU Diagnostic" and has test_summary +diag_pass_output_v4 = FakeShellCommandOut( + [], + 0, + '{"DCGM Diagnostic": {"test_categories": [{"category": "Deployment", "tests": [{"name": "software", "results": [{"entity_group": "GPU", "entity_group_id": 1, "entity_id": 0, "status": "Pass"}], "test_summary": {"status": "Pass"}}]}]}}', +) +diag_fail_output_v4 = FakeShellCommandOut( + [], + 0, + '{"DCGM Diagnostic": {"test_categories": [{"category": "Deployment", "tests": [{"name": "software", "results": [{"entity_group": "GPU", "entity_group_id": 1, "entity_id": 0, "status": "Fail"}], "test_summary": {"status": "Fail"}}]}]}}', +) diag_fail_output = FakeShellCommandOut( [], 0, @@ -77,6 +88,7 @@ def dcgmi_shell_command_tester( "dcgmi_shell_command_tester, expected", [ (diag_pass_output, (ExitCode.OK, "All checks passed")), + (diag_pass_output_v4, (ExitCode.OK, "All checks passed")), # DCGM 4.x format ( diag_fail_output, ( @@ -84,6 +96,10 @@ def dcgmi_shell_command_tester( "Persistence Mode failed.\nEnvironment Variables warning", ), ), + ( + diag_fail_output_v4, + (ExitCode.CRITICAL, "software failed"), + ), # DCGM 4.x fail format (diag_warn_output, (ExitCode.WARN, "Persistence Mode warning")), (empty_output, (ExitCode.WARN, "dcgmi diag FAILED to execute")), (