Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 27 additions & 5 deletions gcm/health_checks/checks/check_dcgmi.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,20 @@ def get_nvlink_status_report(
return piped_shell_command(cmd, timeout_secs)


def _get_test_status(test: dict) -> str:
"""Get the overall status from a test result.

DCGM 4.x uses test_summary.status for aggregated results.
DCGM 3.x uses results[0].status.
"""
if "test_summary" in test and "status" in test["test_summary"]:
return test["test_summary"]["status"]
# Fallback for DCGM 3.x format
if test.get("results") and len(test["results"]) > 0:
return test["results"][0].get("status", "")
return ""


def process_dcgmi_diag_output(
output: str, error_code: int, exclude_category: List[str]
) -> Tuple[ExitCode, str]:
Expand All @@ -208,25 +222,33 @@ def process_dcgmi_diag_output(
return ExitCode.WARN, "dcgmi diag FAILED to execute.\n"

output_dict = json.loads(output)
# DCGM 3.x uses "DCGM GPU Diagnostic", DCGM 4.x uses "DCGM Diagnostic"
diag_key = None
for key in ["DCGM GPU Diagnostic", "DCGM Diagnostic"]:
if key in output_dict:
diag_key = key
break

if (
len(output_dict) == 0
or "DCGM GPU Diagnostic" not in output_dict
or "test_categories" not in output_dict["DCGM GPU Diagnostic"]
or diag_key is None
or "test_categories" not in output_dict[diag_key]
):
return ExitCode.WARN, "dcgmi diag FAILED to execute.\n"

msg: str = ""
exit_code: ExitCode = ExitCode.OK

for category in output_dict["DCGM GPU Diagnostic"]["test_categories"]:
for category in output_dict[diag_key]["test_categories"]:
for test in category["tests"]:
if test["name"] in exclude_category:
continue

if test["results"][0]["status"] == "Fail":
status = _get_test_status(test)
if status == "Fail":
msg += f"{test['name']} failed.\n"
exit_code = ExitCode.CRITICAL
elif test["results"][0]["status"] == "Warn":
elif status == "Warn":
msg += f"{test['name']} warning.\n"
if exit_code < ExitCode.WARN:
exit_code = ExitCode.WARN
Expand Down
16 changes: 16 additions & 0 deletions gcm/tests/health_checks_tests/test_check_dcgmi.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,17 @@ def dcgmi_shell_command_tester(
0,
'{"DCGM GPU Diagnostic": {"test_categories": [{"category": "Deployment", "tests": [{"name": "Blacklist", "results": [{"status": "Pass"}]}, {"name": "NVML Library", "results": [{"status": "Pass"}]}, {"name": "CUDA Main Library", "results": [{"status": "Pass"}]}, {"name": "Permissions and OS Blocks", "results": [{"status": "Pass"}]}, {"name": "Persistence Mode", "results": [{"status": "Pass"}]}, {"name": "Environment Variables", "results": [{"status": "Pass"}]}, {"name": "Page Retirement/Row Remap", "results": [{"status": "Pass"}]}, {"name": "Graphics Processes", "results": [{"status": "Pass"}]}, {"name": "Inforom", "results": [{"status": "Pass"}]}]}]}}',
)
# DCGM 4.x uses "DCGM Diagnostic" instead of "DCGM GPU Diagnostic" and has test_summary
diag_pass_output_v4 = FakeShellCommandOut(
[],
0,
'{"DCGM Diagnostic": {"test_categories": [{"category": "Deployment", "tests": [{"name": "software", "results": [{"entity_group": "GPU", "entity_group_id": 1, "entity_id": 0, "status": "Pass"}], "test_summary": {"status": "Pass"}}]}]}}',
)
diag_fail_output_v4 = FakeShellCommandOut(
[],
0,
'{"DCGM Diagnostic": {"test_categories": [{"category": "Deployment", "tests": [{"name": "software", "results": [{"entity_group": "GPU", "entity_group_id": 1, "entity_id": 0, "status": "Fail"}], "test_summary": {"status": "Fail"}}]}]}}',
)
diag_fail_output = FakeShellCommandOut(
[],
0,
Expand All @@ -77,13 +88,18 @@ def dcgmi_shell_command_tester(
"dcgmi_shell_command_tester, expected",
[
(diag_pass_output, (ExitCode.OK, "All checks passed")),
(diag_pass_output_v4, (ExitCode.OK, "All checks passed")), # DCGM 4.x format
(
diag_fail_output,
(
ExitCode.CRITICAL,
"Persistence Mode failed.\nEnvironment Variables warning",
),
),
(
diag_fail_output_v4,
(ExitCode.CRITICAL, "software failed"),
), # DCGM 4.x fail format
(diag_warn_output, (ExitCode.WARN, "Persistence Mode warning")),
(empty_output, (ExitCode.WARN, "dcgmi diag FAILED to execute")),
(
Expand Down