From 419598fbe62e3479d85aba6e1e0f0e82ff5d797b Mon Sep 17 00:00:00 2001 From: "yongl@yongl-login-0.yongl-login.tenant-slurm.svc.cluster.local" <> Date: Mon, 2 Feb 2026 18:14:54 +0000 Subject: [PATCH 1/4] [sdiag] Add sdiag telemetry --- gcm/monitoring/slurm/client.py | 88 ++- gcm/schemas/slurm/sdiag.py | 24 + gcm/tests/data/sample-sdiag-output.json | 925 ++++++++++++++++++++++++ gcm/tests/test_slurm.py | 109 ++- 4 files changed, 1137 insertions(+), 9 deletions(-) create mode 100644 gcm/tests/data/sample-sdiag-output.json diff --git a/gcm/monitoring/slurm/client.py b/gcm/monitoring/slurm/client.py index ce42cc6..b8c1a2b 100644 --- a/gcm/monitoring/slurm/client.py +++ b/gcm/monitoring/slurm/client.py @@ -177,15 +177,39 @@ def sdiag_structured(self) -> Sdiag: sdiag_output = json.loads( subprocess.check_output(["sdiag", "--all", "--json"], text=True) ) - - return Sdiag( - server_thread_count=sdiag_output["statistics"]["server_thread_count"], - agent_queue_size=sdiag_output["statistics"]["agent_queue_size"], - agent_count=sdiag_output["statistics"]["agent_count"], - agent_thread_count=sdiag_output["statistics"]["agent_thread_count"], - dbd_agent_queue_size=sdiag_output["statistics"]["dbd_agent_queue_size"], + stats = sdiag_output["statistics"] + + result = Sdiag( + server_thread_count=stats.get("server_thread_count"), + agent_queue_size=stats.get("agent_queue_size"), + agent_count=stats.get("agent_count"), + agent_thread_count=stats.get("agent_thread_count"), + dbd_agent_queue_size=stats.get("dbd_agent_queue_size"), + schedule_cycle_max=stats.get("schedule_cycle_max"), + schedule_cycle_mean=stats.get("schedule_cycle_mean"), + schedule_cycle_sum=stats.get("schedule_cycle_sum"), + schedule_cycle_total=stats.get("schedule_cycle_total"), + schedule_cycle_per_minute=stats.get("schedule_cycle_per_minute"), + schedule_queue_length=stats.get("schedule_queue_length"), + sdiag_jobs_submitted=stats.get("jobs_submitted"), + sdiag_jobs_started=stats.get("jobs_started"), + sdiag_jobs_completed=stats.get("jobs_completed"), + sdiag_jobs_canceled=stats.get("jobs_canceled"), + sdiag_jobs_failed=stats.get("jobs_failed"), + sdiag_jobs_pending=stats.get("jobs_pending"), + sdiag_jobs_running=stats.get("jobs_running"), + bf_backfilled_jobs=stats.get("bf_backfilled_jobs"), + bf_cycle_mean=stats.get("bf_cycle_mean"), + bf_cycle_sum=stats.get("bf_cycle_sum"), + bf_cycle_max=stats.get("bf_cycle_max"), + bf_queue_len=stats.get("bf_queue_len"), ) + # Reset sdiag counters after collection + self._reset_sdiag_counters() + + return result + sdiag_output = subprocess.check_output(["sdiag", "--all"], text=True) metric_names = { "Server thread count:": "server_thread_count", @@ -194,7 +218,7 @@ def sdiag_structured(self) -> Sdiag: "Agent thread count:": "agent_thread_count", "DBD Agent queue size:": "dbd_agent_queue_size", } - data = { + data: dict[str, Optional[int]] = { "server_thread_count": 0, "agent_queue_size": 0, "agent_count": 0, @@ -206,8 +230,56 @@ def sdiag_structured(self) -> Sdiag: lines = re.search(rf".*{sdiag_name}.*", sdiag_output) assert lines is not None, f"Sdiag metric {sdiag_name} not found: {lines}" data[name] = int(lines.group().strip(f"{sdiag_name}")) + + optional_metric_names = { + "Schedule cycle max:": "schedule_cycle_max", + "Schedule cycle mean:": "schedule_cycle_mean", + "Schedule cycle sum:": "schedule_cycle_sum", + "Schedule cycle total:": "schedule_cycle_total", + "Schedule cycle per minute:": "schedule_cycle_per_minute", + "Schedule queue length:": "schedule_queue_length", + "Jobs submitted:": "sdiag_jobs_submitted", + "Jobs started:": "sdiag_jobs_started", + "Jobs completed:": "sdiag_jobs_completed", + "Jobs canceled:": "sdiag_jobs_canceled", + "Jobs failed:": "sdiag_jobs_failed", + "Jobs pending:": "sdiag_jobs_pending", + "Jobs running:": "sdiag_jobs_running", + "Total backfilled jobs \\(since last slurm start\\):": "bf_backfilled_jobs", + "Backfill cycle mean:": "bf_cycle_mean", + "Backfill cycle sum:": "bf_cycle_sum", + "Backfill cycle max:": "bf_cycle_max", + "Backfill queue length:": "bf_queue_len", + } + + for sdiag_name, name in optional_metric_names.items(): + match = re.search(rf"{sdiag_name}\s*(\d+)", sdiag_output) + if match: + data[name] = int(match.group(1)) + else: + data[name] = None + + # Reset sdiag counters after collection + self._reset_sdiag_counters() + return Sdiag(**data) + def _reset_sdiag_counters(self) -> None: + """Reset sdiag counters after collection. + + This requires appropriate permissions (typically root or SlurmUser). + If the reset fails due to permission issues, a warning is logged. + """ + try: + subprocess.run( + ["sdiag", "--reset"], + check=True, + capture_output=True, + text=True, + ) + except subprocess.CalledProcessError as e: + logger.warning(f"Failed to reset sdiag counters: {e.stderr.strip()}") + def sinfo_structured(self) -> Sinfo: fieldnames = [f.name for f in fields(SinfoRow)] diff --git a/gcm/schemas/slurm/sdiag.py b/gcm/schemas/slurm/sdiag.py index 60aaecd..7703713 100644 --- a/gcm/schemas/slurm/sdiag.py +++ b/gcm/schemas/slurm/sdiag.py @@ -11,3 +11,27 @@ class Sdiag: agent_count: Optional[int] agent_thread_count: Optional[int] dbd_agent_queue_size: Optional[int] + + # Schedule cycle statistics + schedule_cycle_max: Optional[int] = None + schedule_cycle_mean: Optional[int] = None + schedule_cycle_sum: Optional[int] = None + schedule_cycle_total: Optional[int] = None + schedule_cycle_per_minute: Optional[int] = None + schedule_queue_length: Optional[int] = None + + # Job statistics (prefixed with sdiag_ to avoid collision with SLURMLog) + sdiag_jobs_submitted: Optional[int] = None + sdiag_jobs_started: Optional[int] = None + sdiag_jobs_completed: Optional[int] = None + sdiag_jobs_canceled: Optional[int] = None + sdiag_jobs_failed: Optional[int] = None + sdiag_jobs_pending: Optional[int] = None + sdiag_jobs_running: Optional[int] = None + + # Backfill statistics + bf_backfilled_jobs: Optional[int] = None + bf_cycle_mean: Optional[int] = None + bf_cycle_sum: Optional[int] = None + bf_cycle_max: Optional[int] = None + bf_queue_len: Optional[int] = None diff --git a/gcm/tests/data/sample-sdiag-output.json b/gcm/tests/data/sample-sdiag-output.json new file mode 100644 index 0000000..8e1ef3b --- /dev/null +++ b/gcm/tests/data/sample-sdiag-output.json @@ -0,0 +1,925 @@ + +{ + "statistics": { + "parts_packed": 1, + "req_time": { + "set": true, + "infinite": false, + "number": 1769652127 + }, + "req_time_start": { + "set": true, + "infinite": false, + "number": 1769650551 + }, + "server_thread_count": 4, + "agent_queue_size": 5, + "agent_count": 3, + "agent_thread_count": 8, + "dbd_agent_queue_size": 2, + "gettimeofday_latency": 26, + "schedule_cycle_max": 2788800, + "schedule_cycle_last": 2258381, + "schedule_cycle_sum": 582130236, + "schedule_cycle_total": 335, + "schedule_cycle_mean": 1737702, + "schedule_cycle_mean_depth": 28, + "schedule_cycle_per_minute": 12, + "schedule_cycle_depth": 9466, + "schedule_exit": { + "end_job_queue": 54, + "default_queue_depth": 0, + "max_job_start": 0, + "max_rpc_cnt": 0, + "max_sched_time": 281, + "licenses": 0 + }, + "schedule_queue_length": 407, + "jobs_submitted": 504, + "jobs_started": 579, + "jobs_completed": 524, + "jobs_canceled": 20, + "jobs_failed": 0, + "jobs_pending": 20725, + "jobs_running": 3273, + "job_states_ts": { + "set": true, + "infinite": false, + "number": 1769652102 + }, + "bf_backfilled_jobs": 287, + "bf_last_backfilled_jobs": 287, + "bf_backfilled_het_jobs": 0, + "bf_cycle_counter": 10, + "bf_cycle_mean": 37143463, + "bf_depth_mean": 448, + "bf_depth_mean_try": 230, + "bf_cycle_sum": 371434634, + "bf_cycle_last": 46552416, + "bf_cycle_max": 47125449, + "bf_exit": { + "end_job_queue": 10, + "bf_max_job_start": 0, + "bf_max_job_test": 0, + "bf_max_time": 0, + "bf_node_space_size": 0, + "state_changed": 0 + }, + "bf_last_depth": 353, + "bf_last_depth_try": 165, + "bf_depth_sum": 4480, + "bf_depth_try_sum": 2301, + "bf_queue_len": 411, + "bf_queue_len_mean": 453, + "bf_queue_len_sum": 4537, + "bf_table_size": 12, + "bf_table_size_sum": 104, + "bf_table_size_mean": 453, + "bf_when_last_cycle": { + "set": true, + "infinite": false, + "number": 1769652085 + }, + "bf_active": true, + "rpcs_by_message_type": [ + { + "type_id": 2049, + "message_type": "REQUEST_FED_INFO", + "count": 5510, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 466841, + "average_time": { + "set": true, + "infinite": false, + "number": 84 + } + }, + { + "type_id": 3001, + "message_type": "REQUEST_UPDATE_JOB", + "count": 198, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 57498589, + "average_time": { + "set": true, + "infinite": false, + "number": 290396 + } + }, + { + "type_id": 2009, + "message_type": "REQUEST_PARTITION_INFO", + "count": 4165, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 423385, + "average_time": { + "set": true, + "infinite": false, + "number": 101 + } + }, + { + "type_id": 2001, + "message_type": "REQUEST_BUILD_INFO", + "count": 119, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 46071705, + "average_time": { + "set": true, + "infinite": false, + "number": 387157 + } + }, + { + "type_id": 5005, + "message_type": "REQUEST_CANCEL_JOB_STEP", + "count": 33, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 10129846, + "average_time": { + "set": true, + "infinite": false, + "number": 306965 + } + }, + { + "type_id": 1008, + "message_type": "REQUEST_PING", + "count": 614, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 48074, + "average_time": { + "set": true, + "infinite": false, + "number": 78 + } + }, + { + "type_id": 2021, + "message_type": "REQUEST_JOB_INFO_SINGLE", + "count": 3049, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 1229351085, + "average_time": { + "set": true, + "infinite": false, + "number": 403198 + } + }, + { + "type_id": 6500, + "message_type": "REQUEST_PERSIST_INIT", + "count": 31, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 3718, + "average_time": { + "set": true, + "infinite": false, + "number": 119 + } + }, + { + "type_id": 2040, + "message_type": "REQUEST_NODE_INFO_SINGLE", + "count": 4683, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 571743838, + "average_time": { + "set": true, + "infinite": false, + "number": 122089 + } + }, + { + "type_id": 2039, + "message_type": "REQUEST_JOB_USER_INFO", + "count": 2453, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 408696689, + "average_time": { + "set": true, + "infinite": false, + "number": 166610 + } + }, + { + "type_id": 5018, + "message_type": "REQUEST_COMPLETE_BATCH_SCRIPT", + "count": 524, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 204218460, + "average_time": { + "set": true, + "infinite": false, + "number": 389729 + } + }, + { + "type_id": 5016, + "message_type": "REQUEST_STEP_COMPLETE", + "count": 767, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 149281139, + "average_time": { + "set": true, + "infinite": false, + "number": 194629 + } + }, + { + "type_id": 5023, + "message_type": "REQUEST_JOB_REQUEUE", + "count": 24, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 1044426, + "average_time": { + "set": true, + "infinite": false, + "number": 43517 + } + }, + { + "type_id": 6012, + "message_type": "MESSAGE_EPILOG_COMPLETE", + "count": 589, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 74296558, + "average_time": { + "set": true, + "infinite": false, + "number": 126140 + } + }, + { + "type_id": 2007, + "message_type": "REQUEST_NODE_INFO", + "count": 2985, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 638923002, + "average_time": { + "set": true, + "infinite": false, + "number": 214044 + } + }, + { + "type_id": 1002, + "message_type": "MESSAGE_NODE_REGISTRATION_STATUS", + "count": 5872, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 340069073, + "average_time": { + "set": true, + "infinite": false, + "number": 57913 + } + }, + { + "type_id": 6018, + "message_type": "REQUEST_COMPLETE_PROLOG", + "count": 644, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 72557843, + "average_time": { + "set": true, + "infinite": false, + "number": 112667 + } + }, + { + "type_id": 2003, + "message_type": "REQUEST_JOB_INFO", + "count": 535, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 205329065, + "average_time": { + "set": true, + "infinite": false, + "number": 383792 + } + }, + { + "type_id": 2035, + "message_type": "REQUEST_STATS_INFO", + "count": 246, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 23673, + "average_time": { + "set": true, + "infinite": false, + "number": 96 + } + }, + { + "type_id": 4027, + "message_type": "REQUEST_HET_JOB_ALLOC_INFO", + "count": 236, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 90707386, + "average_time": { + "set": true, + "infinite": false, + "number": 384353 + } + }, + { + "type_id": 5001, + "message_type": "REQUEST_JOB_STEP_CREATE", + "count": 255, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 113836418, + "average_time": { + "set": true, + "infinite": false, + "number": 446417 + } + }, + { + "type_id": 5017, + "message_type": "REQUEST_COMPLETE_JOB_ALLOCATION", + "count": 14, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 5658245, + "average_time": { + "set": true, + "infinite": false, + "number": 404160 + } + }, + { + "type_id": 3002, + "message_type": "REQUEST_UPDATE_NODE", + "count": 5, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 3476, + "average_time": { + "set": true, + "infinite": false, + "number": 695 + } + }, + { + "type_id": 5032, + "message_type": "REQUEST_KILL_JOB", + "count": 20, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 13528813, + "average_time": { + "set": true, + "infinite": false, + "number": 676440 + } + }, + { + "type_id": 4001, + "message_type": "REQUEST_RESOURCE_ALLOCATION", + "count": 7, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 8476485, + "average_time": { + "set": true, + "infinite": false, + "number": 1210926 + } + }, + { + "type_id": 4012, + "message_type": "REQUEST_JOB_WILL_RUN", + "count": 5, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 6832810, + "average_time": { + "set": true, + "infinite": false, + "number": 1366562 + } + }, + { + "type_id": 4019, + "message_type": "REQUEST_JOB_READY", + "count": 28, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 13036751, + "average_time": { + "set": true, + "infinite": false, + "number": 465598 + } + }, + { + "type_id": 4003, + "message_type": "REQUEST_SUBMIT_BATCH_JOB", + "count": 40, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 24094071, + "average_time": { + "set": true, + "infinite": false, + "number": 602351 + } + }, + { + "type_id": 10001, + "message_type": "ACCOUNTING_UPDATE_MSG", + "count": 94, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 2286, + "average_time": { + "set": true, + "infinite": false, + "number": 24 + } + }, + { + "type_id": 3014, + "message_type": "REQUEST_DELETE_NODE", + "count": 35, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 31360369, + "average_time": { + "set": true, + "infinite": false, + "number": 896010 + } + } + ], + "rpcs_by_user": [ + { + "user_id": 0, + "user": "root", + "count": 22507, + "total_time": 2364143744, + "average_time": { + "set": true, + "infinite": false, + "number": 105040 + } + }, + { + "user_id": 705806, + "user": "guillemram", + "count": 1248, + "total_time": 263479187, + "average_time": { + "set": true, + "infinite": false, + "number": 211121 + } + }, + { + "user_id": 673564, + "user": "seitzer", + "count": 612, + "total_time": 115432095, + "average_time": { + "set": true, + "infinite": false, + "number": 188614 + } + }, + { + "user_id": 205509, + "user": "ruanrms", + "count": 63, + "total_time": 3470756, + "average_time": { + "set": true, + "infinite": false, + "number": 55091 + } + }, + { + "user_id": 717631, + "user": "rrmaura", + "count": 90, + "total_time": 15877823, + "average_time": { + "set": true, + "infinite": false, + "number": 176420 + } + }, + { + "user_id": 657536, + "user": "gaoyuezhou", + "count": 458, + "total_time": 63691863, + "average_time": { + "set": true, + "infinite": false, + "number": 139065 + } + }, + { + "user_id": 705451, + "user": "cyclin", + "count": 1730, + "total_time": 247078993, + "average_time": { + "set": true, + "infinite": false, + "number": 142820 + } + }, + { + "user_id": 696463, + "user": "shimon", + "count": 148, + "total_time": 27280053, + "average_time": { + "set": true, + "infinite": false, + "number": 184324 + } + }, + { + "user_id": 700291, + "user": "shashankrajput", + "count": 1, + "total_time": 146, + "average_time": { + "set": true, + "infinite": false, + "number": 146 + } + }, + { + "user_id": 660884, + "user": "rulin", + "count": 1635, + "total_time": 42202940, + "average_time": { + "set": true, + "infinite": false, + "number": 25812 + } + }, + { + "user_id": 401, + "user": "slurm", + "count": 253, + "total_time": 14225, + "average_time": { + "set": true, + "infinite": false, + "number": 56 + } + }, + { + "user_id": 30000000, + "user": "generatedunixname677578511809727", + "count": 100, + "total_time": 38830225, + "average_time": { + "set": true, + "infinite": false, + "number": 388302 + } + }, + { + "user_id": 710110, + "user": "nelliewu", + "count": 20, + "total_time": 1193380, + "average_time": { + "set": true, + "infinite": false, + "number": 59669 + } + }, + { + "user_id": 369025, + "user": "meghanam", + "count": 12, + "total_time": 7214027, + "average_time": { + "set": true, + "infinite": false, + "number": 601168 + } + }, + { + "user_id": 726327, + "user": "harleenhanspal", + "count": 36, + "total_time": 14953780, + "average_time": { + "set": true, + "infinite": false, + "number": 415382 + } + }, + { + "user_id": 434845, + "user": "xuanyizhao", + "count": 166, + "total_time": 37911197, + "average_time": { + "set": true, + "infinite": false, + "number": 228380 + } + }, + { + "user_id": 605412, + "user": "jcaudet", + "count": 400, + "total_time": 86316815, + "average_time": { + "set": true, + "infinite": false, + "number": 215792 + } + }, + { + "user_id": 701293, + "user": "zhai", + "count": 22, + "total_time": 6463414, + "average_time": { + "set": true, + "infinite": false, + "number": 293791 + } + }, + { + "user_id": 628377, + "user": "aliciagolden", + "count": 2, + "total_time": 282, + "average_time": { + "set": true, + "infinite": false, + "number": 141 + } + }, + { + "user_id": 710761, + "user": "zliu2346", + "count": 667, + "total_time": 166627089, + "average_time": { + "set": true, + "infinite": false, + "number": 249815 + } + }, + { + "user_id": 133511, + "user": "edinan", + "count": 28, + "total_time": 18070588, + "average_time": { + "set": true, + "infinite": false, + "number": 645378 + } + }, + { + "user_id": 717724, + "user": "chleow89", + "count": 90, + "total_time": 18596809, + "average_time": { + "set": true, + "infinite": false, + "number": 206631 + } + }, + { + "user_id": 563377, + "user": "liamrobbins", + "count": 488, + "total_time": 90969175, + "average_time": { + "set": true, + "infinite": false, + "number": 186412 + } + }, + { + "user_id": 139410, + "user": "ballasn", + "count": 21, + "total_time": 4774646, + "average_time": { + "set": true, + "infinite": false, + "number": 227364 + } + }, + { + "user_id": 718528, + "user": "lcherif", + "count": 2448, + "total_time": 513227787, + "average_time": { + "set": true, + "infinite": false, + "number": 209651 + } + }, + { + "user_id": 658431, + "user": "ahavens", + "count": 480, + "total_time": 144110176, + "average_time": { + "set": true, + "infinite": false, + "number": 300229 + } + }, + { + "user_id": 669117, + "user": "alnicolicioiu", + "count": 3, + "total_time": 337838, + "average_time": { + "set": true, + "infinite": false, + "number": 112612 + } + }, + { + "user_id": 674675, + "user": "charlesarnal", + "count": 3, + "total_time": 1806380, + "average_time": { + "set": true, + "infinite": false, + "number": 602126 + } + }, + { + "user_id": 129954, + "user": "shru", + "count": 23, + "total_time": 11488884, + "average_time": { + "set": true, + "infinite": false, + "number": 499516 + } + }, + { + "user_id": 707882, + "user": "balomari", + "count": 20, + "total_time": 9233327, + "average_time": { + "set": true, + "infinite": false, + "number": 461666 + } + }, + { + "user_id": 723290, + "user": "nikitaagw", + "count": 4, + "total_time": 1826248, + "average_time": { + "set": true, + "infinite": false, + "number": 456562 + } + }, + { + "user_id": 570702, + "user": "luccab", + "count": 2, + "total_time": 1090227, + "average_time": { + "set": true, + "infinite": false, + "number": 545113 + } + } + ], + "pending_rpcs": [ + ], + "pending_rpcs_by_hostlist": [ + ] + }, + "meta": { + "plugin": { + "type": "", + "name": "", + "data_parser": "data_parser\/v0.0.42", + "accounting_storage": "accounting_storage\/slurmdbd" + }, + "client": { + "source": "\/dev\/pts\/0", + "user": "yongl", + "group": "yongl" + }, + "command": [ + "sdiag", + "--all" + ], + "slurm": { + "version": { + "major": "24", + "micro": "5", + "minor": "11" + }, + "release": "24.11.5", + "cluster": "fair-sc" + } + }, + "errors": [ + ], + "warnings": [ + ] +} diff --git a/gcm/tests/test_slurm.py b/gcm/tests/test_slurm.py index 8b2c2bc..91345b6 100644 --- a/gcm/tests/test_slurm.py +++ b/gcm/tests/test_slurm.py @@ -1,10 +1,11 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +import json import logging import subprocess from functools import partial from importlib import resources -from unittest.mock import create_autospec, MagicMock +from unittest.mock import create_autospec, MagicMock, patch import pytest from gcm.monitoring.clock import time_to_time_aware @@ -12,6 +13,7 @@ from gcm.monitoring.slurm.derived_cluster import get_derived_cluster +from gcm.schemas.slurm.sdiag import Sdiag from gcm.schemas.slurm.sinfo import Sinfo from gcm.schemas.slurm.sinfo_node import SinfoNode from gcm.schemas.slurm.squeue import JobData @@ -464,3 +466,108 @@ def test_sinfo_structured(dataset: str, expected: Sinfo) -> None: actual = c.sinfo_structured() assert actual == expected + + @staticmethod + @patch.object(SlurmCliClient, "_reset_sdiag_counters") + @patch("clusterscope.slurm_version") + @patch("subprocess.check_output") + def test_parse_sdiag_json( + mock_check_output: MagicMock, + mock_slurm_version: MagicMock, + mock_reset: MagicMock, + ) -> None: + mock_slurm_version.return_value = (23, 2) + + with resources.open_text(data, "sample-sdiag-output.json") as f: + mock_check_output.return_value = f.read() + + c = SlurmCliClient() + result = c.sdiag_structured() + + expected = Sdiag( + server_thread_count=4, + agent_queue_size=5, + agent_count=3, + agent_thread_count=8, + dbd_agent_queue_size=2, + schedule_cycle_max=2788800, + schedule_cycle_mean=1737702, + schedule_cycle_sum=582130236, + schedule_cycle_total=335, + schedule_cycle_per_minute=12, + schedule_queue_length=407, + sdiag_jobs_submitted=504, + sdiag_jobs_started=579, + sdiag_jobs_completed=524, + sdiag_jobs_canceled=20, + sdiag_jobs_failed=0, + sdiag_jobs_pending=20725, + sdiag_jobs_running=3273, + bf_backfilled_jobs=287, + bf_cycle_mean=37143463, + bf_cycle_sum=371434634, + bf_cycle_max=47125449, + bf_queue_len=411, + ) + + assert result == expected + mock_check_output.assert_called_once_with( + ["sdiag", "--all", "--json"], text=True + ) + mock_reset.assert_called_once() + + @staticmethod + @patch.object(SlurmCliClient, "_reset_sdiag_counters") + @patch("clusterscope.slurm_version") + @patch("subprocess.check_output") + def test_parse_sdiag_json_with_missing_fields( + mock_check_output: MagicMock, + mock_slurm_version: MagicMock, + mock_reset: MagicMock, + ) -> None: + mock_slurm_version.return_value = (23, 2) + + minimal_json = json.dumps( + { + "statistics": { + "server_thread_count": 10, + "agent_queue_size": 5, + "agent_count": 3, + "agent_thread_count": 8, + "dbd_agent_queue_size": 2, + } + } + ) + mock_check_output.return_value = minimal_json + + c = SlurmCliClient() + result = c.sdiag_structured() + + expected = Sdiag( + server_thread_count=10, + agent_queue_size=5, + agent_count=3, + agent_thread_count=8, + dbd_agent_queue_size=2, + schedule_cycle_max=None, + schedule_cycle_mean=None, + schedule_cycle_sum=None, + schedule_cycle_total=None, + schedule_cycle_per_minute=None, + schedule_queue_length=None, + sdiag_jobs_submitted=None, + sdiag_jobs_started=None, + sdiag_jobs_completed=None, + sdiag_jobs_canceled=None, + sdiag_jobs_failed=None, + sdiag_jobs_pending=None, + sdiag_jobs_running=None, + bf_backfilled_jobs=None, + bf_cycle_mean=None, + bf_cycle_sum=None, + bf_cycle_max=None, + bf_queue_len=None, + ) + + assert result == expected + mock_reset.assert_called_once() From e7a0229236d1f274a589affa382f2a8b4f99d008 Mon Sep 17 00:00:00 2001 From: yongl user Date: Wed, 4 Feb 2026 22:14:31 +0000 Subject: [PATCH 2/4] Update json for test --- gcm/tests/data/sample-sdiag-output.json | 335 +----------------------- 1 file changed, 8 insertions(+), 327 deletions(-) diff --git a/gcm/tests/data/sample-sdiag-output.json b/gcm/tests/data/sample-sdiag-output.json index 8e1ef3b..bac1ef6 100644 --- a/gcm/tests/data/sample-sdiag-output.json +++ b/gcm/tests/data/sample-sdiag-output.json @@ -536,7 +536,7 @@ "rpcs_by_user": [ { "user_id": 0, - "user": "root", + "user": "a", "count": 22507, "total_time": 2364143744, "average_time": { @@ -546,8 +546,8 @@ } }, { - "user_id": 705806, - "user": "guillemram", + "user_id": 1, + "user": "b", "count": 1248, "total_time": 263479187, "average_time": { @@ -557,327 +557,8 @@ } }, { - "user_id": 673564, - "user": "seitzer", - "count": 612, - "total_time": 115432095, - "average_time": { - "set": true, - "infinite": false, - "number": 188614 - } - }, - { - "user_id": 205509, - "user": "ruanrms", - "count": 63, - "total_time": 3470756, - "average_time": { - "set": true, - "infinite": false, - "number": 55091 - } - }, - { - "user_id": 717631, - "user": "rrmaura", - "count": 90, - "total_time": 15877823, - "average_time": { - "set": true, - "infinite": false, - "number": 176420 - } - }, - { - "user_id": 657536, - "user": "gaoyuezhou", - "count": 458, - "total_time": 63691863, - "average_time": { - "set": true, - "infinite": false, - "number": 139065 - } - }, - { - "user_id": 705451, - "user": "cyclin", - "count": 1730, - "total_time": 247078993, - "average_time": { - "set": true, - "infinite": false, - "number": 142820 - } - }, - { - "user_id": 696463, - "user": "shimon", - "count": 148, - "total_time": 27280053, - "average_time": { - "set": true, - "infinite": false, - "number": 184324 - } - }, - { - "user_id": 700291, - "user": "shashankrajput", - "count": 1, - "total_time": 146, - "average_time": { - "set": true, - "infinite": false, - "number": 146 - } - }, - { - "user_id": 660884, - "user": "rulin", - "count": 1635, - "total_time": 42202940, - "average_time": { - "set": true, - "infinite": false, - "number": 25812 - } - }, - { - "user_id": 401, - "user": "slurm", - "count": 253, - "total_time": 14225, - "average_time": { - "set": true, - "infinite": false, - "number": 56 - } - }, - { - "user_id": 30000000, - "user": "generatedunixname677578511809727", - "count": 100, - "total_time": 38830225, - "average_time": { - "set": true, - "infinite": false, - "number": 388302 - } - }, - { - "user_id": 710110, - "user": "nelliewu", - "count": 20, - "total_time": 1193380, - "average_time": { - "set": true, - "infinite": false, - "number": 59669 - } - }, - { - "user_id": 369025, - "user": "meghanam", - "count": 12, - "total_time": 7214027, - "average_time": { - "set": true, - "infinite": false, - "number": 601168 - } - }, - { - "user_id": 726327, - "user": "harleenhanspal", - "count": 36, - "total_time": 14953780, - "average_time": { - "set": true, - "infinite": false, - "number": 415382 - } - }, - { - "user_id": 434845, - "user": "xuanyizhao", - "count": 166, - "total_time": 37911197, - "average_time": { - "set": true, - "infinite": false, - "number": 228380 - } - }, - { - "user_id": 605412, - "user": "jcaudet", - "count": 400, - "total_time": 86316815, - "average_time": { - "set": true, - "infinite": false, - "number": 215792 - } - }, - { - "user_id": 701293, - "user": "zhai", - "count": 22, - "total_time": 6463414, - "average_time": { - "set": true, - "infinite": false, - "number": 293791 - } - }, - { - "user_id": 628377, - "user": "aliciagolden", - "count": 2, - "total_time": 282, - "average_time": { - "set": true, - "infinite": false, - "number": 141 - } - }, - { - "user_id": 710761, - "user": "zliu2346", - "count": 667, - "total_time": 166627089, - "average_time": { - "set": true, - "infinite": false, - "number": 249815 - } - }, - { - "user_id": 133511, - "user": "edinan", - "count": 28, - "total_time": 18070588, - "average_time": { - "set": true, - "infinite": false, - "number": 645378 - } - }, - { - "user_id": 717724, - "user": "chleow89", - "count": 90, - "total_time": 18596809, - "average_time": { - "set": true, - "infinite": false, - "number": 206631 - } - }, - { - "user_id": 563377, - "user": "liamrobbins", - "count": 488, - "total_time": 90969175, - "average_time": { - "set": true, - "infinite": false, - "number": 186412 - } - }, - { - "user_id": 139410, - "user": "ballasn", - "count": 21, - "total_time": 4774646, - "average_time": { - "set": true, - "infinite": false, - "number": 227364 - } - }, - { - "user_id": 718528, - "user": "lcherif", - "count": 2448, - "total_time": 513227787, - "average_time": { - "set": true, - "infinite": false, - "number": 209651 - } - }, - { - "user_id": 658431, - "user": "ahavens", - "count": 480, - "total_time": 144110176, - "average_time": { - "set": true, - "infinite": false, - "number": 300229 - } - }, - { - "user_id": 669117, - "user": "alnicolicioiu", - "count": 3, - "total_time": 337838, - "average_time": { - "set": true, - "infinite": false, - "number": 112612 - } - }, - { - "user_id": 674675, - "user": "charlesarnal", - "count": 3, - "total_time": 1806380, - "average_time": { - "set": true, - "infinite": false, - "number": 602126 - } - }, - { - "user_id": 129954, - "user": "shru", - "count": 23, - "total_time": 11488884, - "average_time": { - "set": true, - "infinite": false, - "number": 499516 - } - }, - { - "user_id": 707882, - "user": "balomari", - "count": 20, - "total_time": 9233327, - "average_time": { - "set": true, - "infinite": false, - "number": 461666 - } - }, - { - "user_id": 723290, - "user": "nikitaagw", - "count": 4, - "total_time": 1826248, - "average_time": { - "set": true, - "infinite": false, - "number": 456562 - } - }, - { - "user_id": 570702, - "user": "luccab", + "user_id": 2, + "user": "c", "count": 2, "total_time": 1090227, "average_time": { @@ -901,8 +582,8 @@ }, "client": { "source": "\/dev\/pts\/0", - "user": "yongl", - "group": "yongl" + "user": "x", + "group": "x" }, "command": [ "sdiag", @@ -915,7 +596,7 @@ "minor": "11" }, "release": "24.11.5", - "cluster": "fair-sc" + "cluster": "test-cluster" } }, "errors": [ From 4b8b66b1c7f08c722f1087cf428f90965269ee4a Mon Sep 17 00:00:00 2001 From: "yongl@yongl-login-0.yongl-login.tenant-slurm.svc.cluster.local" <> Date: Thu, 5 Feb 2026 17:21:32 +0000 Subject: [PATCH 3/4] [sdiag] Add sdiag telemetry --- gcm/monitoring/slurm/client.py | 88 ++- gcm/schemas/slurm/sdiag.py | 24 + gcm/tests/data/sample-sdiag-output.json | 925 ++++++++++++++++++++++++ gcm/tests/test_slurm.py | 109 ++- 4 files changed, 1137 insertions(+), 9 deletions(-) create mode 100644 gcm/tests/data/sample-sdiag-output.json diff --git a/gcm/monitoring/slurm/client.py b/gcm/monitoring/slurm/client.py index 7ae5ba4..b99a50a 100644 --- a/gcm/monitoring/slurm/client.py +++ b/gcm/monitoring/slurm/client.py @@ -186,15 +186,39 @@ def sdiag_structured(self) -> Sdiag: sdiag_output = json.loads( subprocess.check_output(["sdiag", "--all", "--json"], text=True) ) - - return Sdiag( - server_thread_count=sdiag_output["statistics"]["server_thread_count"], - agent_queue_size=sdiag_output["statistics"]["agent_queue_size"], - agent_count=sdiag_output["statistics"]["agent_count"], - agent_thread_count=sdiag_output["statistics"]["agent_thread_count"], - dbd_agent_queue_size=sdiag_output["statistics"]["dbd_agent_queue_size"], + stats = sdiag_output["statistics"] + + result = Sdiag( + server_thread_count=stats.get("server_thread_count"), + agent_queue_size=stats.get("agent_queue_size"), + agent_count=stats.get("agent_count"), + agent_thread_count=stats.get("agent_thread_count"), + dbd_agent_queue_size=stats.get("dbd_agent_queue_size"), + schedule_cycle_max=stats.get("schedule_cycle_max"), + schedule_cycle_mean=stats.get("schedule_cycle_mean"), + schedule_cycle_sum=stats.get("schedule_cycle_sum"), + schedule_cycle_total=stats.get("schedule_cycle_total"), + schedule_cycle_per_minute=stats.get("schedule_cycle_per_minute"), + schedule_queue_length=stats.get("schedule_queue_length"), + sdiag_jobs_submitted=stats.get("jobs_submitted"), + sdiag_jobs_started=stats.get("jobs_started"), + sdiag_jobs_completed=stats.get("jobs_completed"), + sdiag_jobs_canceled=stats.get("jobs_canceled"), + sdiag_jobs_failed=stats.get("jobs_failed"), + sdiag_jobs_pending=stats.get("jobs_pending"), + sdiag_jobs_running=stats.get("jobs_running"), + bf_backfilled_jobs=stats.get("bf_backfilled_jobs"), + bf_cycle_mean=stats.get("bf_cycle_mean"), + bf_cycle_sum=stats.get("bf_cycle_sum"), + bf_cycle_max=stats.get("bf_cycle_max"), + bf_queue_len=stats.get("bf_queue_len"), ) + # Reset sdiag counters after collection + self._reset_sdiag_counters() + + return result + sdiag_output = subprocess.check_output(["sdiag", "--all"], text=True) metric_names = { "Server thread count:": "server_thread_count", @@ -203,7 +227,7 @@ def sdiag_structured(self) -> Sdiag: "Agent thread count:": "agent_thread_count", "DBD Agent queue size:": "dbd_agent_queue_size", } - data = { + data: dict[str, Optional[int]] = { "server_thread_count": 0, "agent_queue_size": 0, "agent_count": 0, @@ -215,8 +239,56 @@ def sdiag_structured(self) -> Sdiag: lines = re.search(rf".*{sdiag_name}.*", sdiag_output) assert lines is not None, f"Sdiag metric {sdiag_name} not found: {lines}" data[name] = int(lines.group().strip(f"{sdiag_name}")) + + optional_metric_names = { + "Schedule cycle max:": "schedule_cycle_max", + "Schedule cycle mean:": "schedule_cycle_mean", + "Schedule cycle sum:": "schedule_cycle_sum", + "Schedule cycle total:": "schedule_cycle_total", + "Schedule cycle per minute:": "schedule_cycle_per_minute", + "Schedule queue length:": "schedule_queue_length", + "Jobs submitted:": "sdiag_jobs_submitted", + "Jobs started:": "sdiag_jobs_started", + "Jobs completed:": "sdiag_jobs_completed", + "Jobs canceled:": "sdiag_jobs_canceled", + "Jobs failed:": "sdiag_jobs_failed", + "Jobs pending:": "sdiag_jobs_pending", + "Jobs running:": "sdiag_jobs_running", + "Total backfilled jobs \\(since last slurm start\\):": "bf_backfilled_jobs", + "Backfill cycle mean:": "bf_cycle_mean", + "Backfill cycle sum:": "bf_cycle_sum", + "Backfill cycle max:": "bf_cycle_max", + "Backfill queue length:": "bf_queue_len", + } + + for sdiag_name, name in optional_metric_names.items(): + match = re.search(rf"{sdiag_name}\s*(\d+)", sdiag_output) + if match: + data[name] = int(match.group(1)) + else: + data[name] = None + + # Reset sdiag counters after collection + self._reset_sdiag_counters() + return Sdiag(**data) + def _reset_sdiag_counters(self) -> None: + """Reset sdiag counters after collection. + + This requires appropriate permissions (typically root or SlurmUser). + If the reset fails due to permission issues, a warning is logged. + """ + try: + subprocess.run( + ["sdiag", "--reset"], + check=True, + capture_output=True, + text=True, + ) + except subprocess.CalledProcessError as e: + logger.warning(f"Failed to reset sdiag counters: {e.stderr.strip()}") + def sinfo_structured(self) -> Sinfo: fieldnames = [f.name for f in fields(SinfoRow)] diff --git a/gcm/schemas/slurm/sdiag.py b/gcm/schemas/slurm/sdiag.py index 60aaecd..7703713 100644 --- a/gcm/schemas/slurm/sdiag.py +++ b/gcm/schemas/slurm/sdiag.py @@ -11,3 +11,27 @@ class Sdiag: agent_count: Optional[int] agent_thread_count: Optional[int] dbd_agent_queue_size: Optional[int] + + # Schedule cycle statistics + schedule_cycle_max: Optional[int] = None + schedule_cycle_mean: Optional[int] = None + schedule_cycle_sum: Optional[int] = None + schedule_cycle_total: Optional[int] = None + schedule_cycle_per_minute: Optional[int] = None + schedule_queue_length: Optional[int] = None + + # Job statistics (prefixed with sdiag_ to avoid collision with SLURMLog) + sdiag_jobs_submitted: Optional[int] = None + sdiag_jobs_started: Optional[int] = None + sdiag_jobs_completed: Optional[int] = None + sdiag_jobs_canceled: Optional[int] = None + sdiag_jobs_failed: Optional[int] = None + sdiag_jobs_pending: Optional[int] = None + sdiag_jobs_running: Optional[int] = None + + # Backfill statistics + bf_backfilled_jobs: Optional[int] = None + bf_cycle_mean: Optional[int] = None + bf_cycle_sum: Optional[int] = None + bf_cycle_max: Optional[int] = None + bf_queue_len: Optional[int] = None diff --git a/gcm/tests/data/sample-sdiag-output.json b/gcm/tests/data/sample-sdiag-output.json new file mode 100644 index 0000000..8e1ef3b --- /dev/null +++ b/gcm/tests/data/sample-sdiag-output.json @@ -0,0 +1,925 @@ + +{ + "statistics": { + "parts_packed": 1, + "req_time": { + "set": true, + "infinite": false, + "number": 1769652127 + }, + "req_time_start": { + "set": true, + "infinite": false, + "number": 1769650551 + }, + "server_thread_count": 4, + "agent_queue_size": 5, + "agent_count": 3, + "agent_thread_count": 8, + "dbd_agent_queue_size": 2, + "gettimeofday_latency": 26, + "schedule_cycle_max": 2788800, + "schedule_cycle_last": 2258381, + "schedule_cycle_sum": 582130236, + "schedule_cycle_total": 335, + "schedule_cycle_mean": 1737702, + "schedule_cycle_mean_depth": 28, + "schedule_cycle_per_minute": 12, + "schedule_cycle_depth": 9466, + "schedule_exit": { + "end_job_queue": 54, + "default_queue_depth": 0, + "max_job_start": 0, + "max_rpc_cnt": 0, + "max_sched_time": 281, + "licenses": 0 + }, + "schedule_queue_length": 407, + "jobs_submitted": 504, + "jobs_started": 579, + "jobs_completed": 524, + "jobs_canceled": 20, + "jobs_failed": 0, + "jobs_pending": 20725, + "jobs_running": 3273, + "job_states_ts": { + "set": true, + "infinite": false, + "number": 1769652102 + }, + "bf_backfilled_jobs": 287, + "bf_last_backfilled_jobs": 287, + "bf_backfilled_het_jobs": 0, + "bf_cycle_counter": 10, + "bf_cycle_mean": 37143463, + "bf_depth_mean": 448, + "bf_depth_mean_try": 230, + "bf_cycle_sum": 371434634, + "bf_cycle_last": 46552416, + "bf_cycle_max": 47125449, + "bf_exit": { + "end_job_queue": 10, + "bf_max_job_start": 0, + "bf_max_job_test": 0, + "bf_max_time": 0, + "bf_node_space_size": 0, + "state_changed": 0 + }, + "bf_last_depth": 353, + "bf_last_depth_try": 165, + "bf_depth_sum": 4480, + "bf_depth_try_sum": 2301, + "bf_queue_len": 411, + "bf_queue_len_mean": 453, + "bf_queue_len_sum": 4537, + "bf_table_size": 12, + "bf_table_size_sum": 104, + "bf_table_size_mean": 453, + "bf_when_last_cycle": { + "set": true, + "infinite": false, + "number": 1769652085 + }, + "bf_active": true, + "rpcs_by_message_type": [ + { + "type_id": 2049, + "message_type": "REQUEST_FED_INFO", + "count": 5510, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 466841, + "average_time": { + "set": true, + "infinite": false, + "number": 84 + } + }, + { + "type_id": 3001, + "message_type": "REQUEST_UPDATE_JOB", + "count": 198, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 57498589, + "average_time": { + "set": true, + "infinite": false, + "number": 290396 + } + }, + { + "type_id": 2009, + "message_type": "REQUEST_PARTITION_INFO", + "count": 4165, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 423385, + "average_time": { + "set": true, + "infinite": false, + "number": 101 + } + }, + { + "type_id": 2001, + "message_type": "REQUEST_BUILD_INFO", + "count": 119, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 46071705, + "average_time": { + "set": true, + "infinite": false, + "number": 387157 + } + }, + { + "type_id": 5005, + "message_type": "REQUEST_CANCEL_JOB_STEP", + "count": 33, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 10129846, + "average_time": { + "set": true, + "infinite": false, + "number": 306965 + } + }, + { + "type_id": 1008, + "message_type": "REQUEST_PING", + "count": 614, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 48074, + "average_time": { + "set": true, + "infinite": false, + "number": 78 + } + }, + { + "type_id": 2021, + "message_type": "REQUEST_JOB_INFO_SINGLE", + "count": 3049, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 1229351085, + "average_time": { + "set": true, + "infinite": false, + "number": 403198 + } + }, + { + "type_id": 6500, + "message_type": "REQUEST_PERSIST_INIT", + "count": 31, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 3718, + "average_time": { + "set": true, + "infinite": false, + "number": 119 + } + }, + { + "type_id": 2040, + "message_type": "REQUEST_NODE_INFO_SINGLE", + "count": 4683, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 571743838, + "average_time": { + "set": true, + "infinite": false, + "number": 122089 + } + }, + { + "type_id": 2039, + "message_type": "REQUEST_JOB_USER_INFO", + "count": 2453, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 408696689, + "average_time": { + "set": true, + "infinite": false, + "number": 166610 + } + }, + { + "type_id": 5018, + "message_type": "REQUEST_COMPLETE_BATCH_SCRIPT", + "count": 524, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 204218460, + "average_time": { + "set": true, + "infinite": false, + "number": 389729 + } + }, + { + "type_id": 5016, + "message_type": "REQUEST_STEP_COMPLETE", + "count": 767, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 149281139, + "average_time": { + "set": true, + "infinite": false, + "number": 194629 + } + }, + { + "type_id": 5023, + "message_type": "REQUEST_JOB_REQUEUE", + "count": 24, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 1044426, + "average_time": { + "set": true, + "infinite": false, + "number": 43517 + } + }, + { + "type_id": 6012, + "message_type": "MESSAGE_EPILOG_COMPLETE", + "count": 589, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 74296558, + "average_time": { + "set": true, + "infinite": false, + "number": 126140 + } + }, + { + "type_id": 2007, + "message_type": "REQUEST_NODE_INFO", + "count": 2985, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 638923002, + "average_time": { + "set": true, + "infinite": false, + "number": 214044 + } + }, + { + "type_id": 1002, + "message_type": "MESSAGE_NODE_REGISTRATION_STATUS", + "count": 5872, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 340069073, + "average_time": { + "set": true, + "infinite": false, + "number": 57913 + } + }, + { + "type_id": 6018, + "message_type": "REQUEST_COMPLETE_PROLOG", + "count": 644, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 72557843, + "average_time": { + "set": true, + "infinite": false, + "number": 112667 + } + }, + { + "type_id": 2003, + "message_type": "REQUEST_JOB_INFO", + "count": 535, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 205329065, + "average_time": { + "set": true, + "infinite": false, + "number": 383792 + } + }, + { + "type_id": 2035, + "message_type": "REQUEST_STATS_INFO", + "count": 246, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 23673, + "average_time": { + "set": true, + "infinite": false, + "number": 96 + } + }, + { + "type_id": 4027, + "message_type": "REQUEST_HET_JOB_ALLOC_INFO", + "count": 236, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 90707386, + "average_time": { + "set": true, + "infinite": false, + "number": 384353 + } + }, + { + "type_id": 5001, + "message_type": "REQUEST_JOB_STEP_CREATE", + "count": 255, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 113836418, + "average_time": { + "set": true, + "infinite": false, + "number": 446417 + } + }, + { + "type_id": 5017, + "message_type": "REQUEST_COMPLETE_JOB_ALLOCATION", + "count": 14, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 5658245, + "average_time": { + "set": true, + "infinite": false, + "number": 404160 + } + }, + { + "type_id": 3002, + "message_type": "REQUEST_UPDATE_NODE", + "count": 5, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 3476, + "average_time": { + "set": true, + "infinite": false, + "number": 695 + } + }, + { + "type_id": 5032, + "message_type": "REQUEST_KILL_JOB", + "count": 20, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 13528813, + "average_time": { + "set": true, + "infinite": false, + "number": 676440 + } + }, + { + "type_id": 4001, + "message_type": "REQUEST_RESOURCE_ALLOCATION", + "count": 7, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 8476485, + "average_time": { + "set": true, + "infinite": false, + "number": 1210926 + } + }, + { + "type_id": 4012, + "message_type": "REQUEST_JOB_WILL_RUN", + "count": 5, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 6832810, + "average_time": { + "set": true, + "infinite": false, + "number": 1366562 + } + }, + { + "type_id": 4019, + "message_type": "REQUEST_JOB_READY", + "count": 28, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 13036751, + "average_time": { + "set": true, + "infinite": false, + "number": 465598 + } + }, + { + "type_id": 4003, + "message_type": "REQUEST_SUBMIT_BATCH_JOB", + "count": 40, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 24094071, + "average_time": { + "set": true, + "infinite": false, + "number": 602351 + } + }, + { + "type_id": 10001, + "message_type": "ACCOUNTING_UPDATE_MSG", + "count": 94, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 2286, + "average_time": { + "set": true, + "infinite": false, + "number": 24 + } + }, + { + "type_id": 3014, + "message_type": "REQUEST_DELETE_NODE", + "count": 35, + "queued": 0, + "dropped": 0, + "cycle_last": 0, + "cycle_max": 0, + "total_time": 31360369, + "average_time": { + "set": true, + "infinite": false, + "number": 896010 + } + } + ], + "rpcs_by_user": [ + { + "user_id": 0, + "user": "root", + "count": 22507, + "total_time": 2364143744, + "average_time": { + "set": true, + "infinite": false, + "number": 105040 + } + }, + { + "user_id": 705806, + "user": "guillemram", + "count": 1248, + "total_time": 263479187, + "average_time": { + "set": true, + "infinite": false, + "number": 211121 + } + }, + { + "user_id": 673564, + "user": "seitzer", + "count": 612, + "total_time": 115432095, + "average_time": { + "set": true, + "infinite": false, + "number": 188614 + } + }, + { + "user_id": 205509, + "user": "ruanrms", + "count": 63, + "total_time": 3470756, + "average_time": { + "set": true, + "infinite": false, + "number": 55091 + } + }, + { + "user_id": 717631, + "user": "rrmaura", + "count": 90, + "total_time": 15877823, + "average_time": { + "set": true, + "infinite": false, + "number": 176420 + } + }, + { + "user_id": 657536, + "user": "gaoyuezhou", + "count": 458, + "total_time": 63691863, + "average_time": { + "set": true, + "infinite": false, + "number": 139065 + } + }, + { + "user_id": 705451, + "user": "cyclin", + "count": 1730, + "total_time": 247078993, + "average_time": { + "set": true, + "infinite": false, + "number": 142820 + } + }, + { + "user_id": 696463, + "user": "shimon", + "count": 148, + "total_time": 27280053, + "average_time": { + "set": true, + "infinite": false, + "number": 184324 + } + }, + { + "user_id": 700291, + "user": "shashankrajput", + "count": 1, + "total_time": 146, + "average_time": { + "set": true, + "infinite": false, + "number": 146 + } + }, + { + "user_id": 660884, + "user": "rulin", + "count": 1635, + "total_time": 42202940, + "average_time": { + "set": true, + "infinite": false, + "number": 25812 + } + }, + { + "user_id": 401, + "user": "slurm", + "count": 253, + "total_time": 14225, + "average_time": { + "set": true, + "infinite": false, + "number": 56 + } + }, + { + "user_id": 30000000, + "user": "generatedunixname677578511809727", + "count": 100, + "total_time": 38830225, + "average_time": { + "set": true, + "infinite": false, + "number": 388302 + } + }, + { + "user_id": 710110, + "user": "nelliewu", + "count": 20, + "total_time": 1193380, + "average_time": { + "set": true, + "infinite": false, + "number": 59669 + } + }, + { + "user_id": 369025, + "user": "meghanam", + "count": 12, + "total_time": 7214027, + "average_time": { + "set": true, + "infinite": false, + "number": 601168 + } + }, + { + "user_id": 726327, + "user": "harleenhanspal", + "count": 36, + "total_time": 14953780, + "average_time": { + "set": true, + "infinite": false, + "number": 415382 + } + }, + { + "user_id": 434845, + "user": "xuanyizhao", + "count": 166, + "total_time": 37911197, + "average_time": { + "set": true, + "infinite": false, + "number": 228380 + } + }, + { + "user_id": 605412, + "user": "jcaudet", + "count": 400, + "total_time": 86316815, + "average_time": { + "set": true, + "infinite": false, + "number": 215792 + } + }, + { + "user_id": 701293, + "user": "zhai", + "count": 22, + "total_time": 6463414, + "average_time": { + "set": true, + "infinite": false, + "number": 293791 + } + }, + { + "user_id": 628377, + "user": "aliciagolden", + "count": 2, + "total_time": 282, + "average_time": { + "set": true, + "infinite": false, + "number": 141 + } + }, + { + "user_id": 710761, + "user": "zliu2346", + "count": 667, + "total_time": 166627089, + "average_time": { + "set": true, + "infinite": false, + "number": 249815 + } + }, + { + "user_id": 133511, + "user": "edinan", + "count": 28, + "total_time": 18070588, + "average_time": { + "set": true, + "infinite": false, + "number": 645378 + } + }, + { + "user_id": 717724, + "user": "chleow89", + "count": 90, + "total_time": 18596809, + "average_time": { + "set": true, + "infinite": false, + "number": 206631 + } + }, + { + "user_id": 563377, + "user": "liamrobbins", + "count": 488, + "total_time": 90969175, + "average_time": { + "set": true, + "infinite": false, + "number": 186412 + } + }, + { + "user_id": 139410, + "user": "ballasn", + "count": 21, + "total_time": 4774646, + "average_time": { + "set": true, + "infinite": false, + "number": 227364 + } + }, + { + "user_id": 718528, + "user": "lcherif", + "count": 2448, + "total_time": 513227787, + "average_time": { + "set": true, + "infinite": false, + "number": 209651 + } + }, + { + "user_id": 658431, + "user": "ahavens", + "count": 480, + "total_time": 144110176, + "average_time": { + "set": true, + "infinite": false, + "number": 300229 + } + }, + { + "user_id": 669117, + "user": "alnicolicioiu", + "count": 3, + "total_time": 337838, + "average_time": { + "set": true, + "infinite": false, + "number": 112612 + } + }, + { + "user_id": 674675, + "user": "charlesarnal", + "count": 3, + "total_time": 1806380, + "average_time": { + "set": true, + "infinite": false, + "number": 602126 + } + }, + { + "user_id": 129954, + "user": "shru", + "count": 23, + "total_time": 11488884, + "average_time": { + "set": true, + "infinite": false, + "number": 499516 + } + }, + { + "user_id": 707882, + "user": "balomari", + "count": 20, + "total_time": 9233327, + "average_time": { + "set": true, + "infinite": false, + "number": 461666 + } + }, + { + "user_id": 723290, + "user": "nikitaagw", + "count": 4, + "total_time": 1826248, + "average_time": { + "set": true, + "infinite": false, + "number": 456562 + } + }, + { + "user_id": 570702, + "user": "luccab", + "count": 2, + "total_time": 1090227, + "average_time": { + "set": true, + "infinite": false, + "number": 545113 + } + } + ], + "pending_rpcs": [ + ], + "pending_rpcs_by_hostlist": [ + ] + }, + "meta": { + "plugin": { + "type": "", + "name": "", + "data_parser": "data_parser\/v0.0.42", + "accounting_storage": "accounting_storage\/slurmdbd" + }, + "client": { + "source": "\/dev\/pts\/0", + "user": "yongl", + "group": "yongl" + }, + "command": [ + "sdiag", + "--all" + ], + "slurm": { + "version": { + "major": "24", + "micro": "5", + "minor": "11" + }, + "release": "24.11.5", + "cluster": "fair-sc" + } + }, + "errors": [ + ], + "warnings": [ + ] +} diff --git a/gcm/tests/test_slurm.py b/gcm/tests/test_slurm.py index 8b2c2bc..91345b6 100644 --- a/gcm/tests/test_slurm.py +++ b/gcm/tests/test_slurm.py @@ -1,10 +1,11 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +import json import logging import subprocess from functools import partial from importlib import resources -from unittest.mock import create_autospec, MagicMock +from unittest.mock import create_autospec, MagicMock, patch import pytest from gcm.monitoring.clock import time_to_time_aware @@ -12,6 +13,7 @@ from gcm.monitoring.slurm.derived_cluster import get_derived_cluster +from gcm.schemas.slurm.sdiag import Sdiag from gcm.schemas.slurm.sinfo import Sinfo from gcm.schemas.slurm.sinfo_node import SinfoNode from gcm.schemas.slurm.squeue import JobData @@ -464,3 +466,108 @@ def test_sinfo_structured(dataset: str, expected: Sinfo) -> None: actual = c.sinfo_structured() assert actual == expected + + @staticmethod + @patch.object(SlurmCliClient, "_reset_sdiag_counters") + @patch("clusterscope.slurm_version") + @patch("subprocess.check_output") + def test_parse_sdiag_json( + mock_check_output: MagicMock, + mock_slurm_version: MagicMock, + mock_reset: MagicMock, + ) -> None: + mock_slurm_version.return_value = (23, 2) + + with resources.open_text(data, "sample-sdiag-output.json") as f: + mock_check_output.return_value = f.read() + + c = SlurmCliClient() + result = c.sdiag_structured() + + expected = Sdiag( + server_thread_count=4, + agent_queue_size=5, + agent_count=3, + agent_thread_count=8, + dbd_agent_queue_size=2, + schedule_cycle_max=2788800, + schedule_cycle_mean=1737702, + schedule_cycle_sum=582130236, + schedule_cycle_total=335, + schedule_cycle_per_minute=12, + schedule_queue_length=407, + sdiag_jobs_submitted=504, + sdiag_jobs_started=579, + sdiag_jobs_completed=524, + sdiag_jobs_canceled=20, + sdiag_jobs_failed=0, + sdiag_jobs_pending=20725, + sdiag_jobs_running=3273, + bf_backfilled_jobs=287, + bf_cycle_mean=37143463, + bf_cycle_sum=371434634, + bf_cycle_max=47125449, + bf_queue_len=411, + ) + + assert result == expected + mock_check_output.assert_called_once_with( + ["sdiag", "--all", "--json"], text=True + ) + mock_reset.assert_called_once() + + @staticmethod + @patch.object(SlurmCliClient, "_reset_sdiag_counters") + @patch("clusterscope.slurm_version") + @patch("subprocess.check_output") + def test_parse_sdiag_json_with_missing_fields( + mock_check_output: MagicMock, + mock_slurm_version: MagicMock, + mock_reset: MagicMock, + ) -> None: + mock_slurm_version.return_value = (23, 2) + + minimal_json = json.dumps( + { + "statistics": { + "server_thread_count": 10, + "agent_queue_size": 5, + "agent_count": 3, + "agent_thread_count": 8, + "dbd_agent_queue_size": 2, + } + } + ) + mock_check_output.return_value = minimal_json + + c = SlurmCliClient() + result = c.sdiag_structured() + + expected = Sdiag( + server_thread_count=10, + agent_queue_size=5, + agent_count=3, + agent_thread_count=8, + dbd_agent_queue_size=2, + schedule_cycle_max=None, + schedule_cycle_mean=None, + schedule_cycle_sum=None, + schedule_cycle_total=None, + schedule_cycle_per_minute=None, + schedule_queue_length=None, + sdiag_jobs_submitted=None, + sdiag_jobs_started=None, + sdiag_jobs_completed=None, + sdiag_jobs_canceled=None, + sdiag_jobs_failed=None, + sdiag_jobs_pending=None, + sdiag_jobs_running=None, + bf_backfilled_jobs=None, + bf_cycle_mean=None, + bf_cycle_sum=None, + bf_cycle_max=None, + bf_queue_len=None, + ) + + assert result == expected + mock_reset.assert_called_once() From 74657fe4b2d200fdae2d694e12e6236b449e432a Mon Sep 17 00:00:00 2001 From: yongl user Date: Thu, 5 Feb 2026 17:21:32 +0000 Subject: [PATCH 4/4] Update json for test --- gcm/tests/data/sample-sdiag-output.json | 335 +----------------------- 1 file changed, 8 insertions(+), 327 deletions(-) diff --git a/gcm/tests/data/sample-sdiag-output.json b/gcm/tests/data/sample-sdiag-output.json index 8e1ef3b..bac1ef6 100644 --- a/gcm/tests/data/sample-sdiag-output.json +++ b/gcm/tests/data/sample-sdiag-output.json @@ -536,7 +536,7 @@ "rpcs_by_user": [ { "user_id": 0, - "user": "root", + "user": "a", "count": 22507, "total_time": 2364143744, "average_time": { @@ -546,8 +546,8 @@ } }, { - "user_id": 705806, - "user": "guillemram", + "user_id": 1, + "user": "b", "count": 1248, "total_time": 263479187, "average_time": { @@ -557,327 +557,8 @@ } }, { - "user_id": 673564, - "user": "seitzer", - "count": 612, - "total_time": 115432095, - "average_time": { - "set": true, - "infinite": false, - "number": 188614 - } - }, - { - "user_id": 205509, - "user": "ruanrms", - "count": 63, - "total_time": 3470756, - "average_time": { - "set": true, - "infinite": false, - "number": 55091 - } - }, - { - "user_id": 717631, - "user": "rrmaura", - "count": 90, - "total_time": 15877823, - "average_time": { - "set": true, - "infinite": false, - "number": 176420 - } - }, - { - "user_id": 657536, - "user": "gaoyuezhou", - "count": 458, - "total_time": 63691863, - "average_time": { - "set": true, - "infinite": false, - "number": 139065 - } - }, - { - "user_id": 705451, - "user": "cyclin", - "count": 1730, - "total_time": 247078993, - "average_time": { - "set": true, - "infinite": false, - "number": 142820 - } - }, - { - "user_id": 696463, - "user": "shimon", - "count": 148, - "total_time": 27280053, - "average_time": { - "set": true, - "infinite": false, - "number": 184324 - } - }, - { - "user_id": 700291, - "user": "shashankrajput", - "count": 1, - "total_time": 146, - "average_time": { - "set": true, - "infinite": false, - "number": 146 - } - }, - { - "user_id": 660884, - "user": "rulin", - "count": 1635, - "total_time": 42202940, - "average_time": { - "set": true, - "infinite": false, - "number": 25812 - } - }, - { - "user_id": 401, - "user": "slurm", - "count": 253, - "total_time": 14225, - "average_time": { - "set": true, - "infinite": false, - "number": 56 - } - }, - { - "user_id": 30000000, - "user": "generatedunixname677578511809727", - "count": 100, - "total_time": 38830225, - "average_time": { - "set": true, - "infinite": false, - "number": 388302 - } - }, - { - "user_id": 710110, - "user": "nelliewu", - "count": 20, - "total_time": 1193380, - "average_time": { - "set": true, - "infinite": false, - "number": 59669 - } - }, - { - "user_id": 369025, - "user": "meghanam", - "count": 12, - "total_time": 7214027, - "average_time": { - "set": true, - "infinite": false, - "number": 601168 - } - }, - { - "user_id": 726327, - "user": "harleenhanspal", - "count": 36, - "total_time": 14953780, - "average_time": { - "set": true, - "infinite": false, - "number": 415382 - } - }, - { - "user_id": 434845, - "user": "xuanyizhao", - "count": 166, - "total_time": 37911197, - "average_time": { - "set": true, - "infinite": false, - "number": 228380 - } - }, - { - "user_id": 605412, - "user": "jcaudet", - "count": 400, - "total_time": 86316815, - "average_time": { - "set": true, - "infinite": false, - "number": 215792 - } - }, - { - "user_id": 701293, - "user": "zhai", - "count": 22, - "total_time": 6463414, - "average_time": { - "set": true, - "infinite": false, - "number": 293791 - } - }, - { - "user_id": 628377, - "user": "aliciagolden", - "count": 2, - "total_time": 282, - "average_time": { - "set": true, - "infinite": false, - "number": 141 - } - }, - { - "user_id": 710761, - "user": "zliu2346", - "count": 667, - "total_time": 166627089, - "average_time": { - "set": true, - "infinite": false, - "number": 249815 - } - }, - { - "user_id": 133511, - "user": "edinan", - "count": 28, - "total_time": 18070588, - "average_time": { - "set": true, - "infinite": false, - "number": 645378 - } - }, - { - "user_id": 717724, - "user": "chleow89", - "count": 90, - "total_time": 18596809, - "average_time": { - "set": true, - "infinite": false, - "number": 206631 - } - }, - { - "user_id": 563377, - "user": "liamrobbins", - "count": 488, - "total_time": 90969175, - "average_time": { - "set": true, - "infinite": false, - "number": 186412 - } - }, - { - "user_id": 139410, - "user": "ballasn", - "count": 21, - "total_time": 4774646, - "average_time": { - "set": true, - "infinite": false, - "number": 227364 - } - }, - { - "user_id": 718528, - "user": "lcherif", - "count": 2448, - "total_time": 513227787, - "average_time": { - "set": true, - "infinite": false, - "number": 209651 - } - }, - { - "user_id": 658431, - "user": "ahavens", - "count": 480, - "total_time": 144110176, - "average_time": { - "set": true, - "infinite": false, - "number": 300229 - } - }, - { - "user_id": 669117, - "user": "alnicolicioiu", - "count": 3, - "total_time": 337838, - "average_time": { - "set": true, - "infinite": false, - "number": 112612 - } - }, - { - "user_id": 674675, - "user": "charlesarnal", - "count": 3, - "total_time": 1806380, - "average_time": { - "set": true, - "infinite": false, - "number": 602126 - } - }, - { - "user_id": 129954, - "user": "shru", - "count": 23, - "total_time": 11488884, - "average_time": { - "set": true, - "infinite": false, - "number": 499516 - } - }, - { - "user_id": 707882, - "user": "balomari", - "count": 20, - "total_time": 9233327, - "average_time": { - "set": true, - "infinite": false, - "number": 461666 - } - }, - { - "user_id": 723290, - "user": "nikitaagw", - "count": 4, - "total_time": 1826248, - "average_time": { - "set": true, - "infinite": false, - "number": 456562 - } - }, - { - "user_id": 570702, - "user": "luccab", + "user_id": 2, + "user": "c", "count": 2, "total_time": 1090227, "average_time": { @@ -901,8 +582,8 @@ }, "client": { "source": "\/dev\/pts\/0", - "user": "yongl", - "group": "yongl" + "user": "x", + "group": "x" }, "command": [ "sdiag", @@ -915,7 +596,7 @@ "minor": "11" }, "release": "24.11.5", - "cluster": "fair-sc" + "cluster": "test-cluster" } }, "errors": [