Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 79 additions & 2 deletions ansible/library/load_extra_dpu_config.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#!/usr/bin/python

import paramiko
import os
import time
from ansible.module_utils.basic import AnsibleModule
from ansible.module_utils.misc_utils import wait_for_path
from ansible.module_utils.smartswitch_utils import smartswitch_hwsku_config

DPU_HOST_IP_BASE = "169.254.200.{}"
Expand All @@ -20,7 +20,7 @@
RETRY_DELAY = 60 # sec

# Set to 1.0 for requiring all DPUs to succeed (after HW issues are resolved)
SUCCESS_THRESHOLD = 0.5
SUCCESS_THRESHOLD = 1.0


class LoadExtraDpuConfigModule(object):
Expand All @@ -47,6 +47,13 @@ def __init__(self):
except KeyError:
self.module.fail_json(msg="No DPU configuration found for hwsku: {}".format(self.hwsku))

def wait_for_dpu_path(self, ssh, dpu_ip, path_to_check):
try:
wait_for_path(ssh, dpu_ip, path_to_check, empty_ok=False, tries=MAX_RETRIES, delay=RETRY_DELAY)
except FileNotFoundError:
return False
return True

def connect_to_dpu(self, dpu_ip):
"""Establish an SSH connection to the DPU with retry"""
retry_count = 0
Expand Down Expand Up @@ -109,10 +116,20 @@ def configure_dpus(self):
if SUCCESS_THRESHOLD < 1.0 and required_success_count == 0:
required_success_count = 1

# Wait for the DPU control plane to be up for at least required_success_count DPUs
if not self.wait_for_dpu_count_mid_plane_up(required_success_count):
self.module.warn(
"DPU control planes are not ready on switch (required {}) after {} retries."
.format(required_success_count, MAX_RETRIES)
)

self.module.log("Configuring {} DPUs, requiring at least {} successful configurations".format(
self.dpu_num, required_success_count))

for i in range(0, self.dpu_num):
# Update the extra dpu config file
self.module.run_command("sed -i 's/18.*202/18.{}.202/g' {}".format(i, SRC_DPU_CONFIG_FILE))

dpu_ip = DPU_HOST_IP_BASE.format(i + 1)

self.module.log("Attempting to configure DPU {} at {}".format(i + 1, dpu_ip))
Expand All @@ -126,6 +143,7 @@ def configure_dpus(self):
try:
# Attempt each step and track success
if (self.transfer_to_dpu(ssh, dpu_ip) and
self.wait_for_dpu_path(ssh, dpu_ip, DEFAULT_CONFIG_FILE) and
self.execute_command(ssh, dpu_ip, GEN_FULL_CONFIG_CMD) and
self.execute_command(ssh, dpu_ip, CONFIG_RELOAD_CMD) and
self.execute_command(ssh, dpu_ip, CONFIG_SAVE_CMD) and
Expand Down Expand Up @@ -155,8 +173,67 @@ def configure_dpus(self):
"Failures: {}".format(
required_success_count, success_count, self.dpu_num, failure_count))

self.module.log("Checking the number of DPUs fully online")
if not self.wait_for_dpu_count_fully_online(required_success_count):
self.module.fail_json(
msg="DPUs are not fully online (required {}) after {} retries.".format(
required_success_count,
MAX_RETRIES,
)
)

return success_count, failure_count

def get_dpu_online_mid_plane_up_counts(self):
# returns the number of DPUs fully online and midplane up count
# root@mtvr-bobcat-03:~# show system-health dpu all
# Name Oper-Status State-Detail State-Value Time Reason
# ------ -------------- ----------------------- ------------- ------------------------------- --------
# DPU0 Partial Online dpu_midplane_link_state up Wed Nov 19 03:00:35 AM UTC 2025
# dpu_control_plane_state down Wed Nov 19 03:15:58 AM UTC 2025
# dpu_data_plane_state down Wed Nov 19 03:15:58 AM UTC 2025
# DPU1 Online dpu_midplane_link_state up Wed Nov 19 03:00:55 AM UTC 2025
# dpu_control_plane_state up Wed Nov 19 03:02:12 AM UTC 2025
# dpu_data_plane_state up Wed Nov 19 03:02:12 AM UTC 2025
# DPU2 Online dpu_midplane_link_state up Wed Nov 19 03:00:55 AM UTC 2025
# dpu_control_plane_state up Wed Nov 19 03:02:20 AM UTC 2025
# dpu_data_plane_state up Wed Nov 19 03:02:20 AM UTC 2025
# DPU3 Online dpu_midplane_link_state up Wed Nov 19 03:00:25 AM UTC 2025
# dpu_control_plane_state up Wed Nov 19 03:01:46 AM UTC 2025
# dpu_data_plane_state up Wed Nov 19 03:01:46 AM UTC 2025
cmd = "show system-health dpu all"
rc, out, err = self.module.run_command(cmd)
if rc != 0:
self.module.warn("Failed to execute DPU system health command: "
"cmd: {}, rc: {}, Err: {}, Out: {}".format(cmd, rc, err, out))
return 0, 0
dpu_online_count = 0
dpu_midplane_up_count = 0
for line in out.split("\n"):
if "up" in line and "dpu_midplane_link_state" in line:
dpu_midplane_up_count += 1
if line.startswith("DPU") and "Online" in line and "Partial" not in line:
dpu_online_count += 1
return dpu_online_count, dpu_midplane_up_count

def wait_for_dpu_count_mid_plane_up(self, required_count):
retry_count = 0
while retry_count < MAX_RETRIES:
if self.get_dpu_online_mid_plane_up_counts()[1] >= required_count:
return True
time.sleep(RETRY_DELAY)
retry_count += 1
return False

def wait_for_dpu_count_fully_online(self, required_count):
retry_count = 0
while retry_count < MAX_RETRIES:
if self.get_dpu_online_mid_plane_up_counts()[0] >= required_count:
return True
time.sleep(RETRY_DELAY)
retry_count += 1
return False

def run(self):
success_count, failure_count = self.configure_dpus()

Expand Down