diff --git a/ansible/library/load_extra_dpu_config.py b/ansible/library/load_extra_dpu_config.py index 06189210b1..deceb372cc 100644 --- a/ansible/library/load_extra_dpu_config.py +++ b/ansible/library/load_extra_dpu_config.py @@ -1,9 +1,9 @@ #!/usr/bin/python - import paramiko import os import time from ansible.module_utils.basic import AnsibleModule +from ansible.module_utils.misc_utils import wait_for_path from ansible.module_utils.smartswitch_utils import smartswitch_hwsku_config DPU_HOST_IP_BASE = "169.254.200.{}" @@ -20,7 +20,7 @@ RETRY_DELAY = 60 # sec # Set to 1.0 for requiring all DPUs to succeed (after HW issues are resolved) -SUCCESS_THRESHOLD = 0.5 +SUCCESS_THRESHOLD = 1.0 class LoadExtraDpuConfigModule(object): @@ -47,6 +47,13 @@ def __init__(self): except KeyError: self.module.fail_json(msg="No DPU configuration found for hwsku: {}".format(self.hwsku)) + def wait_for_dpu_path(self, ssh, dpu_ip, path_to_check): + try: + wait_for_path(ssh, dpu_ip, path_to_check, empty_ok=False, tries=MAX_RETRIES, delay=RETRY_DELAY) + except FileNotFoundError: + return False + return True + def connect_to_dpu(self, dpu_ip): """Establish an SSH connection to the DPU with retry""" retry_count = 0 @@ -109,10 +116,20 @@ def configure_dpus(self): if SUCCESS_THRESHOLD < 1.0 and required_success_count == 0: required_success_count = 1 + # Wait for the DPU control plane to be up for at least required_success_count DPUs + if not self.wait_for_dpu_count_mid_plane_up(required_success_count): + self.module.warn( + "DPU control planes are not ready on switch (required {}) after {} retries." + .format(required_success_count, MAX_RETRIES) + ) + self.module.log("Configuring {} DPUs, requiring at least {} successful configurations".format( self.dpu_num, required_success_count)) for i in range(0, self.dpu_num): + # Update the extra dpu config file + self.module.run_command("sed -i 's/18.*202/18.{}.202/g' {}".format(i, SRC_DPU_CONFIG_FILE)) + dpu_ip = DPU_HOST_IP_BASE.format(i + 1) self.module.log("Attempting to configure DPU {} at {}".format(i + 1, dpu_ip)) @@ -126,6 +143,7 @@ def configure_dpus(self): try: # Attempt each step and track success if (self.transfer_to_dpu(ssh, dpu_ip) and + self.wait_for_dpu_path(ssh, dpu_ip, DEFAULT_CONFIG_FILE) and self.execute_command(ssh, dpu_ip, GEN_FULL_CONFIG_CMD) and self.execute_command(ssh, dpu_ip, CONFIG_RELOAD_CMD) and self.execute_command(ssh, dpu_ip, CONFIG_SAVE_CMD) and @@ -155,8 +173,67 @@ def configure_dpus(self): "Failures: {}".format( required_success_count, success_count, self.dpu_num, failure_count)) + self.module.log("Checking the number of DPUs fully online") + if not self.wait_for_dpu_count_fully_online(required_success_count): + self.module.fail_json( + msg="DPUs are not fully online (required {}) after {} retries.".format( + required_success_count, + MAX_RETRIES, + ) + ) + return success_count, failure_count + def get_dpu_online_mid_plane_up_counts(self): + # returns the number of DPUs fully online and midplane up count + # root@mtvr-bobcat-03:~# show system-health dpu all + # Name Oper-Status State-Detail State-Value Time Reason + # ------ -------------- ----------------------- ------------- ------------------------------- -------- + # DPU0 Partial Online dpu_midplane_link_state up Wed Nov 19 03:00:35 AM UTC 2025 + # dpu_control_plane_state down Wed Nov 19 03:15:58 AM UTC 2025 + # dpu_data_plane_state down Wed Nov 19 03:15:58 AM UTC 2025 + # DPU1 Online dpu_midplane_link_state up Wed Nov 19 03:00:55 AM UTC 2025 + # dpu_control_plane_state up Wed Nov 19 03:02:12 AM UTC 2025 + # dpu_data_plane_state up Wed Nov 19 03:02:12 AM UTC 2025 + # DPU2 Online dpu_midplane_link_state up Wed Nov 19 03:00:55 AM UTC 2025 + # dpu_control_plane_state up Wed Nov 19 03:02:20 AM UTC 2025 + # dpu_data_plane_state up Wed Nov 19 03:02:20 AM UTC 2025 + # DPU3 Online dpu_midplane_link_state up Wed Nov 19 03:00:25 AM UTC 2025 + # dpu_control_plane_state up Wed Nov 19 03:01:46 AM UTC 2025 + # dpu_data_plane_state up Wed Nov 19 03:01:46 AM UTC 2025 + cmd = "show system-health dpu all" + rc, out, err = self.module.run_command(cmd) + if rc != 0: + self.module.warn("Failed to execute DPU system health command: " + "cmd: {}, rc: {}, Err: {}, Out: {}".format(cmd, rc, err, out)) + return 0, 0 + dpu_online_count = 0 + dpu_midplane_up_count = 0 + for line in out.split("\n"): + if "up" in line and "dpu_midplane_link_state" in line: + dpu_midplane_up_count += 1 + if line.startswith("DPU") and "Online" in line and "Partial" not in line: + dpu_online_count += 1 + return dpu_online_count, dpu_midplane_up_count + + def wait_for_dpu_count_mid_plane_up(self, required_count): + retry_count = 0 + while retry_count < MAX_RETRIES: + if self.get_dpu_online_mid_plane_up_counts()[1] >= required_count: + return True + time.sleep(RETRY_DELAY) + retry_count += 1 + return False + + def wait_for_dpu_count_fully_online(self, required_count): + retry_count = 0 + while retry_count < MAX_RETRIES: + if self.get_dpu_online_mid_plane_up_counts()[0] >= required_count: + return True + time.sleep(RETRY_DELAY) + retry_count += 1 + return False + def run(self): success_count, failure_count = self.configure_dpus()