From baf7f133d1376846a5322f9eeff2f99d2877f80e Mon Sep 17 00:00:00 2001 From: Sonic Build Admin Date: Mon, 12 Jan 2026 17:56:08 +0000 Subject: [PATCH] Added the DPU midplane precheck and online postcheck for the minigraph ### Description of PR DPU control plane up wait for DPU minigraph Summary: Before the DPU config is deployed, DPUs need to start with the DPU mid plane up and default config available. Also the DPUs need to be fully online after the configuration. ### Type of change - [ ] Bug fix - [X] Testbed and Framework(new/improvement) - [ ] New Test case - [ ] Skipped for non-supported platforms - [ ] Test case improvement ### Back port request - [ ] 202205 - [ ] 202305 - [ ] 202311 - [ ] 202405 - [ ] 202411 - [X] 202505 ### Approach #### What is the motivation for this PR? Random smartswitch minigraph deployment failures #### How did you do it? Added the wait for the DPU mid plane up and the default config file. Added the post configuration check for the DPUs to be online. #### How did you verify/test it? Smartswitch light mode deployment #### Any platform specific information? Smartswitch light mode #### Supported testbed topology if it's a new test case? ### Documentation --- ansible/library/load_extra_dpu_config.py | 81 +++++++++++++++++++++++- 1 file changed, 79 insertions(+), 2 deletions(-) diff --git a/ansible/library/load_extra_dpu_config.py b/ansible/library/load_extra_dpu_config.py index 06189210b1..deceb372cc 100644 --- a/ansible/library/load_extra_dpu_config.py +++ b/ansible/library/load_extra_dpu_config.py @@ -1,9 +1,9 @@ #!/usr/bin/python - import paramiko import os import time from ansible.module_utils.basic import AnsibleModule +from ansible.module_utils.misc_utils import wait_for_path from ansible.module_utils.smartswitch_utils import smartswitch_hwsku_config DPU_HOST_IP_BASE = "169.254.200.{}" @@ -20,7 +20,7 @@ RETRY_DELAY = 60 # sec # Set to 1.0 for requiring all DPUs to succeed (after HW issues are resolved) -SUCCESS_THRESHOLD = 0.5 +SUCCESS_THRESHOLD = 1.0 class LoadExtraDpuConfigModule(object): @@ -47,6 +47,13 @@ def __init__(self): except KeyError: self.module.fail_json(msg="No DPU configuration found for hwsku: {}".format(self.hwsku)) + def wait_for_dpu_path(self, ssh, dpu_ip, path_to_check): + try: + wait_for_path(ssh, dpu_ip, path_to_check, empty_ok=False, tries=MAX_RETRIES, delay=RETRY_DELAY) + except FileNotFoundError: + return False + return True + def connect_to_dpu(self, dpu_ip): """Establish an SSH connection to the DPU with retry""" retry_count = 0 @@ -109,10 +116,20 @@ def configure_dpus(self): if SUCCESS_THRESHOLD < 1.0 and required_success_count == 0: required_success_count = 1 + # Wait for the DPU control plane to be up for at least required_success_count DPUs + if not self.wait_for_dpu_count_mid_plane_up(required_success_count): + self.module.warn( + "DPU control planes are not ready on switch (required {}) after {} retries." + .format(required_success_count, MAX_RETRIES) + ) + self.module.log("Configuring {} DPUs, requiring at least {} successful configurations".format( self.dpu_num, required_success_count)) for i in range(0, self.dpu_num): + # Update the extra dpu config file + self.module.run_command("sed -i 's/18.*202/18.{}.202/g' {}".format(i, SRC_DPU_CONFIG_FILE)) + dpu_ip = DPU_HOST_IP_BASE.format(i + 1) self.module.log("Attempting to configure DPU {} at {}".format(i + 1, dpu_ip)) @@ -126,6 +143,7 @@ def configure_dpus(self): try: # Attempt each step and track success if (self.transfer_to_dpu(ssh, dpu_ip) and + self.wait_for_dpu_path(ssh, dpu_ip, DEFAULT_CONFIG_FILE) and self.execute_command(ssh, dpu_ip, GEN_FULL_CONFIG_CMD) and self.execute_command(ssh, dpu_ip, CONFIG_RELOAD_CMD) and self.execute_command(ssh, dpu_ip, CONFIG_SAVE_CMD) and @@ -155,8 +173,67 @@ def configure_dpus(self): "Failures: {}".format( required_success_count, success_count, self.dpu_num, failure_count)) + self.module.log("Checking the number of DPUs fully online") + if not self.wait_for_dpu_count_fully_online(required_success_count): + self.module.fail_json( + msg="DPUs are not fully online (required {}) after {} retries.".format( + required_success_count, + MAX_RETRIES, + ) + ) + return success_count, failure_count + def get_dpu_online_mid_plane_up_counts(self): + # returns the number of DPUs fully online and midplane up count + # root@mtvr-bobcat-03:~# show system-health dpu all + # Name Oper-Status State-Detail State-Value Time Reason + # ------ -------------- ----------------------- ------------- ------------------------------- -------- + # DPU0 Partial Online dpu_midplane_link_state up Wed Nov 19 03:00:35 AM UTC 2025 + # dpu_control_plane_state down Wed Nov 19 03:15:58 AM UTC 2025 + # dpu_data_plane_state down Wed Nov 19 03:15:58 AM UTC 2025 + # DPU1 Online dpu_midplane_link_state up Wed Nov 19 03:00:55 AM UTC 2025 + # dpu_control_plane_state up Wed Nov 19 03:02:12 AM UTC 2025 + # dpu_data_plane_state up Wed Nov 19 03:02:12 AM UTC 2025 + # DPU2 Online dpu_midplane_link_state up Wed Nov 19 03:00:55 AM UTC 2025 + # dpu_control_plane_state up Wed Nov 19 03:02:20 AM UTC 2025 + # dpu_data_plane_state up Wed Nov 19 03:02:20 AM UTC 2025 + # DPU3 Online dpu_midplane_link_state up Wed Nov 19 03:00:25 AM UTC 2025 + # dpu_control_plane_state up Wed Nov 19 03:01:46 AM UTC 2025 + # dpu_data_plane_state up Wed Nov 19 03:01:46 AM UTC 2025 + cmd = "show system-health dpu all" + rc, out, err = self.module.run_command(cmd) + if rc != 0: + self.module.warn("Failed to execute DPU system health command: " + "cmd: {}, rc: {}, Err: {}, Out: {}".format(cmd, rc, err, out)) + return 0, 0 + dpu_online_count = 0 + dpu_midplane_up_count = 0 + for line in out.split("\n"): + if "up" in line and "dpu_midplane_link_state" in line: + dpu_midplane_up_count += 1 + if line.startswith("DPU") and "Online" in line and "Partial" not in line: + dpu_online_count += 1 + return dpu_online_count, dpu_midplane_up_count + + def wait_for_dpu_count_mid_plane_up(self, required_count): + retry_count = 0 + while retry_count < MAX_RETRIES: + if self.get_dpu_online_mid_plane_up_counts()[1] >= required_count: + return True + time.sleep(RETRY_DELAY) + retry_count += 1 + return False + + def wait_for_dpu_count_fully_online(self, required_count): + retry_count = 0 + while retry_count < MAX_RETRIES: + if self.get_dpu_online_mid_plane_up_counts()[0] >= required_count: + return True + time.sleep(RETRY_DELAY) + retry_count += 1 + return False + def run(self): success_count, failure_count = self.configure_dpus()