Skip to content

Commit

Permalink
Review comment fixes and added dpu power APIs
Browse files Browse the repository at this point in the history
  • Loading branch information
gpunathilell committed May 1, 2024
1 parent 5a57f70 commit 8ef9836
Show file tree
Hide file tree
Showing 6 changed files with 509 additions and 19 deletions.
27 changes: 12 additions & 15 deletions platform/mellanox/mlnx-platform-api/sonic_platform/chassis.py
Original file line number Diff line number Diff line change
Expand Up @@ -1036,6 +1036,7 @@ class SmartSwitchChassis(Chassis):
def __init__(self):
super(SmartSwitchChassis, self).__init__()
self.module_initialized_count = 0
self.module_name_index_map = {}
self.initialize_modules()

def is_modular_chassis(self):
Expand Down Expand Up @@ -1064,21 +1065,13 @@ def initialize_single_module(self, index):
if not self._module_list[index]:
    from .module import DpuModule
    # Keep a handle on the new instance: get_name() is an instance method,
    # so it must be called on the created module, not on the DpuModule class
    # (calling it on the class raises TypeError for the missing `self`).
    module = DpuModule(index + 1)
    self._module_list[index] = module
    # Record name -> index so get_module_index() can resolve by module name.
    self.module_name_index_map[module.get_name()] = index
    self.module_initialized_count += 1

def initialize_modules(self):
if not self._module_list:
from .module import DpuModule
count = self.get_num_modules()
for index in range(1, count + 1):
self._module_list.append(DpuModule(index))
self.module_initialized_count = count
elif self.module_initialized_count != len(self._module_list):
from .module import DpuModule
for index in range(len(self._module_list)):
if self._module_list[index] is None:
self._module_list[index] = DpuModule(index + 1)
self.module_initialized_count = len(self._module_list)
count = self.get_num_modules()
for index in range(count):
self.initialize_single_module(index=index)

def get_num_modules(self):
"""
Expand Down Expand Up @@ -1119,7 +1112,6 @@ def get_module(self, index):
self.initialize_single_module(index)
return super(SmartSwitchChassis, self).get_module(index)

@utils.default_return(-1)
def get_module_index(self, module_name):
"""
Retrieves module index from the module name
Expand All @@ -1133,7 +1125,12 @@ def get_module_index(self, module_name):
An integer, the index of the ModuleBase object in the module_list
"""
# TODO: Confirm whether Switch is module or not
return int(module_name[len('DPU'):]) - 1
ret_val = -1
try:
ret_val = self.module_name_index_map[module_name]
except KeyError as e:
logger.log_error("Failed to obtain name to index mapping! Check module initialization {}".format(repr(e)))
return ret_val

##############################################
# SmartSwitch methods
Expand Down Expand Up @@ -1193,5 +1190,5 @@ def get_module_dpu_data_port(self, index):
return_string = "N/A"
if return_string == ":":
logger.log_error("Failed to obtain NPU-DPU Port Mapping:"
"Data Not present in PLatform.json")
"Data Not present in Platform.json")
return return_string
16 changes: 12 additions & 4 deletions platform/mellanox/mlnx-platform-api/sonic_platform/device_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import time

from . import utils
from sonic_py_common.logger import Logger

DEVICE_DATA = {
'x86_64-mlnx_msn2700-r0': {
Expand Down Expand Up @@ -138,6 +139,8 @@
}
}

# Global logger class instance
logger = Logger()

class DeviceDataManager:
@classmethod
Expand Down Expand Up @@ -263,10 +266,15 @@ def get_cpld_component_list(cls):

@classmethod
def get_platform_json_data(cls):
from sonic_py_common import device_info
platform_path = device_info.get_path_to_platform_dir()
platform_json_path = os.path.join(platform_path, 'platform.json')
return utils.load_json_file(platform_json_path)
retval = {}
try:
from sonic_py_common import device_info
platform_path = device_info.get_path_to_platform_dir()
platform_json_path = os.path.join(platform_path, 'platform.json')
retval = utils.load_json_file(platform_json_path)
except TypeError as e:
logger.log_error("Failed to obtain Platform.json file data")
return retval

@classmethod
@utils.read_only_cache()
Expand Down
219 changes: 219 additions & 0 deletions platform/mellanox/mlnx-platform-api/sonic_platform/dpuctl_hwm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,219 @@
"""Class Implementation for per DPU functionality"""
import os.path
import time

try:
from .inotify_helper import InotifyHelper
from sonic_py_common.logger import Logger
from sonic_platform import utils
except ImportError as e:
raise ImportError(str(e)) from e

# Directory holding the per-DPU event files read by this module
# (dpu<N>_ready, dpu<N>_shtdn_ready).
EVENT_BASE = "/var/run/hw-management/events/"
# Directory holding the per-DPU control files written by this module
# (dpu<N>_rst, dpu<N>_pwr, dpu<N>_pwr_force, dpu<N>_perst_en).
SYSTEM_BASE = "/var/run/hw-management/system/"
# Directory holding the per-DPU configuration files read by this module
# (dpu<N>_pci_bus_id).
CONFIG_BASE = "/var/run/hw-management/config"
# Module-level logger used for error reporting.
logger = Logger()

# Timeout handed to InotifyHelper.add_watch() while waiting for the
# dpu<N>_shtdn_ready file to read 1 (forwarded as the watcher's timeout_s).
WAIT_FOR_SHTDN = 60
# Timeout handed to InotifyHelper.add_watch() while waiting for the
# dpu<N>_ready file to read 1 (forwarded as the watcher's timeout_s).
WAIT_FOR_DPU_READY = 60


class DataWriter:
    """Context manager that opens an already-existing file for writing.

    The constructor refuses paths that are not regular files, so this
    helper never creates a new file. Entering the context opens the file in
    text write mode (UTF-8) and yields the file object; leaving the context
    closes it.
    """

    def __init__(self, file_name):
        """Remember the target path.

        Raises:
            FileNotFoundError: if ``file_name`` is not an existing file.
        """
        self.file_name = file_name
        self.file_obj = None
        target_exists = os.path.isfile(self.file_name)
        if not target_exists:
            raise FileNotFoundError(f"File {self.file_name} does not exist!")

    def __enter__(self):
        """Open the file for writing and return the file object."""
        handle = open(self.file_name, 'w', encoding="utf-8")
        self.file_obj = handle
        return handle

    def __exit__(self, *exc_info):
        """Close the file; exceptions from the with-body are not suppressed."""
        self.file_obj.close()


class DpuCtlPlat():
    """Per-DPU control operations driven through hw-management files.

    Every operation is implemented by writing "0"/"1" to control files under
    SYSTEM_BASE and by waiting (via InotifyHelper) for event files under
    EVENT_BASE to read 1. Progress is reported with print().
    """

    def __init__(self, dpu_index):
        """Build the control/event file paths for one DPU.

        Args:
            dpu_index: index of the DPU; the file names use ``dpu_index + 1``
                (e.g. index 0 maps to the ``dpu1_*`` files).
        """
        self.index = dpu_index + 1
        self._name = f"dpu{self.index}"
        self.set_go_down_path = os.path.join(SYSTEM_BASE,
                                             f"dpu{self.index}_rst")
        self.set_pwr_path = os.path.join(SYSTEM_BASE,
                                         f"dpu{self.index}_pwr")
        self.set_pwr_f_path = os.path.join(SYSTEM_BASE,
                                           f"dpu{self.index}_pwr_force")
        self.get_dpu_rdy_path = os.path.join(EVENT_BASE,
                                             f"dpu{self.index}_ready")
        self.set_dpu_perst_en_path = os.path.join(SYSTEM_BASE,
                                                  f"dpu{self.index}_perst_en")

    def write_file(self, file_name, content_towrite):
        """Write ``content_towrite`` to ``file_name`` if the file exists.

        Returns:
            True when the write succeeded.

        Raises:
            ValueError, OSError (incl. FileNotFoundError/PermissionError):
                re-raised as the same exception type with the DPU name
                prefixed to the message; the original exception is chained
                as the cause.
        """
        try:
            with DataWriter(file_name) as file_obj:
                file_obj.write(content_towrite)
        except (ValueError, OSError) as file_write_exc:
            # IOError is an alias of OSError, which already covers
            # PermissionError and FileNotFoundError.
            logger.log_error(f'{self.get_name()}:Failed to write '
                             f'{content_towrite} to file {file_name}')
            raise type(file_write_exc)(
                f"{self.get_name()}:{str(file_write_exc)}"
            ) from file_write_exc
        return True

    def get_name(self):
        """Return name of the DPU"""
        return self._name

    def _wait_for_ready(self):
        """Wait up to WAIT_FOR_DPU_READY for the DPU ready file to read 1.

        Returns:
            The value returned by InotifyHelper.add_watch(): the value read
            (1) when the ready file matched, otherwise None.
        """
        ready_watcher = InotifyHelper(self.get_dpu_rdy_path)
        return ready_watcher.add_watch(WAIT_FOR_DPU_READY, 1)

    def dpu_go_down(self):
        """Wait for the DPU shutdown-ready event.

        If the shutdown-ready file does not read 1 within WAIT_FOR_SHTDN,
        the DPU is force powered off and force powered on again.

        Raises:
            FileNotFoundError, PermissionError: from setting up the watch,
                re-raised with the DPU name prefixed to the message.
        """
        get_shtdn_ready_path = os.path.join(EVENT_BASE,
                                            f"dpu{self.index}_shtdn_ready")
        try:
            get_shtdn_inotify = InotifyHelper(get_shtdn_ready_path)
            dpu_shtdn_rdy = get_shtdn_inotify.add_watch(WAIT_FOR_SHTDN, 1)
        except (FileNotFoundError, PermissionError) as inotify_exc:
            raise type(inotify_exc)(
                f"{self.get_name()}:{str(inotify_exc)}"
            ) from inotify_exc
        if dpu_shtdn_rdy is None:
            print(f"{self.get_name()}: Going Down Unsuccessful")
            self.dpu_power_off(forced=True)
            self.dpu_power_on(forced=True)

    def dpu_power_off(self, forced=False):
        """Power off the DPU.

        Args:
            forced: when True, write "1" to the force-power file only;
                otherwise write "1" to the reset file, wait for the DPU to
                go down, then write "1" to the power file.
        """
        print(f"{self.get_name()}: Power off forced={forced}")
        if forced:
            self.write_file(self.set_pwr_f_path, "1")
        else:
            self.write_file(self.set_go_down_path, "1")
            self.dpu_go_down()
            self.write_file(self.set_pwr_path, "1")
        print(f"{self.get_name()}: Power Off complete")

    def dpu_power_on(self, forced=False, count=4):
        """Power on the DPU and wait for it to become ready.

        Args:
            forced: when True, use the force-power file instead of the
                regular power file.
            count: remaining forced attempts; each failed forced attempt
                power-cycles the DPU and retries with ``count - 1``.

        Returns:
            True when the DPU reported ready (on this attempt or a retry),
            False when all forced attempts were exhausted.
        """
        if count < 4:
            print(f"{self.get_name()}: Failed! Retry {4-count}..")
        print(f"{self.get_name()}: Power on forced={forced}")
        if forced:
            self.write_file(self.set_pwr_f_path, "0")
        else:
            self.write_file(self.set_pwr_path, "0")
        if self._wait_for_ready():
            print(f"{self.get_name()}: Power on Successful!")
            return True
        if not forced:
            # A regular power on failed: fall back to a forced power cycle
            # and report its outcome to the caller.
            self.dpu_power_off(forced=True)
            return self.dpu_power_on(forced=True)
        if count > 1:
            time.sleep(1)
            self.dpu_power_off(forced=True)
            # Propagate the retry's result instead of dropping it.
            return self.dpu_power_on(forced=True, count=count-1)
        print(f"{self.get_name()}: Failed Force power on! Exiting")
        return False

    def dpu_reboot_prep(self):
        """Per DPU Reboot API"""
        # TODO: Shutdown SONiC on DPU -> SSH Connection -> Shutdown script

    def dpu_burn_fw(self, path):
        """Per DPU Firmware Update API (currently a no-op placeholder)."""
        # TODO: Uncomment to install the bfb Image
        # if not os.path.isfile(path):
        #     raise FileNotFoundError(f"{self.get_name()}:File "
        #                             f"{self.file_name} does not exist!")
        # cmd = ["sonic_bfb_install", "-b", path, "-r", "rshim"+str(self.index)]
        # try:
        #     cmd_output = subprocess.check_output(cmd)
        # except subprocess.CalledProcessError as cmd_exc:
        #     print("Installation failed! code",
        #           cmd_exc.returncode,
        #           cmd_exc.output)

    def dpu_pci_scan(self):
        """Trigger a PCI rescan by writing "1" to /sys/bus/pci/rescan."""
        set_pci_scan = "/sys/bus/pci/rescan"
        self.write_file(set_pci_scan, "1")

    def dpu_pci_remove(self):
        """Remove the DPU's PCI device.

        Reads the PCI id from ``dpu<N>_pci_bus_id`` under CONFIG_BASE and
        writes "1" to that device's ``remove`` file.
        """
        get_dpu_pci_path = os.path.join(CONFIG_BASE,
                                        f"dpu{self.index}_pci_bus_id")
        pci_string = utils.read_str_from_file(get_dpu_pci_path,
                                              raise_exception=True)
        get_pci_dev_path = f"/sys/bus/pci/devices/{pci_string}/remove"
        self.write_file(get_pci_dev_path, "1")

    def dpu_fw_upgrade(self, path):
        """Per DPU Firmware Upgrade API.

        Burns the firmware, removes the PCI device, toggles the perst_en
        file around a go-down wait, waits for the DPU to become ready
        (force power-cycling it if it does not), then rescans PCI.
        """
        print(f"{self.get_name()}: FW upgrade")
        self.dpu_burn_fw(path)
        self.dpu_reboot_prep()
        self.dpu_pci_remove()
        self.write_file(self.set_dpu_perst_en_path, "0")
        self.dpu_go_down()
        self.write_file(self.set_dpu_perst_en_path, "1")
        if not self._wait_for_ready():
            self.dpu_power_off(forced=True)
            self.dpu_power_on(forced=True)
        self.dpu_pci_scan()
        print(f"{self.get_name()}: FW upgrade complete")

    def dpu_reboot(self):
        """Per DPU Reboot API.

        Removes the PCI device, toggles the reset file around a go-down
        wait, waits for the DPU to become ready (force power-cycling it if
        it does not), then rescans PCI.
        """
        print(f"{self.get_name()}: Reboot")
        self.dpu_reboot_prep()
        self.dpu_pci_remove()
        self.write_file(self.set_go_down_path, "1")
        self.dpu_go_down()
        self.write_file(self.set_go_down_path, "0")
        if not self._wait_for_ready():
            self.dpu_power_off(forced=True)
            self.dpu_power_on(forced=True)
        self.dpu_pci_scan()
        print(f"{self.get_name()}: Reboot complete")

def call_dpu_fw_upgrade(obj, path):
    """Run ``obj.dpu_fw_upgrade(path)``, reporting any failure on stdout.

    Exceptions are never propagated; the exception type name and message
    are printed instead.
    """
    try:
        obj.dpu_fw_upgrade(path)
    except Exception as exc:
        failure_kind = type(exc).__name__
        print(f"An error occurred: {failure_kind} - {exc}")


def call_dpu_reset(obj):
    """Run ``obj.dpu_reboot()``, reporting any failure on stdout.

    Exceptions are never propagated; the exception type name and message
    are printed instead.
    """
    try:
        obj.dpu_reboot()
    except Exception as exc:
        failure_kind = type(exc).__name__
        print(f"An error occurred: {failure_kind} - {exc}")


def call_dpu_power_on(obj, force):
    """Run ``obj.dpu_power_on(force)``, reporting any failure on stdout.

    Exceptions are never propagated; the exception type name and message
    are printed instead. The result of the power-on call is not returned.
    """
    try:
        obj.dpu_power_on(force)
    except Exception as exc:
        failure_kind = type(exc).__name__
        print(f"An error occurred: {failure_kind} - {exc}")


def call_dpu_power_off(obj, force):
    """Run ``obj.dpu_power_off(force)``, reporting any failure on stdout.

    Exceptions are never propagated; the exception type name and message
    are printed instead.
    """
    try:
        obj.dpu_power_off(force)
    except Exception as exc:
        failure_kind = type(exc).__name__
        print(f"An error occurred: {failure_kind} - {exc}")
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
"""Helper code for Inotify Implementation for reading file until timeout"""
import os
import errno
import inotify.adapters

try:
from sonic_py_common.logger import Logger
from sonic_platform import utils
except ImportError as e:
raise ImportError(str(e) + '- required module not found') from e

logger = Logger()


class InotifyHelper:
    """Helper around an inotify watcher for polling a single file's value."""

    def __init__(self, file_path):
        """Create the inotify adapter and validate the watched path.

        Raises:
            AssertionError: if the inotify adapter object is falsy.
            FileNotFoundError: if ``file_path`` does not exist.
        """
        self.file_path = file_path
        self.inotify_obj = inotify.adapters.Inotify()
        if not self.inotify_obj:
            logger.log_error("INOTIFY adapter error!")
            raise AssertionError("INOTIFY is not present!")
        if os.path.exists(self.file_path):
            return
        logger.log_error(f"{self.file_path} does not exist")
        raise FileNotFoundError(errno.ENOENT,
                                os.strerror(errno.ENOENT),
                                self.file_path)

    def add_watch(self, timeout, expected_value):
        """Wait for the file to hold ``expected_value``.

        Watches the file for IN_CLOSE_WRITE events; after each event the
        file is read as an integer and returned if it matches. When the
        event generator finishes (``timeout`` is forwarded as its
        ``timeout_s``), the file is read one last time.

        Returns:
            The integer read from the file when it equals
            ``expected_value``, otherwise None.
        """
        self.inotify_obj.add_watch(self.file_path,
                                   mask=inotify.constants.IN_CLOSE_WRITE)
        write_events = self.inotify_obj.event_gen(timeout_s=timeout,
                                                  yield_nones=False)
        for _ in write_events:
            current = utils.read_int_from_file(self.file_path,
                                               raise_exception=True)
            if current == expected_value:
                return current
        # No matching write was observed: accept the value currently in
        # the file if it already equals the expected one.
        final = utils.read_int_from_file(self.file_path,
                                         raise_exception=True)
        return final if final == expected_value else None
30 changes: 30 additions & 0 deletions platform/mellanox/mlnx-platform-api/tests/dpupwr_inputs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Input Data for dpuctl tests"""
# Expected stdout per dpuctl command when the hw-management files are absent.
testData = {
    'power_off': [
        # forced power off
        "dpu1: Power off forced=True\n"
        "An error occurred: FileNotFoundError - "
        "dpu1:File /var/run/hw-management/system/dpu1_pwr_force "
        "does not exist!\n",
        # regular power off
        "dpu1: Power off forced=False\n"
        "An error occurred: FileNotFoundError - "
        "dpu1:File /var/run/hw-management/system/dpu1_rst "
        "does not exist!\n",
    ],
    'power_on': [
        # forced power on
        "dpu1: Power on forced=True\n"
        "An error occurred: FileNotFoundError - "
        "dpu1:File /var/run/hw-management/system/dpu1_pwr_force "
        "does not exist!\n",
        # regular power on
        "dpu1: Power on forced=False\n"
        "An error occurred: FileNotFoundError - "
        "dpu1:File /var/run/hw-management/system/dpu1_pwr "
        "does not exist!\n",
    ],
    'reset': [
        "dpu1: Reboot\n"
        "An error occurred: FileNotFoundError - "
        "dpu1:File /sys/bus/pci/devices/dpu1_pciid/remove "
        "does not exist!\n",
    ],
    'fw_upgrade': [
        "dpu1: FW upgrade\n"
        "An error occurred: FileNotFoundError - "
        "dpu1:File /sys/bus/pci/devices/dpu1_id/remove "
        "does not exist!\n",
    ],
}
Loading

0 comments on commit 8ef9836

Please sign in to comment.