From 2c5fd51a7f753d67b5bc1f2c4121849fa398c717 Mon Sep 17 00:00:00 2001 From: mssonicbld <79238446+mssonicbld@users.noreply.github.com> Date: Fri, 26 Sep 2025 08:12:53 +0800 Subject: [PATCH] [Smartswitch] Restart sensord with lock (#602) #### Description #### Motivation and Context On startup of pmon in dark mode on switch, there are four processes spawned to power off DPUs, if all are executed in parallel there are four sensord processes running (since service restart commands are not queued). Since we perform handle_sensor_removal on dark mode start, the solution is to have a file lock to make sure only one module has control to restart sensord each time, queueing the sensord restart requests so that we have a single process running always, irrespective of number of restarts #### How Has This Been Tested? Check the number of sensord processes starting before fix: ``` docker exec -it pmon ps -x|grep sensord 170 ? Ss 0:00 /usr/sbin/sensord -f daemon 171 ? Ss 0:00 /usr/sbin/sensord -f daemon 172 ? Ss 0:00 /usr/sbin/sensord -f daemon 173 ? Ss 0:00 /usr/sbin/sensord -f daemon ``` after Addition of fix, only one process is found ``` root@sonic:/home/admin# docker exec -it pmon ps -x|grep sensord 189 ? Ss 0:00 /usr/sbin/sensord -f daemon ``` #### Additional Information (Optional) --- sonic_platform_base/module_base.py | 30 ++++++++++++++---- tests/module_base_test.py | 51 +++++++++++++++++++++++++++--- 2 files changed, 70 insertions(+), 11 deletions(-) diff --git a/sonic_platform_base/module_base.py b/sonic_platform_base/module_base.py index d4d0ba4..bd33357 100644 --- a/sonic_platform_base/module_base.py +++ b/sonic_platform_base/module_base.py @@ -27,6 +27,7 @@ class ModuleBase(device_base.DeviceBase): # Device type definition. Note, this is a constant. DEVICE_TYPE = "module" PCI_OPERATION_LOCK_FILE_PATH = "/var/lock/{}_pci.lock" + SENSORD_OPERATION_LOCK_FILE_PATH = "/var/lock/sensord.lock" # Possible card types for modular chassis MODULE_TYPE_SUPERVISOR = "SUPERVISOR" @@ -96,9 +97,8 @@ def __init__(self): self._asic_list = [] @contextlib.contextmanager - def _pci_operation_lock(self): - """File-based lock for PCI operations using flock""" - lock_file_path = self.PCI_OPERATION_LOCK_FILE_PATH.format(self.get_name()) + def _file_operation_lock(self, lock_file_path): + """Common file-based lock for operations using flock""" with open(lock_file_path, 'w') as f: try: fcntl.flock(f.fileno(), fcntl.LOCK_EX) @@ -106,6 +106,20 @@ def _pci_operation_lock(self): finally: fcntl.flock(f.fileno(), fcntl.LOCK_UN) + @contextlib.contextmanager + def _pci_operation_lock(self): + """File-based lock for PCI operations using flock""" + lock_file_path = self.PCI_OPERATION_LOCK_FILE_PATH.format(self.get_name()) + with self._file_operation_lock(lock_file_path): + yield + + @contextlib.contextmanager + def _sensord_operation_lock(self): + """File-based lock for sensord operations using flock""" + lock_file_path = self.SENSORD_OPERATION_LOCK_FILE_PATH + with self._file_operation_lock(lock_file_path): + yield + def get_base_mac(self): """ Retrieves the base MAC address for the module @@ -791,8 +805,9 @@ def handle_sensor_removal(self): shutil.copy2(source_file, target_file) - # Restart sensord - os.system("service sensord restart") + # Restart sensord with lock + with self._sensord_operation_lock(): + os.system("service sensord restart") return True except Exception as e: @@ -818,8 +833,9 @@ def handle_sensor_addition(self): # Remove the file os.remove(target_file) - # Restart sensord - os.system("service sensord restart") + # Restart sensord with lock + with self._sensord_operation_lock(): + os.system("service sensord restart") return True except Exception as e: diff --git a/tests/module_base_test.py b/tests/module_base_test.py index b041f43..025849e 100644 --- a/tests/module_base_test.py +++ b/tests/module_base_test.py @@ -87,6 +87,23 @@ def test_pci_entry_state_db(self): mock_connector.hset.side_effect = Exception("DB Error") module.pci_entry_state_db("0000:00:00.0", "detaching") + def test_file_operation_lock(self): + module = ModuleBase() + mock_file = MockFile() + + with patch('builtins.open', return_value=mock_file) as mock_file_open, \ + patch('fcntl.flock') as mock_flock, \ + patch('os.makedirs') as mock_makedirs: + + with module._file_operation_lock("/var/lock/test.lock"): + mock_flock.assert_called_with(123, fcntl.LOCK_EX) + + mock_flock.assert_has_calls([ + call(123, fcntl.LOCK_EX), + call(123, fcntl.LOCK_UN) + ]) + assert mock_file.fileno_called + def test_pci_operation_lock(self): module = ModuleBase() mock_file = MockFile() @@ -105,6 +122,24 @@ def test_pci_operation_lock(self): ]) assert mock_file.fileno_called + def test_sensord_operation_lock(self): + module = ModuleBase() + mock_file = MockFile() + + with patch('builtins.open', return_value=mock_file) as mock_file_open, \ + patch('fcntl.flock') as mock_flock, \ + patch.object(module, 'get_name', return_value="DPU0"), \ + patch('os.makedirs') as mock_makedirs: + + with module._sensord_operation_lock(): + mock_flock.assert_called_with(123, fcntl.LOCK_EX) + + mock_flock.assert_has_calls([ + call(123, fcntl.LOCK_EX), + call(123, fcntl.LOCK_UN) + ]) + assert mock_file.fileno_called + def test_handle_pci_removal(self): module = ModuleBase() @@ -141,19 +176,23 @@ def test_handle_sensor_removal(self): with patch.object(module, 'get_name', return_value="DPU0"), \ patch('os.path.exists', return_value=True), \ patch('shutil.copy2') as mock_copy, \ - patch('os.system') as mock_system: + patch('os.system') as mock_system, \ + patch.object(module, '_sensord_operation_lock') as mock_lock: assert module.handle_sensor_removal() is True mock_copy.assert_called_once_with("/usr/share/sonic/platform/module_sensors_ignore_conf/ignore_sensors_DPU0.conf", "/etc/sensors.d/ignore_sensors_DPU0.conf") mock_system.assert_called_once_with("service sensord restart") + mock_lock.assert_called_once() with patch.object(module, 'get_name', return_value="DPU0"), \ patch('os.path.exists', return_value=False), \ patch('shutil.copy2') as mock_copy, \ - patch('os.system') as mock_system: + patch('os.system') as mock_system, \ + patch.object(module, '_sensord_operation_lock') as mock_lock: assert module.handle_sensor_removal() is True mock_copy.assert_not_called() mock_system.assert_not_called() + mock_lock.assert_not_called() with patch.object(module, 'get_name', return_value="DPU0"), \ patch('os.path.exists', return_value=True), \ @@ -166,18 +205,22 @@ def test_handle_sensor_addition(self): with patch.object(module, 'get_name', return_value="DPU0"), \ patch('os.path.exists', return_value=True), \ patch('os.remove') as mock_remove, \ - patch('os.system') as mock_system: + patch('os.system') as mock_system, \ + patch.object(module, '_sensord_operation_lock') as mock_lock: assert module.handle_sensor_addition() is True mock_remove.assert_called_once_with("/etc/sensors.d/ignore_sensors_DPU0.conf") mock_system.assert_called_once_with("service sensord restart") + mock_lock.assert_called_once() with patch.object(module, 'get_name', return_value="DPU0"), \ patch('os.path.exists', return_value=False), \ patch('os.remove') as mock_remove, \ - patch('os.system') as mock_system: + patch('os.system') as mock_system, \ + patch.object(module, '_sensord_operation_lock') as mock_lock: assert module.handle_sensor_addition() is True mock_remove.assert_not_called() mock_system.assert_not_called() + mock_lock.assert_not_called() with patch.object(module, 'get_name', return_value="DPU0"), \ patch('os.path.exists', return_value=True), \