From d27ae865299e160f2e3ea8d5ca263f4adab7e97f Mon Sep 17 00:00:00 2001 From: Sonic Build Admin Date: Sat, 24 Jan 2026 02:55:41 +0000 Subject: [PATCH] Fix deadlock between syncd and orchagent syncd during initialization failure When syncd requests a shutdown, orchagent may be blocked waiting for a response to an init view (or other NOTIFY) command. Since syncd stops processing commands while waiting for the shutdown response, orchagent never receives its response and cannot acknowledge the shutdown request - resulting in a deadlock. This fix adds the selectable channel to the select loop during shutdown-wait mode and handles incoming commands appropriately: - NOTIFY commands receive a SAI_STATUS_FAILURE response to unblock the waiting orchagent - Other commands are logged and ignored This prevents orchagent from hanging until timout when syncd is failing. This fix - https://github.com/sonic-net/sonic-buildimage/issues/24799 --- syncd/Syncd.cpp | 51 ++++++++++++++++++++++++++-- syncd/Syncd.h | 3 ++ unittest/syncd/TestSyncd.cpp | 66 ++++++++++++++++++++++++++++++++++++ 3 files changed, 118 insertions(+), 2 deletions(-) diff --git a/syncd/Syncd.cpp b/syncd/Syncd.cpp index 8a1cb186..6c51ecb9 100644 --- a/syncd/Syncd.cpp +++ b/syncd/Syncd.cpp @@ -350,6 +350,38 @@ void Syncd::processEvent( while (!consumer.empty()); } +void Syncd::processEventInShutdownWaitMode( + _In_ sairedis::SelectableChannel& consumer) +{ + SWSS_LOG_ENTER(); + + // Syncd in shutdown-wait mode must respond to INIT_VIEW with FAILURE to avoid deadlock with OA + // This could happen because Orchagent sends INIT_VIEW before registering shutdown callback + // Can't reorder due to circular dependency: need switch to register callbacks, but + // need INIT_VIEW before creating switch + do + { + swss::KeyOpFieldsValuesTuple kco; + consumer.pop(kco, false); + + auto& op = kfvOp(kco); + auto& key = kfvKey(kco); + + SWSS_LOG_WARN("Received command while in shutdown-wait mode: op=%s, key=%s.", op.c_str(), key.c_str()); + + if (op == REDIS_ASIC_STATE_COMMAND_NOTIFY) + { + SWSS_LOG_ERROR("Syncd is waiting for shutdown, cannot process %s: Sending FAILURE response.", key.c_str()); + sendNotifyResponse(SAI_STATUS_FAILURE); + } + else + { + SWSS_LOG_WARN("Ignoring non-notify command in shutdown-wait mode"); + } + } + while (!consumer.empty()); +} + sai_status_t Syncd::processSingleEvent( _In_ const swss::KeyOpFieldsValuesTuple &kco) { @@ -5566,6 +5598,8 @@ void Syncd::run() volatile bool runMainLoop = true; + bool inShutdownWaitMode = false; + std::shared_ptr s = std::make_shared(); try @@ -5603,6 +5637,9 @@ void Syncd::run() s = std::make_shared(); s->addSelectable(m_restartQuery.get()); + s->addSelectable(m_selectableChannel.get()); + + inShutdownWaitMode = true; SWSS_LOG_NOTICE("starting main loop, ONLY restart query"); @@ -5721,7 +5758,14 @@ void Syncd::run() } else if (sel == m_selectableChannel.get()) { - processEvent(*m_selectableChannel.get()); + if (inShutdownWaitMode) + { + processEventInShutdownWaitMode(*m_selectableChannel.get()); + } + else + { + processEvent(*m_selectableChannel.get()); + } } else { @@ -5730,13 +5774,16 @@ void Syncd::run() } catch(const std::exception &e) { - SWSS_LOG_ERROR("Runtime error: %s", e.what()); + SWSS_LOG_ERROR("Runtime error: %s - entering shutdown-wait mode", e.what()); sendShutdownRequestAfterException(); s = std::make_shared(); s->addSelectable(m_restartQuery.get()); + s->addSelectable(m_selectableChannel.get()); + + inShutdownWaitMode = true; if (m_commandLineOptions->m_disableExitSleep) runMainLoop = false; diff --git a/syncd/Syncd.h b/syncd/Syncd.h index bd59fcc2..6b9a9414 100644 --- a/syncd/Syncd.h +++ b/syncd/Syncd.h @@ -67,6 +67,9 @@ namespace syncd void processEvent( _In_ sairedis::SelectableChannel& consumer); + void processEventInShutdownWaitMode( + _In_ sairedis::SelectableChannel& consumer); + sai_status_t processQuadEventInInitViewMode( _In_ sai_object_type_t objectType, _In_ const std::string& strObjectId, diff --git a/unittest/syncd/TestSyncd.cpp b/unittest/syncd/TestSyncd.cpp index 22fa5618..6d74c07f 100644 --- a/unittest/syncd/TestSyncd.cpp +++ b/unittest/syncd/TestSyncd.cpp @@ -627,4 +627,70 @@ TEST_F(SyncdTest, BulkRemoveTest) }; m_syncd->processEvent(*channel); } + +TEST_F(SyncdTest, processEventInShutdownWaitMode_NotifyCommand) +{ + // Test that NOTIFY commands receive FAILURE response in shutdown-wait mode + auto channel = std::make_shared(); + + // Set up the mock channel as the syncd's selectable channel + // so sendNotifyResponse will use it + m_syncd->m_selectableChannel = channel; + + int popCallCount = 0; + EXPECT_CALL(*channel, empty()) + .WillRepeatedly([&popCallCount]() { + return popCallCount > 0; + }); + + EXPECT_CALL(*channel, pop(testing::_, testing::_)) + .WillOnce(testing::DoAll( + testing::Invoke([](swss::KeyOpFieldsValuesTuple& kco, bool initViewMode) { + kfvKey(kco) = SYNCD_INIT_VIEW; + kfvOp(kco) = REDIS_ASIC_STATE_COMMAND_NOTIFY; + }), + testing::Invoke([&popCallCount](swss::KeyOpFieldsValuesTuple&, bool) { + popCallCount++; + }) + )); + + // Verify that set() is called with FAILURE status response + std::string expectedStatus = sai_serialize_status(SAI_STATUS_FAILURE); + EXPECT_CALL(*channel, set(expectedStatus, testing::_, REDIS_ASIC_STATE_COMMAND_NOTIFY)) + .Times(1); + + m_syncd->processEventInShutdownWaitMode(*channel); +} + +TEST_F(SyncdTest, processEventInShutdownWaitMode_NonNotifyCommand) +{ + // Test that non-NOTIFY commands are ignored (no response sent) in shutdown-wait mode + auto channel = std::make_shared(); + + // Set up the mock channel as the syncd's selectable channel + m_syncd->m_selectableChannel = channel; + + int popCallCount = 0; + EXPECT_CALL(*channel, empty()) + .WillRepeatedly([&popCallCount]() { + return popCallCount > 0; + }); + + EXPECT_CALL(*channel, pop(testing::_, testing::_)) + .WillOnce(testing::DoAll( + testing::Invoke([](swss::KeyOpFieldsValuesTuple& kco, bool initViewMode) { + kfvKey(kco) = "SAI_OBJECT_TYPE_SWITCH:oid:0x21000000000000"; + kfvOp(kco) = REDIS_ASIC_STATE_COMMAND_CREATE; + }), + testing::Invoke([&popCallCount](swss::KeyOpFieldsValuesTuple&, bool) { + popCallCount++; + }) + )); + + // Verify that set() is NOT called for non-notify commands + EXPECT_CALL(*channel, set(testing::_, testing::_, testing::_)) + .Times(0); + + m_syncd->processEventInShutdownWaitMode(*channel); +} #endif