Skip to content

Commit 0118a49

Browse files
committed
Fix: removing secondary interface in Asyc delete for SWiftV2 stateless CNI
1 parent 72f22e8 commit 0118a49

File tree

7 files changed

+108
-8
lines changed

7 files changed

+108
-8
lines changed

cni/network/network.go

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1071,13 +1071,24 @@ func (plugin *NetPlugin) Delete(args *cniSkel.CmdArgs) error {
10711071
epInfos, err = plugin.nm.GetEndpointState(networkID, args.ContainerID)
10721072
// if stateless CNI fail to get the endpoint from CNS for any reason other than Endpoint Not found or CNS connection failure
10731073
// return a retriable error so the container runtime will retry this DEL later
1074-
// the implementation of this function returns nil if the endpoint doens't exist, so
1074+
// the implementation of this function returns nil if the endpoint doesn't exist, so
10751075
// we don't have to check that here
10761076
if err != nil {
10771077
switch {
10781078
case errors.Is(err, network.ErrConnectionFailure):
10791079
logger.Error("Failed to connect to CNS", zap.Error(err))
10801080
logger.Info("Endpoint will be deleted from state file asynchronously", zap.String("containerID", args.ContainerID))
1081+
// In SwiftV2 Linux stateless CNI mode, if the plugin cannot connect to CNS,
1082+
// we asynchronously remove the secondary (delegated) interface from the pod’s network namespace in the absence of the endpoint state.
1083+
// This is necessary because leaving the delegated NIC in the pod netns can cause the kernel to block rtnetlink operations.
1084+
// When that happens, kubelet and containerd hang during sandbox creation or teardown.
1085+
// The delegated NIC (SR-IOV VF) used by SwiftV2 for multitenant pods remains tied to the pod namespace,
1086+
// triggering hot-unplug/re-register events and leaving the node in an unhealthy state.
1087+
// This workaround mitigates the issue by removing the secondary NIC from the pod netns when CNS is unreachable during DEL to provide the endpoint state.
1088+
if err = plugin.nm.RemoveSecondaryEndpointFromPodNetNS(args.IfName, args.Netns); err != nil {
1089+
logger.Error("Failed to remove secondary endpoint from pod netns", zap.String("netns", args.Netns), zap.Error(err))
1090+
return plugin.RetriableError(fmt.Errorf("failed to remove secondary endpoint from pod netns: %w", err))
1091+
}
10811092
case errors.Is(err, network.ErrEndpointStateNotFound):
10821093
logger.Info("Endpoint Not found", zap.String("containerID", args.ContainerID), zap.Error(err))
10831094
return nil
@@ -1088,15 +1099,15 @@ func (plugin *NetPlugin) Delete(args *cniSkel.CmdArgs) error {
10881099
} else {
10891100
for i, epInfo := range epInfos {
10901101
logger.Info("Found endpoint to delete", zap.String("IfName", epInfo.IfName), zap.String("EndpointID", epInfo.EndpointID), zap.Any("NICType", epInfo.NICType))
1091-
epInfos[i].NetNsPath = args.Netns // in case DelegatedNIC need to be moved to host namespace
1102+
epInfos[i].NetNsPath = args.Netns // This is needed in SSwiftV2 scenario to move DelegatedNIC to host namespace
10921103
}
10931104
}
10941105
} else {
10951106
epInfos = plugin.nm.GetEndpointInfosFromContainerID(args.ContainerID)
10961107
}
10971108

10981109
// for Stateful CNI when the endpoint is not created, but the ips are already allocated (only works if single network, single infra)
1099-
// this block is applied to stateless CNI only if there was a connection failure in previous block
1110+
// this block is applied to stateless CNI only if there was a connection failure in previous block and asynchronous delete by CNS will remover the endpoint from state file
11001111
if len(epInfos) == 0 {
11011112
endpointID := plugin.nm.GetEndpointID(args.ContainerID, args.IfName)
11021113
if !nwCfg.MultiTenancy {
@@ -1122,7 +1133,7 @@ func (plugin *NetPlugin) Delete(args *cniSkel.CmdArgs) error {
11221133
if err = plugin.nm.DeleteEndpoint(epInfo.NetworkID, epInfo.EndpointID, epInfo); err != nil {
11231134
// An error will not be returned if the endpoint is not found
11241135
// return a retriable error so the container runtime will retry this DEL later
1125-
// the implementation of this function returns nil if the endpoint doens't exist, so
1136+
// the implementation of this function returns nil if the endpoint doesn't exist, so
11261137
// we don't have to check that here
11271138
return plugin.RetriableError(fmt.Errorf("failed to delete endpoint: %w", err))
11281139
}

cns/restserver/restserver.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ type IPInfo struct {
132132
HnsNetworkID string `json:",omitempty"`
133133
HostVethName string `json:",omitempty"`
134134
MacAddress string `json:",omitempty"`
135-
NICType cns.NICType `json:",omitempty"`
135+
NICType cns.NICType
136136
}
137137

138138
type GetHTTPServiceDataResponse struct {

network/endpoint_linux.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,3 +547,12 @@ func getDefaultGateway(routes []RouteInfo) net.IP {
547547
func (epInfo *EndpointInfo) GetEndpointInfoByIPImpl(_ []net.IPNet, _ string) (*EndpointInfo, error) {
548548
return epInfo, nil
549549
}
550+
551+
// removeSecondaryEndpointFromPodNetNSImpl deletes an existing secondary endpoint from the pod network namespace.
552+
func (ep *endpoint) removeSecondaryEndpointFromPodNetNSImpl(nsc NamespaceClientInterface) error {
553+
secondaryepClient := NewSecondaryEndpointClient(nil, nil, nil, nsc, nil, ep)
554+
if err := secondaryepClient.RemoveInterfacesfromNetnsPath(ep.IfName, ep.NetworkNameSpace); err != nil {
555+
return err
556+
}
557+
return nil
558+
}

network/endpoint_windows.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -746,3 +746,8 @@ func getPnpDeviceState(instanceID string, plc platform.ExecClient) (string, stri
746746
logger.Info("Retrieved device problem code", zap.String("code", devpkeyDeviceProblemCode))
747747
return devpkeyDeviceIsPresent, devpkeyDeviceProblemCode, nil
748748
}
749+
750+
// removeSecondaryEndpointFromPodNetNSImpl removes an existing secondary endpoint from the pod network namespace.
751+
func (ep *endpoint) removeSecondaryEndpointFromPodNetNSImpl(_ NamespaceClientInterface) error {
752+
return nil
753+
}

network/manager.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ type NetworkManager interface {
122122
DeleteState(epInfos []*EndpointInfo) error
123123
GetEndpointInfosFromContainerID(containerID string) []*EndpointInfo
124124
GetEndpointState(networkID, containerID string) ([]*EndpointInfo, error)
125+
RemoveSecondaryEndpointFromPodNetNS(ifName string, netns string) error
125126
}
126127

127128
// Creates a new network manager.
@@ -860,3 +861,14 @@ func generateCNSIPInfoMap(eps []*endpoint) map[string]*restserver.IPInfo {
860861

861862
return ifNametoIPInfoMap
862863
}
864+
865+
// RemoveSecondaryEndpointFromPodNetNS removes the secondary endpoint from the pod netns
866+
func (nm *networkManager) RemoveSecondaryEndpointFromPodNetNS(ifName, netns string) error {
867+
ep := &endpoint{
868+
NetworkNameSpace: netns,
869+
IfName: ifName, // TODO: For stateless cni linux populate IfName here to use in deletion in secondary endpoint client
870+
}
871+
logger.Info("Removing Secondary Endpoint from", zap.String("NetworkNameSpace: ", netns))
872+
err := ep.removeSecondaryEndpointFromPodNetNSImpl(nm.nsClient)
873+
return err
874+
}

network/manager_mock.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,3 +221,7 @@ func (nm *MockNetworkManager) GetEndpointInfosFromContainerID(containerID string
221221
func (nm *MockNetworkManager) GetEndpointState(_, _ string) ([]*EndpointInfo, error) {
222222
return []*EndpointInfo{}, nil
223223
}
224+
225+
func (nm *MockNetworkManager) RemoveSecondaryEndpointFromPodNetNS(_, _ string) error {
226+
return nil
227+
}

network/secondary_endpoint_client_linux.go

Lines changed: 62 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"github.com/Azure/azure-container-networking/network/networkutils"
1414
"github.com/Azure/azure-container-networking/platform"
1515
"github.com/pkg/errors"
16+
vishnetlink "github.com/vishvananda/netlink"
1617
"go.uber.org/zap"
1718
"k8s.io/kubernetes/pkg/kubelet"
1819
)
@@ -190,13 +191,13 @@ func (client *SecondaryEndpointClient) DeleteEndpoints(ep *endpoint) error {
190191
}
191192
}()
192193
// For stateless cni linux, check if delegated vmnic type, and if so, delete using this *endpoint* struct's ifname
193-
if ep.NICType == cns.DelegatedVMNIC {
194+
if ep.NICType == cns.NodeNetworkInterfaceFrontendNIC {
194195
if err := client.moveInterfaceToHostNetns(ep.IfName, vmns); err != nil {
195196
logger.Error("Failed to move interface", zap.String("IfName", ep.IfName), zap.Error(newErrorSecondaryEndpointClient(err)))
196197
}
197198
}
198-
// For Statefull cni linux, Use SecondaryInterfaces map to move all interfaces to host netns
199-
// TODO: SecondaryInterfaces map should be retired and only IfName field and NICType should be used to determine the delgated NIC
199+
// For Stateful cni linux, Use SecondaryInterfaces map to move all interfaces to host netns
200+
// TODO: SecondaryInterfaces map should be retired and only IfName field and NICType should be used to determine the delegated NIC
200201
for iface := range ep.SecondaryInterfaces {
201202
if err := client.moveInterfaceToHostNetns(iface, vmns); err != nil {
202203
logger.Error("Failed to move interface", zap.String("IfName", iface), zap.Error(newErrorSecondaryEndpointClient(err)))
@@ -216,3 +217,61 @@ func (client *SecondaryEndpointClient) moveInterfaceToHostNetns(ifName string, v
216217
}
217218
return nil
218219
}
220+
221+
// RemoveInterfacesfromNetnsPath finds and removes all interfaces from the specified netns path except the infra and non-eth interfaces.
222+
func (client *SecondaryEndpointClient) RemoveInterfacesfromNetnsPath(infraInterfaceName, netnspath string) error {
223+
// Get VM namespace
224+
vmns, err := netns.New().Get()
225+
if err != nil {
226+
return newErrorSecondaryEndpointClient(err)
227+
}
228+
229+
// Open the network namespace.
230+
logger.Info("Opening netns", zap.Any("NetNsPath", netnspath))
231+
ns, err := client.nsClient.OpenNamespace(netnspath)
232+
if err != nil {
233+
return newErrorSecondaryEndpointClient(err)
234+
}
235+
defer ns.Close()
236+
237+
// Enter the container network namespace.
238+
logger.Info("Entering netns", zap.Any("NetNsPath", netnspath))
239+
if err = ns.Enter(); err != nil {
240+
if errors.Is(err, os.ErrNotExist) {
241+
return nil
242+
}
243+
244+
return newErrorSecondaryEndpointClient(err)
245+
}
246+
247+
// Return to host network namespace.
248+
defer func() {
249+
logger.Info("Exiting netns", zap.Any("NetNsPath", netnspath))
250+
if err = ns.Exit(); err != nil {
251+
logger.Error("Failed to exit netns with", zap.Error(newErrorSecondaryEndpointClient(err)))
252+
}
253+
}()
254+
// Use the existing netlink interface to list links
255+
links, err := vishnetlink.LinkList()
256+
if err != nil {
257+
return newErrorSecondaryEndpointClient(err)
258+
}
259+
260+
ifnames := make([]string, 0, len(links))
261+
for _, l := range links {
262+
ifnames = append(ifnames, l.Attrs().Name)
263+
}
264+
logger.Info("Found interfaces in netns that needs to be moved back to host", zap.Any("interfaces", ifnames))
265+
// For stateless cni linux, check if delegated vmnic type, and if so, delete using this *endpoint* struct's ifname
266+
for _, iface := range ifnames {
267+
// skip the infra interface as well as non-eth interfaces
268+
if iface == infraInterfaceName || !strings.Contains(iface, "eth") {
269+
continue
270+
}
271+
if err := client.moveInterfaceToHostNetns(iface, vmns); err != nil {
272+
logger.Error("Failed to move interface", zap.String("IfName", iface), zap.Error(newErrorSecondaryEndpointClient(err)))
273+
}
274+
}
275+
276+
return nil
277+
}

0 commit comments

Comments
 (0)