From f286a04ad7c47216fece213bca47fddcc774f4d2 Mon Sep 17 00:00:00 2001
From: Andrea Panattoni
Date: Fri, 4 Oct 2024 19:05:53 +0200
Subject: [PATCH] config-daemon: Restart all instances of device-plugin

When the operator changes the device-plugin Spec
(e.g. .Spec.NodeSelector), it may happen that there are two device
plugin pods for a given node: one that is terminating and one that is
initializing. If the config-daemon executes `restartDevicePluginPod()`
at that moment, it may kill the terminating pod, while the
initializing one keeps running with the old device-plugin
configuration.

This may cause one or more resources not to be advertised until a
manual device-plugin restart occurs.

Make the config-daemon restart all the device-plugin instances it
finds for its own node.

Signed-off-by: Andrea Panattoni
---
 pkg/daemon/daemon.go      | 53 +++++++++++++++++-----------------
 pkg/daemon/daemon_test.go | 61 ++++++++++++++++++++++++++++++---------
 2 files changed, 74 insertions(+), 40 deletions(-)

diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go
index 5ed31ff85..ff7f326dc 100644
--- a/pkg/daemon/daemon.go
+++ b/pkg/daemon/daemon.go
@@ -120,6 +120,7 @@ func New(
 		eventRecorder:   er,
 		featureGate:     featureGates,
 		disabledPlugins: disabledPlugins,
+		mu:              &sync.Mutex{},
 	}
 }
 
@@ -159,7 +160,6 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error {
 	var timeout int64 = 5
 	var metadataKey = "metadata.name"
 
-	dn.mu = &sync.Mutex{}
 	informerFactory := sninformer.NewFilteredSharedInformerFactory(dn.sriovClient,
 		time.Second*15,
 		vars.Namespace,
@@ -683,7 +683,6 @@ func (dn *Daemon) restartDevicePluginPod() error {
 	defer dn.mu.Unlock()
 	log.Log.V(2).Info("restartDevicePluginPod(): try to restart device plugin pod")
 
-	var podToDelete string
 	pods, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{
 		LabelSelector: "app=sriov-device-plugin",
 		FieldSelector: "spec.nodeName=" + vars.NodeName,
@@ -702,35 +701,37 @@ func (dn *Daemon) restartDevicePluginPod() error {
 		log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
 		return nil
 	}
-	podToDelete = pods.Items[0].Name
-	log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete)
-	err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{})
-	if errors.IsNotFound(err) {
-		log.Log.Info("restartDevicePluginPod(): pod to delete not found")
-		return nil
-	}
-
-	if err != nil {
-		log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
-		return err
-	}
-
-	if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) {
-		_, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{})
+	for _, pod := range pods.Items {
+		podToDelete := pod.Name
+		log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete)
+		err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{})
 		if errors.IsNotFound(err) {
-			log.Log.Info("restartDevicePluginPod(): device plugin pod exited")
-			return true, nil
+			log.Log.Info("restartDevicePluginPod(): pod to delete not found")
+			continue
 		}
 
 		if err != nil {
-			log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying")
-		} else {
-			log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete)
+			log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying")
"restartDevicePluginPod(): Failed to delete device plugin pod, retrying") + return err + } + + if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) { + _, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{}) + if errors.IsNotFound(err) { + log.Log.Info("restartDevicePluginPod(): device plugin pod exited") + return true, nil + } + + if err != nil { + log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying") + } else { + log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete) + } + return false, nil + }, dn.stopCh); err != nil { + log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion") + return err } - return false, nil - }, dn.stopCh); err != nil { - log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion") - return err } return nil diff --git a/pkg/daemon/daemon_test.go b/pkg/daemon/daemon_test.go index f1111810a..67a56633f 100644 --- a/pkg/daemon/daemon_test.go +++ b/pkg/daemon/daemon_test.go @@ -32,6 +32,8 @@ import ( "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/fakefilesystem" ) +var SriovDevicePluginPod corev1.Pod + func TestConfigDaemon(t *testing.T) { RegisterFailHandler(Fail) RunSpecs(t, "Config Daemon Suite") @@ -107,19 +109,6 @@ var _ = Describe("Config Daemon", func() { }, } - SriovDevicePluginPod := corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "sriov-device-plugin-xxxx", - Namespace: vars.Namespace, - Labels: map[string]string{ - "app": "sriov-device-plugin", - }, - }, - Spec: corev1.PodSpec{ - NodeName: "test-node", - }, - } - err = sriovnetworkv1.AddToScheme(scheme.Scheme) Expect(err).ToNot(HaveOccurred()) kClient := kclient.NewClientBuilder().WithScheme(scheme.Scheme).WithRuntimeObjects(&corev1.Node{ @@ -130,7 +119,7 @@ var _ = Describe("Config Daemon", func() { Namespace: vars.Namespace, }}).Build() - kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs, &SriovDevicePluginPod) + kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs) snclient := snclientset.NewSimpleClientset() err = sriovnetworkv1.InitNicIDMapFromConfigMap(kubeClient, vars.Namespace) Expect(err).ToNot(HaveOccurred()) @@ -175,6 +164,22 @@ var _ = Describe("Config Daemon", func() { err := sut.Run(stopCh, exitCh) Expect(err).ToNot(HaveOccurred()) }() + + SriovDevicePluginPod = corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sriov-device-plugin-xxxx", + Namespace: vars.Namespace, + Labels: map[string]string{ + "app": "sriov-device-plugin", + }, + }, + Spec: corev1.PodSpec{ + NodeName: "test-node", + }, + } + _, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), &SriovDevicePluginPod, metav1.CreateOptions{}) + Expect(err).ToNot(HaveOccurred()) + }) AfterEach(func() { @@ -286,6 +291,34 @@ var _ = Describe("Config Daemon", func() { Expect(sut.desiredNodeState.GetGeneration()).To(BeNumerically("==", 777)) }) + + It("restart all the sriov-device-plugin pods present on the node", func() { + otherPod1 := SriovDevicePluginPod.DeepCopy() + otherPod1.Name = "sriov-device-plugin-xxxa" + _, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod1, metav1.CreateOptions{}) + Expect(err).ToNot(HaveOccurred()) + + otherPod2 := SriovDevicePluginPod.DeepCopy() + otherPod2.Name = "sriov-device-plugin-xxxz" + _, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), 
+			Expect(err).ToNot(HaveOccurred())
+
+			err = sut.restartDevicePluginPod()
+			Expect(err).ToNot(HaveOccurred())
+
+			Eventually(func() (int, error) {
+				podList, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{
+					LabelSelector: "app=sriov-device-plugin",
+					FieldSelector: "spec.nodeName=test-node",
+				})
+
+				if err != nil {
+					return 0, err
+				}
+
+				return len(podList.Items), nil
+			}, "1s").Should(BeZero())
+		})
 	})
 })
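
For reference, the core of the new behavior can be exercised outside the daemon with plain client-go. The sketch below is illustrative only, not operator code: it assumes a pre-built `kubernetes.Interface` and hypothetical `namespace`/`nodeName` arguments supplied by the caller. It lists every pod labelled `app=sriov-device-plugin` scheduled on the node and deletes each one, which is what `restartDevicePluginPod()` now does for all instances instead of only `pods.Items[0]`; the waiting loop the daemon runs after each delete is omitted here.

```go
package example

import (
	"context"
	"fmt"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
)

// deleteAllDevicePluginPods mirrors the patched restartDevicePluginPod() loop:
// every sriov-device-plugin pod on the given node is deleted, not just the
// first one returned by the List call. namespace and nodeName are illustrative
// parameters, not names taken from the operator.
func deleteAllDevicePluginPods(ctx context.Context, client kubernetes.Interface, namespace, nodeName string) error {
	// Same selectors the config-daemon uses to find its node's device-plugin pods.
	pods, err := client.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{
		LabelSelector: "app=sriov-device-plugin",
		FieldSelector: "spec.nodeName=" + nodeName,
	})
	if err != nil {
		return fmt.Errorf("listing device plugin pods: %w", err)
	}

	for _, pod := range pods.Items {
		err := client.CoreV1().Pods(namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
		if apierrors.IsNotFound(err) {
			// Pod already gone (e.g. it finished terminating); move on to the next one.
			continue
		}
		if err != nil {
			return fmt.Errorf("deleting pod %s: %w", pod.Name, err)
		}
		// The config-daemon additionally polls here until the pod no longer
		// exists before handling the next one; omitted in this sketch.
	}
	return nil
}
```

Run against a fake clientset, as in the new daemon_test.go case, this leaves zero pods matching the selector, which is exactly what the `Eventually(...).Should(BeZero())` assertion above verifies.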