From f13f0ab27a6a9ad4dcb435d1a96f69c612f368d3 Mon Sep 17 00:00:00 2001 From: Soule Date: Fri, 11 Oct 2024 17:45:47 +0300 Subject: [PATCH 1/2] Clean Systemd files on exit If implemented, all systemd-related files will be cleaned up on exit. --- cmd/sriov-network-config-daemon/start.go | 58 ++++++++++++++++-------- pkg/daemon/daemon.go | 8 ++++ 2 files changed, 48 insertions(+), 18 deletions(-) diff --git a/cmd/sriov-network-config-daemon/start.go b/cmd/sriov-network-config-daemon/start.go index b1ad0f667..754973403 100644 --- a/cmd/sriov-network-config-daemon/start.go +++ b/cmd/sriov-network-config-daemon/start.go @@ -21,7 +21,9 @@ import ( "net" "net/url" "os" + "os/signal" "strings" + "syscall" "time" "github.com/spf13/cobra" @@ -134,7 +136,6 @@ func runStartCmd(cmd *cobra.Command, args []string) error { // This channel is used to ensure all spawned goroutines exit when we exit. stopCh := make(chan struct{}) - defer close(stopCh) // This channel is used to signal Run() something failed and to jump ship. 
// It's purely a chan<- in the Daemon struct for goroutines to write to, and @@ -286,6 +287,7 @@ func runStartCmd(cmd *cobra.Command, args []string) error { err = kClient.Get(context.Background(), types.NamespacedName{Namespace: vars.Namespace, Name: consts.DefaultConfigName}, defaultConfig) if err != nil { log.Log.Error(err, "Failed to get default SriovOperatorConfig object") + close(stopCh) return err } featureGates := featuregate.New() @@ -294,25 +296,45 @@ func runStartCmd(cmd *cobra.Command, args []string) error { log.Log.Info("Enabled featureGates", "featureGates", featureGates.String()) setupLog.V(0).Info("Starting SriovNetworkConfigDaemon") - err = daemon.New( - kClient, - snclient, - kubeclient, - hostHelpers, - platformHelper, - exitCh, - stopCh, - syncCh, - refreshCh, - eventRecorder, - featureGates, - startOpts.disabledPlugins, - ).Run(stopCh, exitCh) - if err != nil { - setupLog.Error(err, "failed to run daemon") + + // create a signal channel to catch interrupts and gracefully shutdown the daemon + sigc := make(chan os.Signal, 1) + signal.Notify(sigc, os.Interrupt) + signal.Notify(sigc, syscall.SIGTERM) + + errChan := make(chan error) + defer close(errChan) + go func() { + errChan <- daemon.New( + kClient, + snclient, + kubeclient, + hostHelpers, + platformHelper, + exitCh, + stopCh, + syncCh, + refreshCh, + eventRecorder, + featureGates, + startOpts.disabledPlugins, + ).Run(stopCh, exitCh) + }() + + select { + case err := <-errChan: + // daemon has exited, close the stop channel and return the error + close(stopCh) + return err + case <-sigc: + // signal received, close the stop channel and wait for the daemon to exit + close(stopCh) + if err := <-errChan; err != nil { + return err + } } setupLog.V(0).Info("Shutting down SriovNetworkConfigDaemon") - return err + return nil } // updateDialer instruments a restconfig with a dial. the returned function allows forcefully closing all active connections. 
diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index ff7f326dc..56595a61c 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -207,6 +207,14 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error { for { select { case <-stopCh: + // clean files from host if we are running in systemd mode + if vars.UsingSystemdMode { + err := systemd.CleanSriovFilesFromHost(vars.ClusterType == consts.ClusterTypeOpenshift) + if err != nil { + log.Log.Error(err, "failed to remove all the systemd sriov files") + return err + } + } log.Log.V(0).Info("Run(): stop daemon") return nil case err, more := <-exitCh: From 9c8ed6e3cbed193cc8ccc35bbdb7a7da528269be Mon Sep 17 00:00:00 2001 From: Soule BA Date: Tue, 15 Oct 2024 10:08:10 +0200 Subject: [PATCH 2/2] cleanup systemd files only if no reboot is required Signed-off-by: Soule BA --- pkg/daemon/daemon.go | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 56595a61c..1f770a227 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -77,7 +77,7 @@ type Daemon struct { refreshCh chan<- Message - mu *sync.Mutex + mu sync.Mutex disableDrain bool @@ -120,7 +120,6 @@ func New( eventRecorder: er, featureGate: featureGates, disabledPlugins: disabledPlugins, - mu: &sync.Mutex{}, } } @@ -207,8 +206,14 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error { for { select { case <-stopCh: - // clean files from host if we are running in systemd mode - if vars.UsingSystemdMode { + // clean files from host if we are running in systemd mode and the node + // is not required to be rebooted + dn.mu.Lock() + rebootrequired := utils.ObjectHasAnnotation(dn.desiredNodeState, + consts.NodeStateDrainAnnotation, consts.RebootRequired) + dn.mu.Unlock() + + if vars.UsingSystemdMode && !rebootrequired { err := systemd.CleanSriovFilesFromHost(vars.ClusterType == consts.ClusterTypeOpenshift) if err != nil { 
log.Log.Error(err, "failed to remove all the systemd sriov files") @@ -324,6 +329,8 @@ func (dn *Daemon) operatorConfigChangeHandler(old, new interface{}) { } func (dn *Daemon) nodeStateSyncHandler() error { + dn.mu.Lock() + defer dn.mu.Unlock() var err error // Get the latest NodeState var sriovResult = &systemd.SriovResult{SyncStatus: consts.SyncStatusSucceeded, LastSyncError: ""} @@ -687,8 +694,6 @@ func (dn *Daemon) handleDrain(reqReboot bool) (bool, error) { } func (dn *Daemon) restartDevicePluginPod() error { - dn.mu.Lock() - defer dn.mu.Unlock() log.Log.V(2).Info("restartDevicePluginPod(): try to restart device plugin pod") pods, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{