From 1249e1d5be2323deedd3b04363cc2ea380903d52 Mon Sep 17 00:00:00 2001 From: Hunter Gregory <42728408+huntergregory@users.noreply.github.com> Date: Tue, 7 Jan 2025 20:43:53 -0800 Subject: [PATCH] fix: close telemetry handler before crashing Signed-off-by: Hunter Gregory <42728408+huntergregory@users.noreply.github.com> --- npm/cmd/start.go | 3 +++ npm/metrics/ai-utils.go | 11 +++++++++++ npm/pkg/dataplane/ipsets/ipsetmanager_linux.go | 1 + 3 files changed, 15 insertions(+) diff --git a/npm/cmd/start.go b/npm/cmd/start.go index 900fb8fa8a..d25b9e2c67 100644 --- a/npm/cmd/start.go +++ b/npm/cmd/start.go @@ -188,6 +188,7 @@ func start(config npmconfig.Config, flags npmconfig.Flags) error { nodeIP, err = util.NodeIP() if err != nil { metrics.SendErrorLogAndMetric(util.NpmID, "error: failed to get node IP while booting up: %v", err) + metrics.Close() return fmt.Errorf("failed to get node IP while booting up: %w", err) } klog.Infof("node IP is %s", nodeIP) @@ -197,6 +198,7 @@ func start(config npmconfig.Config, flags npmconfig.Flags) error { dp, err = dataplane.NewDataPlane(models.GetNodeName(), common.NewIOShim(), npmV2DataplaneCfg, stopChannel) if err != nil { metrics.SendErrorLogAndMetric(util.NpmID, "error: failed to create dataplane with error %v", err) + metrics.Close() return fmt.Errorf("failed to create dataplane with error %w", err) } dp.RunPeriodicTasks() @@ -210,6 +212,7 @@ func start(config npmconfig.Config, flags npmconfig.Flags) error { metrics.SendLog(util.NpmID, "starting NPM", metrics.PrintLog) if err = npMgr.Start(config, stopChannel); err != nil { metrics.SendErrorLogAndMetric(util.NpmID, "Failed to start NPM due to %+v", err) + metrics.Close() return fmt.Errorf("failed to start with err: %w", err) } diff --git a/npm/metrics/ai-utils.go b/npm/metrics/ai-utils.go index 20de3009ff..068fdb8f6a 100644 --- a/npm/metrics/ai-utils.go +++ b/npm/metrics/ai-utils.go @@ -11,6 +11,8 @@ import ( "k8s.io/klog" ) +const telemetryCloseWaitTimeSeconds = 10 + var ( th aitelemetry.TelemetryHandle npmVersion int @@ -54,6 +56,15 @@ func CreateTelemetryHandle(npmVersionNum int, imageVersion, aiMetadata string) e return nil } +// Close cleans up the telemetry handle, which effectively waits for all telemetry data to be sent +func Close() { + if th == nil { + return + } + + th.Close(telemetryCloseWaitTimeSeconds) +} + // SendErrorLogAndMetric sends a metric through AI telemetry and sends a log to the Kusto Messages table func SendErrorLogAndMetric(operationID int, format string, args ...interface{}) { // Send error metrics diff --git a/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go b/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go index d654e589f3..1e1f2eaf7e 100644 --- a/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go +++ b/npm/pkg/dataplane/ipsets/ipsetmanager_linux.go @@ -422,6 +422,7 @@ func (iMgr *IPSetManager) applyIPSets() error { msg := fmt.Sprintf("exceeded max consecutive failures (%d) when applying ipsets. final error: %s", maxConsecutiveFailures, restoreError.Error()) klog.Error(msg) metrics.SendErrorLogAndMetric(util.IpsmID, msg) + metrics.Close() panic(msg) }