From d64473a7ae411e5caeffff6fa97abae927245ccf Mon Sep 17 00:00:00 2001
From: Zack Olson
Date: Fri, 15 Sep 2023 13:01:54 -0400
Subject: [PATCH] reduce early localserver -> osquery contention (#1359)

---
 ee/localserver/server.go | 33 +++++++++++----------------------
 1 file changed, 11 insertions(+), 22 deletions(-)

diff --git a/ee/localserver/server.go b/ee/localserver/server.go
index 5e15500fb..a8f355f91 100644
--- a/ee/localserver/server.go
+++ b/ee/localserver/server.go
@@ -20,7 +20,6 @@ import (
 	"github.com/kolide/krypto/pkg/echelper"
 	"github.com/kolide/launcher/pkg/agent"
 	"github.com/kolide/launcher/pkg/agent/types"
-	"github.com/kolide/launcher/pkg/backoff"
 	"github.com/kolide/launcher/pkg/osquery"
 	"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
 	"golang.org/x/time/rate"
@@ -214,36 +213,26 @@ func (ls *localServer) runAsyncdWorkers() time.Time {
 }
 
 func (ls *localServer) Start() error {
-	// Spawn background workers. This loop is a bit weird on startup. We want to populate this data as soon as we can, but because the underlying launcher
-	// run group isn't ordered, this is likely to happen before querier is ready. So we retry at a frequent interval for a couple of minutes, then we drop
-	// back to a slower poll interval. Note that this polling is merely a check against time, we don't repopulate this data nearly so often. (But we poll
-	// frequently to account for the difference between wall clock time, and sleep time)
+	// Spawn background workers. The information gathered here is not critical for DT flow, so to reduce early osquery contention
+	// we wait for pollInterval before starting, and then only rerun if the previous run was unsuccessful,
+	// or if more than recalculateInterval has passed since the last run. Note that this polling is merely a check against time;
+	// we don't repopulate this data nearly so often. (But we poll frequently to account for the difference between
+	// wall clock time and sleep time.)
 	const (
-		initialPollInterval = 10 * time.Second
-		initialPollTimeout  = 2 * time.Minute
 		pollInterval        = 15 * time.Minute
 		recalculateInterval = 24 * time.Hour
 	)
+
 	go func() {
-		// Initial load, run pretty often, at least for the first chunk of time.
 		var lastRun time.Time
-		if err := backoff.WaitFor(func() error {
-			lastRun = ls.runAsyncdWorkers()
-			if (lastRun == time.Time{}) {
-				return errors.New("async tasks not success on initial boot (no surprise)")
-			}
-			return nil
-		},
-			initialPollTimeout,
-			initialPollInterval,
-		); err != nil {
-			level.Info(ls.logger).Log("message", "Initial async runs unsuccessful. Will retry in the future.", "err", err)
-		}
 
-		// Now that we're done with the initial population, fall back to a periodic polling
+		// note that this will trigger the check for the first time after pollInterval (not immediately)
		for range time.Tick(pollInterval) {
-			if time.Since(lastRun) > (recalculateInterval) {
+			if time.Since(lastRun) > recalculateInterval {
 				lastRun = ls.runAsyncdWorkers()
+				if lastRun.IsZero() {
+					level.Debug(ls.logger).Log("message", "runAsyncdWorkers unsuccessful, will retry in the future.")
+				}
 			}
 		}
 	}()
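
The scheduling pattern this patch lands on is worth spelling out: time.Tick delivers its first value only after pollInterval has elapsed, so the first call to runAsyncdWorkers is deferred past startup rather than racing osquery; and because lastRun starts as the zero time.Time (making time.Since(lastRun) effectively infinite), that first tick always triggers a run. Below is a minimal standalone sketch of the same pattern with shortened intervals and a hypothetical runOnce standing in for runAsyncdWorkers. It illustrates the technique only and is not launcher code.

package main

import (
	"fmt"
	"time"
)

// runOnce is a hypothetical stand-in for ls.runAsyncdWorkers: it returns
// the completion time on success and the zero time.Time on failure.
func runOnce() time.Time {
	fmt.Println("repopulating cached data...")
	return time.Now()
}

func main() {
	const (
		pollInterval        = 2 * time.Second  // 15m in the real code, shortened for demonstration
		recalculateInterval = 10 * time.Second // 24h in the real code, shortened for demonstration
	)

	// Zero value: time.Since on the zero time saturates at the maximum
	// Duration, so the first tick always triggers a run.
	var lastRun time.Time

	// time.Tick fires first after pollInterval, never immediately; that is
	// what pushes the first (potentially contentious) run past startup.
	for range time.Tick(pollInterval) {
		// Rerun if the previous attempt failed (lastRun is still zero) or
		// if the data is older than recalculateInterval.
		if time.Since(lastRun) > recalculateInterval {
			lastRun = runOnce()
			if lastRun.IsZero() {
				fmt.Println("run unsuccessful, will retry on a later tick")
			}
		}
	}
}

One consequence of this shape: a failed run is retried on the next tick (every 15 minutes in the real code) instead of through the old backoff.WaitFor loop that hammered osquery every 10 seconds during boot, which is exactly the early contention the subject line describes reducing.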