From d64473a7ae411e5caeffff6fa97abae927245ccf Mon Sep 17 00:00:00 2001
From: Zack Olson
Date: Fri, 15 Sep 2023 13:01:54 -0400
Subject: [PATCH] reduce early localserver -> osquery contention (#1359)

---
 ee/localserver/server.go | 33 +++++++++++----------------------
 1 file changed, 11 insertions(+), 22 deletions(-)

diff --git a/ee/localserver/server.go b/ee/localserver/server.go
index 5e15500fb..a8f355f91 100644
--- a/ee/localserver/server.go
+++ b/ee/localserver/server.go
@@ -20,7 +20,6 @@ import (
 	"github.com/kolide/krypto/pkg/echelper"
 	"github.com/kolide/launcher/pkg/agent"
 	"github.com/kolide/launcher/pkg/agent/types"
-	"github.com/kolide/launcher/pkg/backoff"
 	"github.com/kolide/launcher/pkg/osquery"
 	"go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"
 	"golang.org/x/time/rate"
@@ -214,36 +213,26 @@ func (ls *localServer) runAsyncdWorkers() time.Time {
 }
 
 func (ls *localServer) Start() error {
-	// Spawn background workers. This loop is a bit weird on startup. We want to populate this data as soon as we can, but because the underlying launcher
-	// run group isn't ordered, this is likely to happen before querier is ready. So we retry at a frequent interval for a couple of minutes, then we drop
-	// back to a slower poll interval. Note that this polling is merely a check against time, we don't repopulate this data nearly so often. (But we poll
-	// frequently to account for the difference between wall clock time, and sleep time)
+	// Spawn background workers. The information gathered here is not critical for DT flow, so to reduce early osquery contention
+	// we wait for pollInterval before starting, and then only rerun if the previous run was unsuccessful,
+	// or if more than recalculateInterval has passed since the last run. Note that this polling is merely a check against time;
+	// we don't repopulate this data nearly so often. (But we poll frequently to account for the difference between
+	// wall clock time and sleep time.)
 	const (
-		initialPollInterval = 10 * time.Second
-		initialPollTimeout  = 2 * time.Minute
 		pollInterval        = 15 * time.Minute
 		recalculateInterval = 24 * time.Hour
 	)
+
 	go func() {
-		// Initial load, run pretty often, at least for the first chunk of time.
 		var lastRun time.Time
-		if err := backoff.WaitFor(func() error {
-			lastRun = ls.runAsyncdWorkers()
-			if (lastRun == time.Time{}) {
-				return errors.New("async tasks not success on initial boot (no surprise)")
-			}
-			return nil
-		},
-			initialPollTimeout,
-			initialPollInterval,
-		); err != nil {
-			level.Info(ls.logger).Log("message", "Initial async runs unsuccessful. Will retry in the future.", "err", err)
-		}
 
-		// Now that we're done with the initial population, fall back to a periodic polling
+		// note that this will trigger the check for the first time after pollInterval (not immediately)
		for range time.Tick(pollInterval) {
-			if time.Since(lastRun) > (recalculateInterval) {
+			if time.Since(lastRun) > recalculateInterval {
 				lastRun = ls.runAsyncdWorkers()
+				if lastRun.IsZero() {
+					level.Debug(ls.logger).Log("message", "runAsyncdWorkers unsuccessful, will retry in the future.")
+				}
 			}
 		}
 	}()
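
The scheduling pattern this patch lands on is worth spelling out: time.Tick delivers its first value only after pollInterval has elapsed, so the first call to runAsyncdWorkers is deferred past startup rather than racing osquery; and because lastRun starts as the zero time.Time (making time.Since(lastRun) effectively infinite), that first tick always triggers a run. Below is a minimal standalone sketch of the same pattern with shortened intervals and a hypothetical runOnce standing in for runAsyncdWorkers. It illustrates the technique only and is not launcher code.

package main

import (
	"fmt"
	"time"
)

// runOnce is a hypothetical stand-in for ls.runAsyncdWorkers: it returns
// the completion time on success and the zero time.Time on failure.
func runOnce() time.Time {
	fmt.Println("repopulating cached data...")
	return time.Now()
}

func main() {
	const (
		pollInterval        = 2 * time.Second  // 15m in the real code, shortened for demonstration
		recalculateInterval = 10 * time.Second // 24h in the real code, shortened for demonstration
	)

	// Zero value: time.Since on the zero time saturates at the maximum
	// Duration, so the first tick always triggers a run.
	var lastRun time.Time

	// time.Tick fires first after pollInterval, never immediately; that is
	// what pushes the first (potentially contentious) run past startup.
	for range time.Tick(pollInterval) {
		// Rerun if the previous attempt failed (lastRun is still zero) or
		// if the data is older than recalculateInterval.
		if time.Since(lastRun) > recalculateInterval {
			lastRun = runOnce()
			if lastRun.IsZero() {
				fmt.Println("run unsuccessful, will retry on a later tick")
			}
		}
	}
}

One consequence of this shape: a failed run is retried on the next tick (every 15 minutes in the real code) instead of through the old backoff.WaitFor loop that hammered osquery every 10 seconds during boot, which is exactly the early contention the subject line describes reducing.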