Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions examples/knative-serving/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ import (
"encoding/hex"
"flag"
"fmt"
"net/http"
_ "net/http/pprof"
"os"
"reflect"
"strings"
Expand Down Expand Up @@ -168,6 +170,15 @@ func main() {

tracecheck.SetLogger(logf.Log.WithName("tracecheck"))

go func() {
addr := "localhost:6060"
pprofLog := logf.Log.WithName("pprof")
pprofLog.Info("starting pprof server", "addr", addr)
if err := http.ListenAndServe(addr, nil); err != nil {
pprofLog.Error(err, "pprof server exited")
}
}()

explorer, initialState, err := newKnativeExplorerAndState(*searchDepth, *emitStatsFlag)
if err != nil {
panic(fmt.Sprintf("Build() error = %v", err))
Expand Down
4 changes: 4 additions & 0 deletions pkg/tracecheck/explore.go
Original file line number Diff line number Diff line change
Expand Up @@ -657,6 +657,10 @@ func (e *Explorer) takeReconcileStep(ctx context.Context, state StateNode, pr Pe

// invoke the controller at its observed state of the world
observableState := state.ObserveAs(pr.ReconcilerID)
if e.stats != nil && e.config != nil && e.config.EnablePerfStats {
// TODO remove before merging: temporary rehydration instrumentation
e.stats.RecordRehydration(pr.ReconcilerID, len(observableState))
}
stepLog.WithValues("ReconcilerID", pr.ReconcilerID, "FrameID", frameID).V(2).Info("about to reconcile")

reconcileResult, err := e.reconcileAtState(ctx, observableState, pr)
Expand Down
93 changes: 93 additions & 0 deletions pkg/tracecheck/stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,13 @@ type ExploreStats struct {
TotalStepLatency time.Duration
MaxStepLatency time.Duration
StepLatencyByRecon map[string]*latencyStat

// TODO remove before merging: temporary state rehydration instrumentation
RehydrationSamples int
TotalRehydratedObjs int
MaxRehydratedObjs int
MinRehydratedObjs int
RehydrationsByRecon map[string]*rehydrationStat
}

type latencyStat struct {
Expand All @@ -28,13 +35,21 @@ type latencyStat struct {
Min time.Duration
}

// rehydrationStat accumulates rehydration-volume counters for a single
// reconciler (how many objects were seeded into its observable state per
// reconcile step).
// TODO remove before merging: temporary rehydration instrumentation.
type rehydrationStat struct {
	Count int // number of samples recorded for this reconciler
	Total int // sum of rehydrated object counts across all samples
	Max   int // largest single-sample object count seen
	Min   int // smallest single-sample object count seen
}

// NewExploreStats returns an ExploreStats with every per-reconciler map
// pre-allocated so recording paths never have to nil-check before writing.
// All scalar counters (samples, totals, aborted paths) and the start/end
// timestamps deliberately rely on their Go zero values, so they are not
// spelled out here.
func NewExploreStats() *ExploreStats {
	return &ExploreStats{
		RestartsPerReconciler: make(map[string]int),
		StepLatencyByRecon:    make(map[string]*latencyStat),
		RehydrationsByRecon:   make(map[string]*rehydrationStat),
	}
}

Expand Down Expand Up @@ -69,13 +84,46 @@ func (s *ExploreStats) RecordStep(reconcilerID string, d time.Duration) {
}
}

// TODO remove before merging: temporary instrumentation for rehydration volume
func (s *ExploreStats) RecordRehydration(reconcilerID string, objs int) {
s.RehydrationSamples++
s.TotalRehydratedObjs += objs
if s.RehydrationSamples == 1 || objs > s.MaxRehydratedObjs {
s.MaxRehydratedObjs = objs
}
if s.RehydrationSamples == 1 || objs < s.MinRehydratedObjs {
s.MinRehydratedObjs = objs
}

rs, ok := s.RehydrationsByRecon[reconcilerID]
if !ok {
rs = &rehydrationStat{Min: objs, Max: objs}
s.RehydrationsByRecon[reconcilerID] = rs
}
rs.Count++
rs.Total += objs
if objs > rs.Max {
rs.Max = objs
}
if objs < rs.Min {
rs.Min = objs
}
}

// Avg reports the mean recorded step latency. A nil receiver or an empty
// stat (no samples) yields 0 rather than dividing by zero.
func (ls *latencyStat) Avg() time.Duration {
	if ls == nil {
		return 0
	}
	if ls.Count == 0 {
		return 0
	}
	// Duration division by an integer-valued Duration is exact integer
	// division, matching the original int64 round-trip.
	return ls.Total / time.Duration(ls.Count)
}

// Avg reports the mean number of objects rehydrated per sample for this
// reconciler. A nil receiver or zero sample count yields 0.
func (rs *rehydrationStat) Avg() float64 {
	if rs == nil {
		return 0
	}
	if rs.Count == 0 {
		return 0
	}
	return float64(rs.Total) / float64(rs.Count)
}

func (s *ExploreStats) Print() {
if s.endTime == nil {
s.Finish()
Expand All @@ -84,6 +132,10 @@ func (s *ExploreStats) Print() {
if s.TotalReconcileSteps > 0 {
avgStep = time.Duration(int64(s.TotalStepLatency) / int64(s.TotalReconcileSteps))
}
var avgRehydrated float64
if s.RehydrationSamples > 0 {
avgRehydrated = float64(s.TotalRehydratedObjs) / float64(s.RehydrationSamples)
}
if logger.GetSink() != nil {
logger.Info("explore stats",
"totalTime", s.endTime.Sub(*s.startTime),
Expand All @@ -94,6 +146,10 @@ func (s *ExploreStats) Print() {
"avgStepLatency", avgStep,
"maxStepLatency", s.MaxStepLatency,
"stepLatencyByReconciler", s.latencyByReconcilerSummary(),
"avgRehydratedObjs", avgRehydrated,
"maxRehydratedObjs", s.MaxRehydratedObjs,
"minRehydratedObjs", s.MinRehydratedObjs,
"rehydrationByReconciler", s.rehydrationByReconcilerSummary(),
)
// return
}
Expand All @@ -104,8 +160,13 @@ func (s *ExploreStats) Print() {
fmt.Printf("Reconcile steps: %d\n", s.TotalReconcileSteps)
fmt.Printf("Avg step latency: %v\n", avgStep)
fmt.Printf("Max step latency: %v\n", s.MaxStepLatency)
fmt.Printf("Avg rehydrated objs: %.2f\n", avgRehydrated)
fmt.Printf("Max rehydrated objs: %d\n", s.MaxRehydratedObjs)
fmt.Printf("Min rehydrated objs: %d\n", s.MinRehydratedObjs)
fmt.Println("Step latency by reconciler:")
s.printLatencyTable()
fmt.Println("Rehydrated objects by reconciler:")
s.printRehydrationTable()
}

func (s *ExploreStats) latencyByReconcilerSummary() map[string]map[string]any {
Expand Down Expand Up @@ -137,3 +198,35 @@ func (s *ExploreStats) printLatencyTable() {
fmt.Printf(" %-25s %-6d %-12v %-12v %-12v\n", reconID, ls.Count, ls.Avg(), ls.Max, ls.Min)
}
}

// TODO remove before merging: temporary rehydration instrumentation summary
func (s *ExploreStats) rehydrationByReconcilerSummary() map[string]map[string]any {
out := make(map[string]map[string]any, len(s.RehydrationsByRecon))
for reconID, rs := range s.RehydrationsByRecon {
if rs == nil || rs.Count == 0 {
continue
}
out[reconID] = map[string]any{
"count": rs.Count,
"avg": rs.Avg(),
"max": rs.Max,
"min": rs.Min,
}
}
return out
}

// TODO remove before merging: temporary rehydration instrumentation output
func (s *ExploreStats) printRehydrationTable() {
if len(s.RehydrationsByRecon) == 0 {
fmt.Println(" (no rehydration samples)")
return
}
fmt.Printf(" %-25s %-6s %-12s %-12s %-12s\n", "Reconciler", "Count", "Avg", "Max", "Min")
for reconID, rs := range s.RehydrationsByRecon {
if rs == nil || rs.Count == 0 {
continue
}
fmt.Printf(" %-25s %-6d %-12.2f %-12d %-12d\n", reconID, rs.Count, rs.Avg(), rs.Max, rs.Min)
}
}
40 changes: 40 additions & 0 deletions specs/explore-performance-investigation.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Explorer performance investigation (Knative Serving harness)

## Baseline runs (latest)
- Depth 2 (`--emit-stats --interactive=false`): 1.29s total, 135 reconcile steps, average step latency 9.03ms (max 38.76ms), 128 aborted paths.【77f9f8†L4-L20】
- Depth 3 (`--emit-stats --interactive=false --timeout=30s`): 30.01s total, 597 reconcile steps, average step latency 49.45ms (max 135.45ms), 590 aborted paths.【e43743†L12-L28】

## Observations
- Depth 3 still explores only two more unique states than depth 2 (6 vs 4) but now needs ~4.6x more node visits and suffers ~5.5x slower average reconcile steps, so the slowdown is in per-step setup rather than search breadth.【77f9f8†L4-L20】【e43743†L12-L28】
- The Knative harness rebuilds informers and waits for sync on every reconcile via `setupClientState`, logging each informer registration and frequently timing out at depth 3 (e.g., KPA informers never sync), which burns most of the per-step budget.【17a8e9†L589-L629】【e43743†L2-L11】
- Cumulative CPU profiling at depth 3 shows ~37% of sampled time inside `PrepareState`/`setupClientState` and another ~25% under Knative injection helpers wiring informers, confirming state rehydration dominates reconcile latency.【136690†L1-L20】
- Rehydration volume grows with depth: depth 2 averaged 3.87 objects per reconcile (max 4), while depth 3 averaged 5.69 objects (max 7) as the graph accumulates KPA/Deployment/ReplicaSet state. That extra seeding work correlates with the ~5.5x jump in average step latency (9.03ms → 49.45ms, per the baseline runs above) even before informer sync errors kick in.【c50437†L137-L170】【38d5fa†L110-L146】

### Why does a 1.5x rehydration bump lead to ~6x slower steps?
- The informer fan-out multiplies per-object work. Each logical object is replicated into 3–4 informers (client create, informer event, lister cache), so a 1.5x bump in logical objects translates into ~5x more queue events to drain before `RunAndSyncInformers` returns. That squares poorly with 10+ informers per reconciler, pushing cache syncs toward timeouts even though the logical state change is small.【F:examples/knative-serving/knative/harness.go†L589-L629】【38d5fa†L110-L146】
- Depth 3 also schedules ~4.6x more reconcile steps than depth 2 (597 vs 135), so the goroutines launched for informer factories stack up while earlier caches are still running. The extra concurrent factories contend for the shared fake client and heap, slowing later steps disproportionately versus the modest state growth.【77f9f8†L4-L20】【e43743†L12-L28】【F:pkg/tracecheck/explore.go†L482-L581】
- The sync path is cold every time: we rebuild factories each step, so none of the cache-level CPU work benefits from warm state. Small per-informer startup costs (watch channel wiring, indexer allocation, reflector goroutines) become the dominant term once we repeat them hundreds of times, explaining why latencies balloon even though the rehydrated objects only grow 1.5x.【F:examples/knative-serving/knative/harness.go†L589-L629】【F:pkg/tracecheck/reconciler.go†L145-L186】

## Why does overhead worsen with depth?
- Every reconcile step rehydrates the fake cluster: `ReconcilerContainer.doReconcile` calls the strategy’s `PrepareState`, which in the Knative harness spins up a fresh informer set, seeds every object in the observable state, and waits for caches to sync before running the controller.【F:pkg/tracecheck/reconciler.go†L145-L186】【F:examples/knative-serving/knative/harness.go†L232-L242】【F:examples/knative-serving/knative/harness.go†L589-L629】
- As depth grows, each branch’s observable state contains more reconciler-created objects, so `insertObjects` performs more client creates and the informers have a larger backlog to process and sync on every step.【F:examples/knative-serving/knative/harness.go†L632-L729】
- Depth also increases the number of branches (due to pending reconcile permutations), so these full rehydration cycles happen more frequently; the cumulative goroutines and cache-warming work make later steps slower even though the unique explored states barely increase.【F:pkg/tracecheck/explore.go†L482-L581】【d5d564†L26-L38】【cf69aa†L14-L34】

## Potential incremental informer strategies
- **Reuse informer caches per reconciler:** Thread a cache layer through `KnativeStrategy` so informer factories are keyed by reconciler + selector set and survive across reconcile steps. Coupled with a per-branch snapshot of object data, this would avoid the repeated `SetupInformers`/`RunAndSyncInformers` churn that dominates depth-3 profiles.【F:examples/knative-serving/knative/harness.go†L232-L242】【F:examples/knative-serving/knative/harness.go†L589-L629】 To preserve branch isolation while keeping caches warm, keep one cache instance per branch and per reconciler; when a branch forks, clone the cache state (e.g., via a shallow copy of the shared indexer store plus a per-branch event log) and apply only the child branch’s deltas. Controllers would then see informer events that mirror real watch behavior (adds/updates/deletes) while their listers stay resident across steps in that branch. That keeps isolation strict (no cross-branch mutations) yet avoids cold-starting factories unless a brand-new reconciler appears—that “appearance” is when a previously unused controller first shows up in a branch’s pending worklist (e.g., a new controller type is enqueued for the first time because of branching or new resource kinds), so the cache layer can memoize and reuse for all subsequent invokes within that branch.
- **Cache cloning cost and event-log purpose:** The per-branch cache clone would copy the informer stores (or their serialized contents) whenever a branch forks. Even if we snapshot via shallow copy, the cost is proportional to the number of objects tracked by that reconciler’s informers; with dozens of branches this clone overhead could matter. The per-branch event log is meant to avoid deep copies: record the sequence of add/update/delete events emitted during a reconcile step, and on fork replay that log into the child cache so only deltas are materialized. That keeps cloning closer to O(changed objects) instead of O(all objects) while preserving the exact informer event order the parent observed.
- **Warm caches from diffs instead of full reinsert:** Track the delta between successive observable states (e.g., via the new rehydration counters) and only apply create/update/delete operations to the fake clients. Controllers would still see fresh informers, but sync work scales with mutations instead of total objects.【F:pkg/tracecheck/stats.go†L9-L120】【F:pkg/tracecheck/explore.go†L651-L674】
- **Adopt shared fake clients:** Where controller behavior allows, re-use a single fake clientset per controller instance and “rewind” its objects between steps by applying state deltas. This would sidestep informer construction entirely but requires careful isolation to prevent cross-branch leakage.
- **Hybrid: lazy informer bring-up:** Delay informer creation until a branch first schedules a reconcile for that controller, then keep the informer/live caches resident for subsequent steps in that branch. That reduces churn on branches that die early while still limiting goroutine growth.

### How these strategies relate
- **Cache reuse + event-log cloning** is a foundation: per-branch caches with event logs provide the isolation and warm-state behavior both the shared-client and lazy-bring-up ideas depend on.
- **Warm-from-diffs** can layer on top of cache reuse or stand alone; it reduces the work needed to seed caches regardless of whether they persist across steps.
- **Shared fake clients** is more intrusive and partially overlaps with cache reuse. It can replace informer reuse entirely but must still honor branch-isolated snapshots (potentially still using event logs for rewind).
- **Lazy bring-up** complements the above: it defers the cost until a reconciler first appears, but once initialized, it benefits from whichever cache-reuse or diffing mechanism is chosen.

## Hypotheses and candidate fixes
1. **Reuse informers per reconciler**: memoize the informer set returned by `setupClientState` (keyed by selectors + scheme) and share it across reconcile steps/branches instead of rebuilding, so cache priming happens once.
2. **Cache prepared state snapshots**: cache the client state for identical `StateNode` contents and restore from a clone instead of re-inserting objects and re-syncing informers each time.
3. **Reduce redundant reconciler triggers**: dedupe repeated reconcile requests for the same resource/version within a state to avoid cycling through PrepareState for branches that do not advance the state hash.
4. **Profile per-step hotspots**: use the new pprof listener on `localhost:6060` to capture CPU/heap profiles across depth 2 vs depth 3 runs and confirm how much time is spent in informer setup and client seeding.【F:examples/knative-serving/main.go†L16-L23】
Loading