From 0b3124e32e4461d3d0f69f76ea56058d95ffacef Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Fri, 20 Feb 2026 18:52:43 -0800 Subject: [PATCH 01/29] add user controller type --- pkg/tracecheck/user_controller.go | 149 +++++++++++++++++++++++++ pkg/tracecheck/user_controller_test.go | 123 ++++++++++++++++++++ 2 files changed, 272 insertions(+) create mode 100644 pkg/tracecheck/user_controller.go create mode 100644 pkg/tracecheck/user_controller_test.go diff --git a/pkg/tracecheck/user_controller.go b/pkg/tracecheck/user_controller.go new file mode 100644 index 0000000..7284582 --- /dev/null +++ b/pkg/tracecheck/user_controller.go @@ -0,0 +1,149 @@ +package tracecheck + +import ( + "context" + "fmt" + "strconv" + + "github.com/tgoodwin/kamera/pkg/event" + "github.com/tgoodwin/kamera/pkg/replay" + "github.com/tgoodwin/kamera/pkg/util" + "k8s.io/apimachinery/pkg/runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +const UserControllerID ReconcilerID = "UserController" + +type UserAction struct { + ID string `json:"id"` + OpType event.OperationType `json:"type"` + Payload runtime.Object `json:"payload"` +} + +type UserController struct { + effectContextManager EffectContextManager + container *ReconcilerContainer + reconciler *userActionReconciler +} + +type userActionClient interface { + client.Client +} + +type userActionReconciler struct { + actions []UserAction + client userActionClient +} + +func (r *userActionReconciler) HasActionAt(actionIdx int) bool { + return actionIdx >= 0 && actionIdx < len(r.actions) +} + +func (r *userActionReconciler) actionCount() int { + return len(r.actions) +} + +func (r *userActionReconciler) actionForRequest(req reconcile.Request) (UserAction, error) { + actionIdx, err := strconv.Atoi(req.Name) + if err != nil { + return UserAction{}, fmt.Errorf("invalid user action request index %q: %w", req.Name, err) + } + if !r.HasActionAt(actionIdx) { + return UserAction{}, 
fmt.Errorf("user action index %d out of range (len=%d)", actionIdx, len(r.actions)) + } + return r.actions[actionIdx], nil +} + +func (r *userActionReconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { + action, err := r.actionForRequest(req) + if err != nil { + return reconcile.Result{}, err + } + if r.client == nil { + return reconcile.Result{}, fmt.Errorf("user action client is not configured") + } + + obj, ok := action.Payload.(client.Object) + if !ok || obj == nil { + return reconcile.Result{}, fmt.Errorf("user action %q payload must implement client.Object", action.ID) + } + + switch action.OpType { + case event.CREATE: + if err := r.client.Create(ctx, obj); err != nil { + return reconcile.Result{}, err + } + return reconcile.Result{}, nil + case event.UPDATE: + if err := r.client.Update(ctx, obj); err != nil { + return reconcile.Result{}, err + } + return reconcile.Result{}, nil + case event.MARK_FOR_DELETION: + if err := r.client.Delete(ctx, obj); err != nil { + return reconcile.Result{}, err + } + return reconcile.Result{}, nil + default: + return reconcile.Result{}, fmt.Errorf("unsupported user action op type %q", action.OpType) + } +} + +func NewUserController( + effectContextManager EffectContextManager, + container *ReconcilerContainer, + reconciler *userActionReconciler, +) *UserController { + return &UserController{ + effectContextManager: effectContextManager, + container: container, + reconciler: reconciler, + } +} + +func (u *UserController) HasActionAt(actionIdx int) bool { + return u != nil && u.reconciler != nil && u.reconciler.HasActionAt(actionIdx) +} + +func (u *UserController) ExecuteNextAction(ctx context.Context, observableState ObjectVersions, actionIdx int) (*ReconcileResult, error) { + if u == nil { + return nil, fmt.Errorf("user controller is nil") + } + if !u.HasActionAt(actionIdx) { + total := 0 + if u.reconciler != nil { + total = u.reconciler.actionCount() + } + return nil, fmt.Errorf("user 
action index %d out of range (len=%d)", actionIdx, total) + } + if u.effectContextManager == nil { + return nil, fmt.Errorf("user controller effect context manager is nil") + } + if u.container == nil { + return nil, fmt.Errorf("user controller container is nil") + } + if u.reconciler == nil { + return nil, fmt.Errorf("user controller reconciler is nil") + } + + frameID := util.UUID() + stepCtx := replay.WithFrameID(ctx, frameID) + if err := u.effectContextManager.PrepareEffectContext(stepCtx, observableState); err != nil { + return nil, err + } + defer u.effectContextManager.CleanupEffectContext(stepCtx) + + req := reconcile.Request{} + req.Name = strconv.Itoa(actionIdx) + result, err := u.container.doReconcile(stepCtx, observableState, req) + if err != nil { + return &ReconcileResult{ + ControllerID: UserControllerID, + FrameID: frameID, + FrameType: FrameTypeExplore, + Error: err.Error(), + }, err + } + return result, nil +} diff --git a/pkg/tracecheck/user_controller_test.go b/pkg/tracecheck/user_controller_test.go new file mode 100644 index 0000000..d576eb5 --- /dev/null +++ b/pkg/tracecheck/user_controller_test.go @@ -0,0 +1,123 @@ +package tracecheck + +import ( + "context" + "reflect" + "testing" + + "github.com/tgoodwin/kamera/pkg/event" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +func TestUserAction_DataOnlyShape(t *testing.T) { + typeInfo := reflect.TypeOf(UserAction{}) + if typeInfo.NumField() != 3 { + t.Fatalf("expected UserAction to have exactly 3 fields, got %d", typeInfo.NumField()) + } + + required := map[string]bool{ + "ID": false, + "OpType": false, + "Payload": false, + } + + for i := 0; i < typeInfo.NumField(); i++ { + field := typeInfo.Field(i) + if _, ok := required[field.Name]; ok { + required[field.Name] = true + } + if field.Type.Kind() == reflect.Func { + t.Fatalf("UserAction must 
be data-only; found function field %q", field.Name) + } + } + + for fieldName, found := range required { + if !found { + t.Fatalf("expected UserAction field %q to exist", fieldName) + } + } +} + +func TestExplorerBuild_InitializesUserController(t *testing.T) { + builder := NewExplorerBuilder(runtime.NewScheme()) + explorer, err := builder.Build("standalone") + if err != nil { + t.Fatalf("build explorer: %v", err) + } + + if explorer.userController == nil { + t.Fatalf("expected explorer to own exactly one user controller instance") + } + if _, ok := explorer.userController.container.Strategy.(*ControllerRuntimeStrategy); !ok { + t.Fatalf("expected user controller to reuse ControllerRuntimeStrategy") + } + fromReconcilers, ok := explorer.reconcilers[UserControllerID] + if !ok { + t.Fatalf("expected user controller reconciler to come from instantiateReconcilers map") + } + if fromReconcilers != explorer.userController.container { + t.Fatalf("expected explorer.userController.container to reference reconciler container from instantiateReconcilers") + } +} + +func TestUserController_ExecuteNextAction_ByBranchIndex(t *testing.T) { + builder := NewExplorerBuilder(runtime.NewScheme()) + builder.WithUserActions([]UserAction{{ + ID: "create-default-cm", + OpType: event.CREATE, + Payload: &corev1.ConfigMap{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "default", + Name: "created-by-user-action", + }, + }, + }}) + + explorer, err := builder.Build("standalone") + if err != nil { + t.Fatalf("build explorer: %v", err) + } + + result, err := explorer.userController.ExecuteNextAction(context.Background(), ObjectVersions{}, 0) + if err != nil { + t.Fatalf("execute next action: %v", err) + } + + if result.ControllerID != UserControllerID { + t.Fatalf("expected controller id %q, got %q", UserControllerID, result.ControllerID) + } + if len(result.Changes.Effects) != 1 { + t.Fatalf("expected exactly one effect, got %d", len(result.Changes.Effects)) + } + if 
result.Changes.Effects[0].OpType != event.CREATE { + t.Fatalf("expected CREATE effect, got %q", result.Changes.Effects[0].OpType) + } + if len(result.Changes.ObjectVersions) != 1 { + t.Fatalf("expected exactly one changed object version, got %d", len(result.Changes.ObjectVersions)) + } + + if _, err := explorer.userController.ExecuteNextAction(context.Background(), ObjectVersions{}, 1); err == nil { + t.Fatalf("expected out-of-range branch index to return an error") + } +} + +func TestUserActionReconciler_Reconcile_SelectsActionByRequestIndex(t *testing.T) { + r := &userActionReconciler{ + actions: []UserAction{ + {ID: "a0", OpType: event.INIT}, + {ID: "a1", OpType: event.INIT}, + }, + } + + action, err := r.actionForRequest(reconcile.Request{NamespacedName: types.NamespacedName{Name: "1"}}) + if err != nil { + t.Fatalf("actionForRequest: %v", err) + } + if action.ID != "a1" { + t.Fatalf("expected action ID a1 from request index 1, got %q", action.ID) + } +} From 6e368ce5b7f19de4b9c05e56b6a7956956aa1c07 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Fri, 20 Feb 2026 18:56:55 -0800 Subject: [PATCH 02/29] jesus I am burnt out on this --- pkg/tracecheck/explore.go | 22 +++++++++ .../explore_staleness_mechanism_test.go | 4 ++ .../explore_user_action_scheduler_test.go | 48 +++++++++++++++++++ pkg/tracecheck/explorebuilder.go | 30 +++++++++++- pkg/tracecheck/reconciler.go | 10 ++-- pkg/tracecheck/state_test.go | 24 ++++++++++ 6 files changed, 132 insertions(+), 6 deletions(-) create mode 100644 pkg/tracecheck/explore_user_action_scheduler_test.go diff --git a/pkg/tracecheck/explore.go b/pkg/tracecheck/explore.go index 8b42dc4..7b7c3fe 100644 --- a/pkg/tracecheck/explore.go +++ b/pkg/tracecheck/explore.go @@ -122,6 +122,8 @@ type Explorer struct { priorityHandler PriorityHandler // prioritize possible views to explore + userController *UserController + Config *ExploreConfig stats *ExploreStats @@ -138,6 +140,16 @@ func (e *Explorer) Stats() *ExploreStats { return 
e.stats } +func (e *Explorer) shouldApplyNextUserAction(state StateNode) bool { + if e == nil || e.userController == nil { + return false + } + if !e.userController.HasActionAt(state.nextUserActionIdx) { + return false + } + return len(state.PendingReconciles) == 0 || allPendingIgnorableForConvergence(state.PendingReconciles) +} + // Objects resolves and returns all objects for the provided ResultState, skipping any that cannot be resolved. func (e *Explorer) Objects(rs ResultState) []*unstructured.Unstructured { if e == nil || e.versionManager == nil { @@ -732,6 +744,14 @@ func (e *Explorer) explore( return nil } + shouldApplyUserAction := e.shouldApplyNextUserAction(currentState) + if shouldApplyUserAction && logger.V(2).Enabled() { + logger.V(2).WithValues( + "Depth", currentState.depth, + "NextUserActionIdx", currentState.nextUserActionIdx, + ).Info("user action scheduler selected next action") + } + // A state is considered converged if: // 1. There are no pending reconciles, OR // 2. 
All remaining pending reconciles are ignorable for convergence (async enqueues @@ -976,6 +996,7 @@ func (e *Explorer) explore( divergenceKey: stateView.divergenceKey, stuckReconcilerPositions: maps.Clone(stateView.stuckReconcilerPositions), ExecutionHistory: append(currHistory, stepResult), + nextUserActionIdx: stateView.nextUserActionIdx, } newState.ID = string(newState.Hash()) @@ -1577,6 +1598,7 @@ func (e *Explorer) getPossibleViewsForReconcile(currState StateNode, reconcilerI parent: currState.parent, action: currState.action, ExecutionHistory: slices.Clone(currState.ExecutionHistory), + nextUserActionIdx: currState.nextUserActionIdx, divergenceKey: divergenceHash, diff --git a/pkg/tracecheck/explore_staleness_mechanism_test.go b/pkg/tracecheck/explore_staleness_mechanism_test.go index dffc49e..db6b0ec 100644 --- a/pkg/tracecheck/explore_staleness_mechanism_test.go +++ b/pkg/tracecheck/explore_staleness_mechanism_test.go @@ -21,6 +21,7 @@ func TestGetPossibleViewsForReconcile_ContainsStrictlyStaleView(t *testing.T) { newTestStateEvent(1, group, kind, "pa-v1"), newTestStateEvent(2, group, kind, "pa-v2"), ) + state.nextUserActionIdx = 3 explorer := &Explorer{ Config: &ExploreConfig{ @@ -53,6 +54,9 @@ func TestGetPossibleViewsForReconcile_ContainsStrictlyStaleView(t *testing.T) { currentSeq := state.Contents.KindSequences[kindKey] foundStrictlyStale := false for _, v := range views { + if v.nextUserActionIdx != state.nextUserActionIdx { + t.Fatalf("expected stale view to preserve nextUserActionIdx=%d, got %d", state.nextUserActionIdx, v.nextUserActionIdx) + } if v.Contents.KindSequences[kindKey] < currentSeq { foundStrictlyStale = true break diff --git a/pkg/tracecheck/explore_user_action_scheduler_test.go b/pkg/tracecheck/explore_user_action_scheduler_test.go new file mode 100644 index 0000000..302bda5 --- /dev/null +++ b/pkg/tracecheck/explore_user_action_scheduler_test.go @@ -0,0 +1,48 @@ +package tracecheck + +import ( + "testing" + + 
"github.com/tgoodwin/kamera/pkg/event" +) + +func TestShouldApplyNextUserAction_QuiescencePolicy(t *testing.T) { + explorer := &Explorer{ + userController: &UserController{ + reconciler: &userActionReconciler{actions: []UserAction{{ID: "a1", OpType: event.CREATE}}}, + }, + } + + if !explorer.shouldApplyNextUserAction(StateNode{}) { + t.Fatalf("expected user action to be schedulable when branch is quiescent and action remains") + } + + nonQuiescent := StateNode{ + PendingReconciles: []PendingReconcile{{ReconcilerID: "ServiceController", Source: SourceStateChange}}, + } + if explorer.shouldApplyNextUserAction(nonQuiescent) { + t.Fatalf("expected false when branch has actionable pending reconciles") + } + + ignorableOnly := StateNode{ + PendingReconciles: []PendingReconcile{{ReconcilerID: "ticker", Source: SourceAsyncEnqueue}}, + } + if !explorer.shouldApplyNextUserAction(ignorableOnly) { + t.Fatalf("expected true when only ignorable pending reconciles remain") + } + + if explorer.shouldApplyNextUserAction(StateNode{nextUserActionIdx: 1}) { + t.Fatalf("expected false when no remaining user actions for branch index") + } +} + +func TestShouldApplyNextUserAction_NoUserControllerOrActions(t *testing.T) { + if (&Explorer{}).shouldApplyNextUserAction(StateNode{}) { + t.Fatalf("expected false when explorer has no user controller") + } + + explorer := &Explorer{userController: &UserController{}} + if explorer.shouldApplyNextUserAction(StateNode{}) { + t.Fatalf("expected false when user controller has no remaining actions") + } +} diff --git a/pkg/tracecheck/explorebuilder.go b/pkg/tracecheck/explorebuilder.go index d59d769..48aa8bf 100644 --- a/pkg/tracecheck/explorebuilder.go +++ b/pkg/tracecheck/explorebuilder.go @@ -48,6 +48,8 @@ type ExplorerBuilder struct { // podCrashProbabilities configures random crash probabilities for the // PodLifecycleController. Maps lifecycle stage to crash probability (0.0 to 1.0). 
podCrashProbabilities map[controller.PodLifecycleStage]float64 + + userActions []UserAction } // ReconcilerBuilder enables chaining reconciler-specific configuration @@ -158,6 +160,7 @@ func (b *ExplorerBuilder) Fork() *ExplorerBuilder { config: cloneExploreConfig(b.config), builder: b.builder, podCrashProbabilities: maps.Clone(b.podCrashProbabilities), + userActions: slices.Clone(b.userActions), } } @@ -439,6 +442,11 @@ func (b *ExplorerBuilder) WithReplayBuilder(builder *replay.Builder) *ExplorerBu return b } +func (b *ExplorerBuilder) WithUserActions(actions []UserAction) *ExplorerBuilder { + b.userActions = slices.Clone(actions) + return b +} + // Config returns a copy of the current builder configuration. func (b *ExplorerBuilder) Config() ExploreConfig { if b.config == nil { @@ -594,6 +602,24 @@ func (b *ExplorerBuilder) instantiateReconcilers(mgr *manager) map[ReconcilerID] return containers } +func (b *ExplorerBuilder) instantiateUserController(mgr *manager) *UserController { + frameManager := replay.NewFrameManager(nil) + replayClient := replay.NewClient( + string(UserControllerID), + b.scheme, + frameManager, + mgr, + ) + + reconciler := &userActionReconciler{actions: slices.Clone(b.userActions), client: replayClient} + container := Wrap(UserControllerID, reconciler, mgr, frameManager, mgr) + return NewUserController( + mgr, + container, + reconciler, + ) +} + // instantiateCleanupReconciler adds a reconciler to the system that handles // actual deletion of resources after they have been "marked" for deletion. In reality, // the APIServer would handle this, but we need to simulate this behavior in our system. 
@@ -784,6 +810,8 @@ func (b *ExplorerBuilder) Build(modes ...string) (*Explorer, error) { reconcilers := b.instantiateReconcilers(mgr) cleanupReconciler := b.instantiateCleanupReconciler(mgr) reconcilers[cleanupReconcilerID] = cleanupReconciler + userController := b.instantiateUserController(mgr) + reconcilers[UserControllerID] = userController.container // Create knowledge manager if using replay builder var knowledgeManager *EventKnowledge @@ -805,7 +833,6 @@ func (b *ExplorerBuilder) Build(modes ...string) (*Explorer, error) { b.watchers, mgr.versionStore, ) - // Construct the Explorer explorer := &Explorer{ reconcilers: reconcilers, @@ -818,6 +845,7 @@ func (b *ExplorerBuilder) Build(modes ...string) (*Explorer, error) { // for prioritizing 'interesting' (potentially bug-causing) states to explore priorityHandler: b.priorityBuilder.Build(b.snapStore), + userController: userController, } return explorer, nil diff --git a/pkg/tracecheck/reconciler.go b/pkg/tracecheck/reconciler.go index bbae342..18ebdf2 100644 --- a/pkg/tracecheck/reconciler.go +++ b/pkg/tracecheck/reconciler.go @@ -54,7 +54,7 @@ func NewControllerRuntimeStrategy(r reconcile.Reconciler, fi frameInserter, er e func (s *ControllerRuntimeStrategy) PrepareState(ctx context.Context, state []runtime.Object) (context.Context, func(), error) { frameID := replay.FrameIDFromContext(ctx) - frameData := s.toFrameData(state) + frameData := runtimeObjectsToCacheFrame(state, s.scheme) s.InsertCacheFrame(frameID, frameData) cleanup := func() {} return ctx, cleanup, nil @@ -82,16 +82,16 @@ func (s *ControllerRuntimeStrategy) ReconcileAtState(ctx context.Context, name t return s.Reconciler.Reconcile(ctx, req) } -// toFrameData converts a slice of runtime objects into a CacheFrame. -func (s *ControllerRuntimeStrategy) toFrameData(objects []runtime.Object) replay.CacheFrame { +// runtimeObjectsToCacheFrame converts a slice of runtime objects into a replay cache frame. 
+func runtimeObjectsToCacheFrame(objects []runtime.Object, scheme *runtime.Scheme) replay.CacheFrame { out := make(replay.CacheFrame) for _, obj := range objects { if obj == nil { continue } - if gvk := obj.GetObjectKind().GroupVersionKind(); gvk.Empty() && s.scheme != nil { - if gvks, _, err := s.scheme.ObjectKinds(obj); err == nil && len(gvks) > 0 { + if gvk := obj.GetObjectKind().GroupVersionKind(); gvk.Empty() && scheme != nil { + if gvks, _, err := scheme.ObjectKinds(obj); err == nil && len(gvks) > 0 { obj.GetObjectKind().SetGroupVersionKind(gvks[0]) } } diff --git a/pkg/tracecheck/state_test.go b/pkg/tracecheck/state_test.go index 67c37f3..3ae0725 100644 --- a/pkg/tracecheck/state_test.go +++ b/pkg/tracecheck/state_test.go @@ -182,6 +182,30 @@ func Test_GetUniquePaths(t *testing.T) { } } +func TestStateNodeClone_IsolatesNextUserActionIdxAcrossBranches(t *testing.T) { + parent := StateNode{ + nextUserActionIdx: 2, + PendingReconciles: []PendingReconcile{ + {ReconcilerID: "A", Request: reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "default", Name: "demo"}}}, + }, + } + + left := parent.Clone() + right := parent.Clone() + + left.nextUserActionIdx++ + + if got := parent.nextUserActionIdx; got != 2 { + t.Fatalf("expected parent nextUserActionIdx to remain 2, got %d", got) + } + if got := right.nextUserActionIdx; got != 2 { + t.Fatalf("expected sibling branch nextUserActionIdx to remain 2, got %d", got) + } + if got := left.nextUserActionIdx; got != 3 { + t.Fatalf("expected updated branch nextUserActionIdx to be 3, got %d", got) + } +} + // Test_GetUniquePaths_PreservesConvergenceSteps verifies that paths ending in convergence // are not deduplicated with non-converged paths, even if they have the same controller sequence. // Convergence is tracked via the :converged marker in UniqueKey(), not by preserving no-op steps. 
From d62a56b6b16c86ff2830a6407602dcec4a7c12f6 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Sat, 21 Feb 2026 14:49:09 -0800 Subject: [PATCH 03/29] add nextUserActionIdx to state --- pkg/tracecheck/explore.go | 1 + pkg/tracecheck/state.go | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/pkg/tracecheck/explore.go b/pkg/tracecheck/explore.go index 7b7c3fe..da01ece 100644 --- a/pkg/tracecheck/explore.go +++ b/pkg/tracecheck/explore.go @@ -147,6 +147,7 @@ func (e *Explorer) shouldApplyNextUserAction(state StateNode) bool { if !e.userController.HasActionAt(state.nextUserActionIdx) { return false } + // policy here is to apply the next user action after the current state has converged return len(state.PendingReconciles) == 0 || allPendingIgnorableForConvergence(state.PendingReconciles) } diff --git a/pkg/tracecheck/state.go b/pkg/tracecheck/state.go index bd8c034..1fed68d 100644 --- a/pkg/tracecheck/state.go +++ b/pkg/tracecheck/state.go @@ -238,6 +238,9 @@ type StateNode struct { depth int + // nextUserActionIdx tracks branch-local progress through ordered user actions. 
+ nextUserActionIdx int + DivergencePoint string // reconcileID of the first divergence // tracks what KindSequences a controller may be "stuck" on @@ -307,6 +310,7 @@ func (sn StateNode) Clone() StateNode { action: sn.action, ExecutionHistory: slices.Clone(sn.ExecutionHistory), depth: sn.depth, + nextUserActionIdx: sn.nextUserActionIdx, DivergencePoint: sn.DivergencePoint, // TODO deprecate divergenceKey: sn.divergenceKey, @@ -391,6 +395,9 @@ func (sn StateNode) serialize(reconcileOrderSensitive bool) string { builder.WriteString(pr.Request.Name) } + builder.WriteString("|u:") + builder.WriteString(strconv.Itoa(sn.nextUserActionIdx)) + return builder.String() } From 8c0079bda545d9c044922184bae97cb5804ee43f Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Sat, 21 Feb 2026 14:59:41 -0800 Subject: [PATCH 04/29] check in beads state --- .beads/issues.jsonl | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index c1ce8d3..4697825 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -16,6 +16,15 @@ {"id":"kamera-7tl","title":"Backward-trace divergence analysis framework","description":"A composable framework for analyzing divergence in kamera dump files, using backward-trace methodology.\n\n## Design Philosophy\n\nStart from known divergence (final state differences) and trace backwards to root cause. 
This avoids noise from 'diamond' patterns where paths diverge then reconverge.\n\n## Architecture\n\nThree composable modules in pkg/analysis/:\n\n**Module 0: Converged State Diff** (kamera-pwm)\n- Input: dump.jsonl\n- Output: Objects that differ between converged states\n- Answers: 'In what ways do the final states differ?'\n\n**Module 1: Last Write Analysis** (new task)\n- Input: dump.jsonl + differing object keys\n- Output: For each object+path, the step that produced its final value + reconciler's input state\n- Answers: 'What did the reconciler see when it wrote this value?'\n\n**Module 2: Object Lifecycle Analysis** (kamera-gwc)\n- Input: dump.jsonl + path + object + target hash\n- Output: All steps where object had that hash\n- Answers: 'Does this object take on this value at some point in this path?'\n\n## Composition Flow\n\n1. Run Module 0 to identify differing objects\n2. Run Module 1 on each differing object to find last writes and input diffs\n3. If inputs differ, run Module 2 to check 'does the missing state appear later?'\n4. 
If yes, points to watch/trigger issue; if no, points to state dependency\n\n## Package Structure\n\npkg/analysis/\n├── types.go # Shared types (moved from pkg/interactive)\n├── dump.go # Dump loading utilities \n├── diff.go # Module 0\n├── lastwrite.go # Module 1\n├── lifecycle.go # Module 2\n\n## Validation\n\nUse trial-1 dump as test fixture - should identify EndpointsController/Pod.Ready issue.","status":"open","priority":1,"issue_type":"epic","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T14:06:33.625361-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T14:06:33.625361-08:00"} {"id":"kamera-8ro","title":"Add step-level metadata to dump output","description":"Enhance dump.jsonl with additional per-step metadata useful for analysis:\n- Add 'triggeredBy' field showing what caused this reconciler to run\n- Add 'stateHashBefore'/'stateHashAfter' for quick comparison\n- Include pending queue snapshot at each step (currently only at end)\n- Add timing/depth information\n\nThis makes it easier to understand why reconcilers ran in a particular order.","status":"open","priority":3,"issue_type":"feature","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T12:27:48.366206-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T12:27:48.366206-08:00"} {"id":"kamera-917","title":"Assess if completedPathDedup optimization is subsumed by SubtreeCompletion","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-13T12:57:19.74152-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-13T12:57:19.74152-08:00","dependencies":[{"issue_id":"kamera-917","depends_on_id":"kamera-qrs","type":"parent-child","created_at":"2026-01-13T12:58:51.164052-08:00","created_by":"Tim Goodwin"}]} +{"id":"kamera-9gz","title":"UserController first-class workflow integration in tracecheck","description":"Implement the design in `docs/plans/2026-02-21-user-controller-workflow-design.md` so user workflows become first-class in 
`pkg/tracecheck/explore.go`.\n\nScope for this epic:\n- model one `UserController` actor per `Explorer`\n- execute ordered user actions interleaved with reconcile expansion\n- track branch-local user progress via `StateNode.nextUserActionIdx`\n- preserve replay/effect semantics and trigger behavior\n- make user steps visible in history/dumps and convergence checks\n","acceptance_criteria":"- All child tasks are merged with tests passing via `go test ./pkg/tracecheck/...` and relevant analyze/inspect packages.\n- Exploration can represent user actions as in-engine steps, not just outer scenario phases.\n- Terminal convergence requires both reconcile quiescence and no remaining user actions.","status":"open","priority":1,"issue_type":"epic","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:31:30.892404-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-20T16:31:30.892404-08:00"} +{"id":"kamera-9gz.1","title":"Resolve v1 policy decisions for user-action semantics","description":"Turn the three open questions in the design doc into explicit v1 policies and lock them into implementation guidance:\n1) non-mutating user action behavior,\n2) history frame representation for user steps,\n3) branch behavior when multiple states are quiescent.\n\nDocument decisions in the design doc and align acceptance tests to those decisions.\n","acceptance_criteria":"- `docs/plans/2026-02-21-user-controller-workflow-design.md` contains resolved v1 decisions for all open questions.\n- Chosen policy is reflected as concrete expected behavior statements used by implementation/tests.\n- Any intentionally deferred choices are explicitly labeled `post-v1`.","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:31:41.229276-08:00","created_by":"Tim 
Goodwin","updated_at":"2026-02-20T16:49:21.348767-08:00","closed_at":"2026-02-20T16:49:21.348767-08:00","close_reason":"Closed","dependencies":[{"issue_id":"kamera-9gz.1","depends_on_id":"kamera-9gz","type":"parent-child","created_at":"2026-02-20T16:31:41.230387-08:00","created_by":"Tim Goodwin"}]} +{"id":"kamera-9gz.2","title":"Add UserAction/UserController types in pkg/tracecheck","description":"Add user workflow data model types in `pkg/tracecheck`:\n- `UserAction` with data-only fields (`id`, `type`, `payload`)\n- `UserController` attached to each `Explorer` with ordered action list and branch-index-based execution API\n\n`UserController` should have an internal list of UserActions and each time an action is taken, we apply one of these actions. The apply strategy should use the `replay.Client`\ninfrastructure, similar to how controller-runtime controllers in kamera make\nAPI writes through replay.Client. That way, we get effect tracking for free.\n","acceptance_criteria":"- `UserAction` is data-only and compiles without function/closure fields.\n- `Explorer` owns exactly one `UserController` instance for the run.\n- `UserController` API supports \"execute next action by branch index\" and returns normal change/effect result shape.","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:31:51.464481-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-20T17:20:44.121883-08:00","closed_at":"2026-02-20T17:20:44.121883-08:00","close_reason":"Implemented UserAction/UserController model and Explorer wiring with tracecheck tests","dependencies":[{"issue_id":"kamera-9gz.2","depends_on_id":"kamera-9gz","type":"parent-child","created_at":"2026-02-20T16:31:51.466041-08:00","created_by":"Tim Goodwin"}]} +{"id":"kamera-9gz.3","title":"Add branch-local nextUserActionIdx state plumbing","description":"Extend `StateNode` with branch-local user workflow progress (`nextUserActionIdx`) and wire clone/copy/plumbing so 
branch state evolves independently.\n","acceptance_criteria":"- `StateNode` includes `nextUserActionIdx int` with zero default.\n- All clone/copy/fork paths preserve and isolate this field per branch.\n- Tests verify incrementing on one branch does not mutate sibling branch progress.","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:32:01.805306-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-20T17:23:40.784199-08:00","closed_at":"2026-02-20T17:23:40.784199-08:00","close_reason":"Added branch-local nextUserActionIdx state plumbing with clone/copy preservation tests","dependencies":[{"issue_id":"kamera-9gz.3","depends_on_id":"kamera-9gz","type":"parent-child","created_at":"2026-02-20T16:32:01.806828-08:00","created_by":"Tim Goodwin"}]} +{"id":"kamera-9gz.4","title":"Add shouldApplyNextUserAction scheduler abstraction","description":"Introduce scheduler abstraction in `Explorer`:\n- `shouldApplyNextUserAction(state StateNode) bool`\n- initial internal policy: apply when branch is quiescent\n\nKeep outer explore loop stable so future scheduling policies can plug in.\n","acceptance_criteria":"- `shouldApplyNextUserAction` exists and is called from the explore step loop.\n- Quiescence-only v1 policy is encapsulated behind this method.\n- Unit tests validate true/false behavior for quiescent vs non-quiescent states.","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:32:12.07745-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-20T17:25:11.529123-08:00","closed_at":"2026-02-20T17:25:11.529123-08:00","close_reason":"Added shouldApplyNextUserAction quiescence scheduler abstraction, loop call, and unit tests","dependencies":[{"issue_id":"kamera-9gz.4","depends_on_id":"kamera-9gz","type":"parent-child","created_at":"2026-02-20T16:32:12.078869-08:00","created_by":"Tim Goodwin"}]} +{"id":"kamera-9gz.5","title":"Integrate user-action step 
path into explore loop","description":"Integrate a dedicated user-action execution branch into `pkg/tracecheck/explore.go` before terminal classification:\n- execute one user action\n- apply replay/effects through existing pipeline\n- derive triggered reconciles through existing trigger manager\n- update pending reconciles and successor state\n- increment `nextUserActionIdx`\n- enqueue successor and continue\n","acceptance_criteria":"- User action execution happens in the main explore loop before terminal classification.\n- Resulting writes/effects use the same recording/application path as controller reconcile steps.\n- Trigger fanout and pending reconcile updates are produced from existing trigger manager logic.\n- Successor state increments `nextUserActionIdx` exactly once per successful action step.","status":"open","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:32:22.25174-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-20T16:32:22.25174-08:00","dependencies":[{"issue_id":"kamera-9gz.5","depends_on_id":"kamera-9gz","type":"parent-child","created_at":"2026-02-20T16:32:22.253717-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.5","depends_on_id":"kamera-9gz.1","type":"blocks","created_at":"2026-02-20T16:33:03.279095-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.5","depends_on_id":"kamera-9gz.2","type":"blocks","created_at":"2026-02-20T16:33:23.675835-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.5","depends_on_id":"kamera-9gz.3","type":"blocks","created_at":"2026-02-20T16:33:33.940463-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.5","depends_on_id":"kamera-9gz.4","type":"blocks","created_at":"2026-02-20T16:33:44.201368-08:00","created_by":"Tim Goodwin"}]} +{"id":"kamera-9gz.6","title":"Require no remaining user actions for terminal convergence","description":"Update convergence classification so a state is terminal only when:\n- no actionable pending 
reconciles, and\n- no remaining user actions for that branch.\n","acceptance_criteria":"- Convergence gate checks both reconcile quiescence and user-action exhaustion.\n- States with remaining user actions are not classified as converged.\n- Regression tests cover both converged and non-converged edge cases.","status":"open","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:32:32.593171-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-20T16:32:32.593171-08:00","dependencies":[{"issue_id":"kamera-9gz.6","depends_on_id":"kamera-9gz","type":"parent-child","created_at":"2026-02-20T16:32:32.59476-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.6","depends_on_id":"kamera-9gz.5","type":"blocks","created_at":"2026-02-20T16:33:54.459417-08:00","created_by":"Tim Goodwin"}]} +{"id":"kamera-9gz.7","title":"Add user-action history metadata and mutating-action guardrail","description":"Ensure user action observability and guardrails:\n- append synthetic history step with `ControllerID=\"UserController\"`\n- emit required metadata for dumps/inspector explainability\n\nThis should just follow the same pattern as existing controller write paths.\n- e.g. 
`RecordEffect` -- if we just use the `replay.Client` then we get this\n for free.\n","acceptance_criteria":"- `ExecutionHistory` includes user-action steps labeled `ControllerID=\"UserController\"`.\n- Dump/inspector output contains enough metadata to identify which user action ran.\n- Non-mutating user actions are detected and handled per resolved v1 policy with explicit test coverage.","status":"open","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:32:42.793492-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-20T16:43:07.13467-08:00","dependencies":[{"issue_id":"kamera-9gz.7","depends_on_id":"kamera-9gz","type":"parent-child","created_at":"2026-02-20T16:32:42.79491-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.7","depends_on_id":"kamera-9gz.1","type":"blocks","created_at":"2026-02-20T16:33:13.429091-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.7","depends_on_id":"kamera-9gz.5","type":"blocks","created_at":"2026-02-20T16:34:04.717193-08:00","created_by":"Tim Goodwin"}]} +{"id":"kamera-9gz.8","title":"Add tracecheck tests for user-action scheduling and fanout semantics","description":"Add focused tests covering user workflow semantics in `pkg/tracecheck`:\n- quiescence scheduling behavior\n- user-action-triggered reconcile fanout\n- branch-local index progression\n- mutating-action invariant behavior\n","acceptance_criteria":"- Test suite includes deterministic cases for all four behavior classes listed above.\n- Tests fail on regressions in scheduling, fanout, progression, or no-op invariant handling.\n- `go test ./pkg/tracecheck/...` passes with the new coverage.","status":"open","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:32:53.063728-08:00","created_by":"Tim 
Goodwin","updated_at":"2026-02-20T16:32:53.063728-08:00","dependencies":[{"issue_id":"kamera-9gz.8","depends_on_id":"kamera-9gz","type":"parent-child","created_at":"2026-02-20T16:32:53.065158-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.8","depends_on_id":"kamera-9gz.5","type":"blocks","created_at":"2026-02-20T16:34:14.95259-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.8","depends_on_id":"kamera-9gz.6","type":"blocks","created_at":"2026-02-20T16:34:25.197422-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.8","depends_on_id":"kamera-9gz.7","type":"blocks","created_at":"2026-02-20T16:34:35.453716-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-aa8","title":"Add reconciler timeline to dump output","description":"Enhance dump.jsonl or analysis tools to produce a reconciler timeline visualization:\n- Side-by-side view of which reconciler runs at each step in each path\n- Highlight where paths diverge in reconciler ordering\n- Show effect counts per step\n- Optional: ASCII art or mermaid diagram output\n\nExample:\nStep | Path 0 | Path 1\n-----|---------------------|---------------------\n 6 | RevisionReconciler | RevisionReconciler\n 7 | RouteReconciler | RouteReconciler\n 12 | KPA | RevisionReconciler \u003c-- diverge","status":"open","priority":3,"issue_type":"feature","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T12:27:35.173667-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T12:27:35.173667-08:00"} {"id":"kamera-ata","title":"Execution-guided input generation (coverage strategy v2)","description":"Redesign the coverage/input generation pipeline from a fully-static approach to an execution-guided (closed-loop) approach. See docs/design/coverage-strategy-v2.md. 
The key change: instead of fabricating all initial state statically, we (1) generate seeds from user-facing CRDs only, (2) derive perturbation profiles statically from hotspot analysis, (3) run a deterministic baseline per seed, (4) scan the baseline trace for interesting checkpoints, and (5) branch from checkpoints with perturbations applied. This replaces the original plan of fabricating intermediate objects and event histories before execution.","status":"tombstone","priority":1,"issue_type":"epic","owner":"timg.goodwin@gmail.com","created_at":"2026-02-11T19:43:46.329542-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-12T10:00:16.435562-08:00"} {"id":"kamera-b1r","title":"Add StateBefore comparison to analysis + explore readset instrumentation","description":"## Problem\n\nThe last write analysis tells us WHO wrote a differing object, but not WHY they wrote different values. To understand the root cause, we need to compare what each controller SAW (StateBefore) when it made its write.\n\nCurrently this requires using the Go API directly - the CLI doesn't expose StateBefore diffs.\n\n## Short-term: CLI Enhancement\n\nAdd StateBefore comparison to the report output:\n- For each differing object, show what the last-writing controller saw in each path\n- Highlight differences in StateBefore that might explain the divergent writes\n\n## Longer-term: Readset Instrumentation\n\nCurrently we capture writesets via EffectRecorder, but we don't capture readsets (what a controller actually read during reconciliation). 
StateBefore is a proxy - it's the full state available to the controller - but not what it actually accessed.\n\nConsider instrumenting controllers to capture actual reads:\n- Which objects were Get()'d or List()'d\n- Which fields were accessed\n- This would make \"why did this differ?\" analysis much more precise\n\n## Discussion Points\n\n- How invasive would readset instrumentation be?\n- Could we wrap the client.Client interface to intercept reads?\n- Trade-off: precision vs instrumentation complexity\n- Could be useful for other analyses (dependency inference, minimal state for replay)","status":"open","priority":1,"issue_type":"feature","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T17:01:59.480026-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T17:01:59.480026-08:00"} @@ -37,12 +46,12 @@ {"id":"kamera-i0r.4","title":"Add DPOR stats tracking","description":"Track stats for orderings skipped due to commutativity. Add to ExploreStats for visibility into optimization effectiveness.","status":"open","priority":3,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-15T16:24:15.419673-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-15T16:24:15.419673-08:00","dependencies":[{"issue_id":"kamera-i0r.4","depends_on_id":"kamera-i0r","type":"parent-child","created_at":"2026-01-15T16:24:15.42023-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-i0r.4","depends_on_id":"kamera-i0r.3","type":"blocks","created_at":"2026-01-15T16:24:20.238278-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-i5y","title":"Implement closed-loop batch entrypoint for top-level seeds","description":"Implement the closed-loop batch entrypoint for v2 using top-level seeds + perturbation profiles.\n\nReplace old precomputed coverage.Input/scenario conversion assumptions with orchestration that does:\n1) baseline run per seed,\n2) checkpoint scan against profiles,\n3) branch runs from selected checkpoints.\n\nScope includes closing current 
implementation gaps in example harnesses where --inputs paths are wired but conversion/orchestration is not implemented.\n","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-11T22:06:59.143882-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-12T14:20:40.66575-08:00","dependencies":[{"issue_id":"kamera-i5y","depends_on_id":"kamera-t01","type":"blocks","created_at":"2026-02-11T22:07:30.159401-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-i5y","depends_on_id":"kamera-yae","type":"blocks","created_at":"2026-02-11T22:07:30.166191-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-i5y","depends_on_id":"kamera-yqn","type":"parent-child","created_at":"2026-02-11T22:08:33.722933-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-i8e","title":"Analyze why Active condition is non-deterministically set","description":"RevisionReconciler sets Active condition in some runs but not others. Identify the code path that causes this divergence.","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-13T21:30:33.838402-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-13T22:34:31.071348-08:00","closed_at":"2026-01-13T22:34:31.071348-08:00","close_reason":"Root cause analysis complete - divergence due to conditional EndpointsController enqueue by ServerlessServiceReconciler. See comments for full analysis. Created kamera-369 for StateHash-based divergence detection improvement.","dependencies":[{"issue_id":"kamera-i8e","depends_on_id":"kamera-4sr","type":"blocks","created_at":"2026-01-13T21:30:38.977205-08:00","created_by":"Tim Goodwin"}],"comments":[{"id":4,"issue_id":"kamera-i8e","author":"Tim Goodwin","text":"Root cause analysis: RevisionReconciler's reconcilePA() fails when Deployment not in lister, causing Active condition to not be set. 
Need to investigate fake informer sync behavior.","created_at":"2026-01-14T05:37:48Z"},{"id":5,"issue_id":"kamera-i8e","author":"Tim Goodwin","text":"DETAILED FINDINGS:\n\nStep 6 divergence analysis from trial-1 dump:\n- Path 0: RevisionReconciler sets 4 conditions (Active, ContainerHealthy, Ready, ResourcesAvailable)\n- Path 1: RevisionReconciler sets 3 conditions (missing Active)\n- Both paths identical through steps 0-5\n\nCode path analysis:\n1. ReconcileKind runs phases: reconcileDeployment -\u003e reconcileImageCache -\u003e reconcilePA\n2. reconcileDeployment creates Deployment, sets ResourcesAvailable=Unknown, ContainerHealthy=Unknown\n3. reconcilePA reads Deployment from lister - THIS IS WHERE IT DIVERGES\n4. If Deployment found: creates PA, calls PropagateAutoscalerStatus(), sets Active=Unknown\n5. If Deployment NOT found: returns error, Active never set","created_at":"2026-01-14T05:38:58Z"},{"id":6,"issue_id":"kamera-i8e","author":"Tim Goodwin","text":"KEY SOURCE FILES:\n\n~/tmp/gomodcache/knative.dev/serving@v0.46.5/pkg/reconciler/revision/\n- revision.go: ReconcileKind() main entry, runs phase loop (lines 155-163)\n- reconcile_resources.go: reconcilePA() lines 150-197, requires Deployment from lister (line 154)\n\n~/tmp/gomodcache/knative.dev/serving@v0.46.5/pkg/apis/serving/v1/\n- revision_lifecycle.go: PropagateAutoscalerStatus() lines 172-237, sets Active condition\n- Note: Active NOT in revisionCondSet (line 55-58), only set by PropagateAutoscalerStatus","created_at":"2026-01-14T05:39:04Z"},{"id":7,"issue_id":"kamera-i8e","author":"Tim Goodwin","text":"OPEN QUESTIONS:\n\n1. Why does lister sometimes have Deployment, sometimes not?\n - reconcileDeployment creates via fake kubeclient (adds to tracker)\n - But informer indexer may not be updated within same reconcile call\n - Fake informers synced BEFORE reconcile starts in PrepareState()\n\n2. 
Possible sources of non-determinism in kamera harness:\n - persistentMultiScaler singleton (harness.go lines 192-202) persists across exploration paths\n - persistentEnqueueWrapper also a singleton\n - State may leak between DFS branches\n\n3. Fake informer behavior:\n - setupClientState() calls injection.Fake.SetupInformers() then insertObjects()\n - RunAndSyncInformers() syncs before reconcile\n - Objects created DURING reconcile may not appear in lister\n\nNEXT STEPS:\n- Investigate if fake client tracker updates are reflected in lister within same call\n- Check if singleton state causes different behavior on different exploration paths\n- Add logging/instrumentation to reconcilePA to trace lister behavior","created_at":"2026-01-14T05:39:14Z"},{"id":8,"issue_id":"kamera-i8e","author":"Tim Goodwin","text":"Root cause analysis findings from trial-1:\n\n**Divergence point**: 20nh30fc\n- Path A (EndpointsController first) → terminal 3g88mbzi → ServerlessService.Ready=True\n- Path B (KPA first) → terminal g5fkb30e → ServerlessService.Ready=Unknown (NoHealthyBackends)\n\n**Terminal state differences**:\n- ServerlessService: EndpointsPopulated and Ready conditions differ\n- PodAutoscaler: SKSReady condition differs (propagated from SKS)\n- Revision: Identical (both have Active condition)\n\n**Hypothesis**: When EndpointsController runs before KPA at divergence point, it populates endpoints that SKS observes as healthy. When KPA runs first, SKS checks endpoints before they're populated and gets stuck in NoHealthyBackends state.\n\n**Next step**: Verify by examining state at 20nh30fc and immediate successors.","created_at":"2026-01-14T06:30:57Z"},{"id":9,"issue_id":"kamera-i8e","author":"Tim Goodwin","text":"## Root Cause Analysis Complete\n\n**Summary**: The divergence at ContentsHash 20nh30fc is NOT the true root cause. 
It's a symptom.\n\n**True root cause**: Different execution order earlier in the paths caused ServerlessServiceReconciler to behave differently:\n- In one path, it re-enqueued EndpointsController \n- In another, it didn't\n\nThis caused paths to reach the same ContentsHash with different pending reconcilers - which the ContentsHash-based DAG hides.\n\n**Why paths don't fan back in**: \nEndpointsController being present/absent at 20nh30fc determines whether it runs before KPA has a chance to check SKS readiness. Without EndpointsController, SKS never sees healthy endpoints and stays in NoHealthyBackends state permanently.\n\n**Methodology learned**:\n1. DAG divergence point is useful but may mask true cause\n2. Compare pending reconcilers at divergence, not just objects\n3. Trace back to find when pending lists diverged\n4. Look for reconcilers that conditionally enqueue based on current state\n\n**See kamera-369 for fix**: Need StateHash-based divergence detection.","created_at":"2026-01-14T06:34:25Z"}]} -{"id":"kamera-iuz","title":"Process-isolated parallel exploration mode","description":"## Problem\n`simclock` relies on global state, so kamera's current goroutine-based `--parallel` mode is unsafe when multiple DFS explorations run concurrently in one process. 
Refactoring simclock usage across 3rd-party controller code is not practical beyond transparent replacements of `time.Now()`.\n\n## Goal\nIntroduce a process-isolated parallel execution path where a supervisor process runs multiple child processes, each exploring independently with isolated global state.\n\n## Key constraint\nProcess isolation requires serializing exploration inputs rather than sharing in-memory Go state.\n\n## Scope\n- Define an explicit file-based input collection consumed by parallel execution.\n- Move examples toward file-based inputs to unblock process-based fanout.\n- Implement supervisor/child orchestration for process-isolated parallel mode.\n\n## Non-goals\n- Deep simclock refactors in 3rd-party code paths.\n- Fork-style memory cloning semantics in Go.\n","status":"open","priority":1,"issue_type":"epic","owner":"timg.goodwin@gmail.com","created_at":"2026-02-17T20:53:46.476395-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-17T20:53:46.476395-08:00","labels":["process-isolation"]} +{"id":"kamera-iuz","title":"Process-isolated parallel exploration mode","description":"## Problem\n`simclock` relies on global state, so kamera's current goroutine-based `--parallel` mode is unsafe when multiple DFS explorations run concurrently in one process. 
Refactoring simclock usage across 3rd-party controller code is not practical beyond transparent replacements of `time.Now()`.\n\n## Goal\nIntroduce a process-isolated parallel execution path where a supervisor process runs multiple child processes, each exploring independently with isolated global state.\n\n## Key constraint\nProcess isolation requires serializing exploration inputs rather than sharing in-memory Go state.\n\n## Scope\n- Define an explicit file-based input collection consumed by parallel execution.\n- Move examples toward file-based inputs to unblock process-based fanout.\n- Implement supervisor/child orchestration for process-isolated parallel mode.\n\n## Non-goals\n- Deep simclock refactors in 3rd-party code paths.\n- Fork-style memory cloning semantics in Go.\n","status":"closed","priority":1,"issue_type":"epic","owner":"timg.goodwin@gmail.com","created_at":"2026-02-17T20:53:46.476395-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-20T16:26:55.571647-08:00","closed_at":"2026-02-20T16:26:55.571647-08:00","close_reason":"Closed","labels":["process-isolation"]} {"id":"kamera-iuz.1","title":"Define file-based input collection contract for parallel runs","description":"Specify the on-disk format and loader contract for parallel exploration inputs (shape, versioning, validation, and failure behavior). 
Align with existing `examples/karpenter/inputs.json` direction while keeping format evolution possible.\n","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-17T20:53:56.764664-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-18T14:42:30.164673-08:00","closed_at":"2026-02-18T14:42:30.164674-08:00","labels":["inputs","process-isolation"],"dependencies":[{"issue_id":"kamera-iuz.1","depends_on_id":"kamera-iuz","type":"parent-child","created_at":"2026-02-17T20:53:56.766299-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-iuz.2","title":"Refactor Knative example to consume external inputs file","description":"Replace current hard-coded Knative input construction with file-driven loading (similar to Karpenter's `inputs.json` workflow). Keep behavior equivalent where possible and document how to run the example with provided input artifacts.\n","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-17T20:54:07.099501-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-18T14:55:57.252293-08:00","closed_at":"2026-02-18T14:55:57.252295-08:00","labels":["inputs","knative","process-isolation"],"dependencies":[{"issue_id":"kamera-iuz.2","depends_on_id":"kamera-iuz","type":"parent-child","created_at":"2026-02-17T20:54:07.100985-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-iuz.2","depends_on_id":"kamera-iuz.1","type":"blocks","created_at":"2026-02-17T20:55:18.733213-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-iuz.3","title":"Add parallel runner support for input collections from file","description":"Update kamera parallel execution entrypoints to consume a collection of serialized inputs from disk, including CLI plumbing and deterministic scenario indexing/result collation across parallel workers.","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-17T20:54:17.376027-08:00","created_by":"Tim 
Goodwin","updated_at":"2026-02-18T16:07:50.798061-08:00","closed_at":"2026-02-18T16:07:50.798061-08:00","close_reason":"Closed","labels":["inputs","process-isolation"],"dependencies":[{"issue_id":"kamera-iuz.3","depends_on_id":"kamera-iuz","type":"parent-child","created_at":"2026-02-17T20:54:17.377186-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-iuz.3","depends_on_id":"kamera-iuz.1","type":"blocks","created_at":"2026-02-17T20:55:28.965847-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-iuz.4","title":"Implement supervisor-child process orchestration for parallel exploration","description":"Introduce a supervisor process that spawns isolated child kamera processes and coordinates execution/reporting. Ensure each child receives serialized input payloads and returns result artifacts/errors in a structured way.\n","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-17T20:54:27.582672-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-18T19:25:03.339115-08:00","closed_at":"2026-02-18T19:25:03.339115-08:00","close_reason":"implemented process-isolated supervisor/child orchestration and tests","labels":["process-isolation"],"dependencies":[{"issue_id":"kamera-iuz.4","depends_on_id":"kamera-iuz","type":"parent-child","created_at":"2026-02-17T20:54:27.584083-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-iuz.4","depends_on_id":"kamera-iuz.3","type":"blocks","created_at":"2026-02-17T20:55:39.120551-08:00","created_by":"Tim Goodwin"}],"comments":[{"id":14,"issue_id":"kamera-iuz.4","author":"Tim Goodwin","text":"Implemented process-isolated parallel supervisor/child orchestration in pkg/explore per docs/plans/2026-02-19-process-isolated-parallel-supervisor-design.md. Added new flags --parallel-processes and --parallel-child-index, supervisor fanout via go run . 
child processes, child input-index selection with single-scenario enforcement, child failure dump context attributes (status/error_phase/error_message), aggregate failure reporting after all children complete, and child mode forcing MaxParallel=1. Verification: go test ./pkg/explore and go test ./... both pass.","created_at":"2026-02-19T03:24:48Z"}]} -{"id":"kamera-iuz.5","title":"Land process-isolated --parallel mode with validation and docs","description":"Wire process orchestration into `--parallel` behavior (or a guarded variant), add regression/integration coverage around simclock safety and result equivalence expectations, and document operational tradeoffs (startup overhead, serialization boundaries, failure handling).\n","status":"open","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-17T20:54:37.879665-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-17T20:54:37.879665-08:00","labels":["process-isolation"],"dependencies":[{"issue_id":"kamera-iuz.5","depends_on_id":"kamera-iuz","type":"parent-child","created_at":"2026-02-17T20:54:37.881011-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-iuz.5","depends_on_id":"kamera-iuz.4","type":"blocks","created_at":"2026-02-17T20:55:49.360106-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-iuz.5","depends_on_id":"kamera-iuz.2","type":"blocks","created_at":"2026-02-17T20:55:59.590152-08:00","created_by":"Tim Goodwin"}]} +{"id":"kamera-iuz.5","title":"Land process-isolated --parallel mode with validation and docs","description":"Wire process orchestration into `--parallel` behavior (or a guarded variant), add regression/integration coverage around simclock safety and result equivalence expectations, and document operational tradeoffs (startup overhead, serialization boundaries, failure handling).\n","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-17T20:54:37.879665-08:00","created_by":"Tim 
Goodwin","updated_at":"2026-02-20T16:26:47.099037-08:00","closed_at":"2026-02-20T16:26:47.099037-08:00","close_reason":"Closed","labels":["process-isolation"],"dependencies":[{"issue_id":"kamera-iuz.5","depends_on_id":"kamera-iuz","type":"parent-child","created_at":"2026-02-17T20:54:37.881011-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-iuz.5","depends_on_id":"kamera-iuz.4","type":"blocks","created_at":"2026-02-17T20:55:49.360106-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-iuz.5","depends_on_id":"kamera-iuz.2","type":"blocks","created_at":"2026-02-17T20:55:59.590152-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-kse","title":"Define PerturbationProfile type","description":"Define a PerturbationProfile type that captures perturbation intent in terms of canonical controller/resource identifiers from the v2 graph contract.\n\nProfile fields should cover:\n- pattern diagnostics (multi_writer, missing_trigger, diamond, reducer, feedback_cycle),\n- controllers/resources involved,\n- ordering perturbation scope,\n- staleness perturbation scope,\n- exploration budget knobs.\n\nThis replaces the old InputTuning + fabricated-object planning model and should align with scanner and restart wiring requirements.\n","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-11T22:06:59.145207-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-12T14:19:59.539284-08:00","dependencies":[{"issue_id":"kamera-kse","depends_on_id":"kamera-yqn","type":"parent-child","created_at":"2026-02-11T22:08:33.726842-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-kse","depends_on_id":"kamera-yqn.1","type":"blocks","created_at":"2026-02-12T14:21:40.615447-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-lih","title":"Document DAG output interpretation in AGENTS.md","status":"closed","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-13T21:22:31.270693-08:00","created_by":"Tim 
Goodwin","updated_at":"2026-01-13T21:24:03.727473-08:00","closed_at":"2026-01-13T21:24:03.727473-08:00","close_reason":"Added Divergence Analysis section to AGENTS.md with instructions for producing DAG output, interpreting DOT format, and cross-referencing hashes"} {"id":"kamera-pwm","title":"Create converged state diff report tool","description":"**Module 0 of backward-trace divergence analysis framework (kamera-7tl)**\n\n## Purpose\nAnswer: 'In what ways do the converged states differ?'\n\n## Input\n- dump.jsonl file path\n\n## Algorithm\n1. Load all converged states from dump\n2. For each state, build map of ObjectKey → VersionHash from final stateAfter\n3. Compare across states: find keys where hashes differ or exist in some states but not others\n\n## Output Struct\n```go\ntype ConvergedStateDiff struct {\n NumStates int\n DifferingObjects []ObjectDiff\n IdenticalObjects []ObjectKey // optional\n}\n\ntype ObjectDiff struct {\n Key ObjectKey\n ByState map[string]VersionHash // stateID → hash (empty if missing)\n}\n```\n\n## Human-readable Output Example\n```\n2 converged states with 1 differing object:\n\n Endpoints/kamera-test/kamera-test:\n State pdu8xy8e: hash=abc123 (empty subsets)\n State 15wjkorq: hash=def456 (has pod 10.0.0.1)\n```\n\n## Location\npkg/analysis/diff.go","status":"closed","priority":2,"issue_type":"feature","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T12:27:34.237069-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T15:58:30.286127-08:00","closed_at":"2026-01-14T15:58:30.286127-08:00","close_reason":"Implemented in commits 6fae4a5 through a5a4a0a","dependencies":[{"issue_id":"kamera-pwm","depends_on_id":"kamera-189","type":"blocks","created_at":"2026-01-14T14:08:15.316316-08:00","created_by":"Tim Goodwin"}]} From 761e7721f2aaf340b451d8257eec0657f0558294 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Sun, 22 Feb 2026 15:39:01 -0800 Subject: [PATCH 05/29] check in design doc --- 
...6-02-21-user-controller-workflow-design.md | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 docs/plans/2026-02-21-user-controller-workflow-design.md diff --git a/docs/plans/2026-02-21-user-controller-workflow-design.md b/docs/plans/2026-02-21-user-controller-workflow-design.md new file mode 100644 index 0000000..d30c164 --- /dev/null +++ b/docs/plans/2026-02-21-user-controller-workflow-design.md @@ -0,0 +1,109 @@ +# User Controller Workflow Design + +## Objective +Promote multi-step user workflows to a first-class concept in Kamera by modeling +the user as an in-engine actor, integrated directly into `pkg/tracecheck/explore.go`, +so user actions can interleave with controller reconciles during exploration. + +## Context +Current workflow handling is scenario/phase oriented at runner level and does not +provide a direct mechanism for injecting user actions at arbitrary points during +an unfolding execution path. + +This design narrows to one user actor per exploration instance and keeps existing +controller reconcile semantics intact. + +## Core Design Decisions +1. Single user actor: each `Explorer` has exactly one `UserController`. +2. The user actor is conceptually "another controller", but executed via a + dedicated user-action path in `tracecheck` (not as a normal pending + reconciler entry). +3. User actions are ordered and stateful across a branch via + `StateNode.nextUserActionIdx`. +4. Action scheduling is abstracted behind: + `shouldApplyNextUserAction(state StateNode) bool`. +5. Initial scheduler policy is quiescence-only. +6. Assumption for v1: every user action mutates state. +7. User action writes must flow through the same replay/effect recording path + used by controller-runtime reconcilers. + +## Data Model +### UserAction +`UserAction` is data-only, intended to support future external workflow files: +- `id` +- `type` +- `payload` + +No per-action function fields are stored on `UserAction`. 
+ +### UserController +One controller object per `Explorer` instance: +- Owns an internal ordered list of `UserAction`. +- Executes the next action for a branch based on `nextUserActionIdx`. +- Returns a normal step result (`Changes`, effects, errors) through the same + effect recording mechanism used by other reconciler paths. + +### Branch Progress Tracking +`StateNode` gets: +- `nextUserActionIdx int` + +This is branch-local progress and advances only when a user action step is +successfully applied on that branch. + +## Explore Loop Integration +Integrate in `pkg/tracecheck/explore.go` main step loop: + +1. Pop state from stack/queue as today. +2. Before terminal convergence classification, evaluate: + `shouldApplyNextUserAction(currentState)`. +3. If true: + - Execute one user action step. + - Apply resulting effects to produce a successor state. + - Determine triggered reconciles. + - Update pending reconciles. + - Increment `nextUserActionIdx`. + - Append synthetic history step with `ControllerID = "UserController"`. + - Enqueue successor and continue. +4. If false, proceed with normal reconcile step selection/execution. +5. A state is terminal/converged only if: + - there is no actionable pending reconcile work, and + - there are no remaining user actions. + +## Scheduling Abstraction +`shouldApplyNextUserAction(state)` is called each explore step. + +Initial internal predicate: +- apply next user action when the branch is quiescent. + +Future policies can plug in behind this method without changing outer loop +structure, enabling finer-grained interleavings like midpoint injections. + +## Invariants and Guardrails +1. Mutating action invariant: + - if a user action produces no effective write/effect, treat as invalid under + v1 assumptions (fail branch or fail run, policy TBD). +2. History visibility: + - user action steps must appear in `ExecutionHistory` and dumps for + explainability. +3. 
Trigger semantics: + - reconciler triggering after user actions uses existing trigger manager logic. + +## Non-Goals (This Iteration) +- Multi-user concurrency models. +- Generic executor registries or per-action handler maps. +- External workflow file parser implementation. +- Custom midpoint predicate language. + +## Follow-on Implementation Tasks +1. Add user workflow/controller types in `pkg/tracecheck`. +2. Add `nextUserActionIdx` to `StateNode` and clone/copy plumbing. +3. Add `shouldApplyNextUserAction(state)` and quiescence predicate in `Explorer`. +4. Add user action step execution path in `explore.go` using replay/effect path. +5. Update convergence gate to require no remaining user actions. +6. Emit user action history metadata in dumps/inspector context as needed. +7. Add tests for: + - quiescence scheduling behavior, + - user-action-triggered reconcile fanout, + - branch-local index progression, + - mutating-action invariant enforcement. + From 689c638ee1f8970b529e5a15c7c58df2f0b6c024 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Mon, 23 Feb 2026 16:31:12 -0800 Subject: [PATCH 06/29] WIP integrating user actions into the simulation flow --- pkg/coverage/inputs.go | 38 +- pkg/coverage/inputs_test.go | 76 +-- pkg/coverage/translate.go | 78 +-- pkg/coverage/translate_test.go | 11 +- pkg/coverage/types.go | 31 +- pkg/explore/parallel_runner.go | 9 +- pkg/explore/parallel_runner_test.go | 46 +- pkg/explore/runner.go | 15 +- pkg/explore/runner_test.go | 2 +- pkg/explore/scenario.go | 11 +- pkg/explore/user_actions.go | 20 + .../async_enqueue_integration_test.go | 2 +- pkg/tracecheck/explore.go | 540 ++++++++++-------- pkg/tracecheck/explore_test.go | 2 +- 14 files changed, 477 insertions(+), 404 deletions(-) create mode 100644 pkg/explore/user_actions.go diff --git a/pkg/coverage/inputs.go b/pkg/coverage/inputs.go index 7630f20..428c162 100644 --- a/pkg/coverage/inputs.go +++ b/pkg/coverage/inputs.go @@ -5,6 +5,8 @@ import ( "fmt" "os" "strings" + + 
"github.com/tgoodwin/kamera/pkg/event" ) // LoadInputs reads a scenarios inputs file from disk. @@ -40,26 +42,42 @@ func validateInputs(inputs []Input) error { } seenNames[name] = struct{}{} - if len(input.Objects) == 0 { - return fmt.Errorf("input[%d] (%s) must include at least one object", i, name) + if len(input.EnvironmentState.Objects) == 0 && len(input.UserInputs) == 0 { + return fmt.Errorf("input[%d] (%s) must include either environmentState.objects or userInputs", i, name) } - for objIdx, obj := range input.Objects { + + for objIdx, obj := range input.EnvironmentState.Objects { if obj == nil { - return fmt.Errorf("input[%d] (%s) object[%d] is nil", i, name, objIdx) + return fmt.Errorf("input[%d] (%s) environmentState.objects[%d] is nil", i, name, objIdx) } if strings.TrimSpace(obj.GetAPIVersion()) == "" || strings.TrimSpace(obj.GetKind()) == "" { - return fmt.Errorf("input[%d] (%s) object[%d] must set apiVersion and kind", i, name, objIdx) + return fmt.Errorf("input[%d] (%s) environmentState.objects[%d] must set apiVersion and kind", i, name, objIdx) } } - for pendingIdx, pending := range input.Pending { - if strings.TrimSpace(pending.ControllerID) == "" { - return fmt.Errorf("input[%d] (%s) pending[%d].controllerId must be set", i, name, pendingIdx) + for actionIdx, action := range input.UserInputs { + if action.Object == nil { + return fmt.Errorf("input[%d] (%s) userInputs[%d].object must be set", i, name, actionIdx) + } + if strings.TrimSpace(string(action.Type)) == "" { + return fmt.Errorf("input[%d] (%s) userInputs[%d].type must be set", i, name, actionIdx) + } + if !isValidUserActionType(action.Type) { + return fmt.Errorf("input[%d] (%s) userInputs[%d].type must be CREATE, UPDATE, or DELETE", i, name, actionIdx) } - if strings.TrimSpace(pending.Key.Name) == "" { - return fmt.Errorf("input[%d] (%s) pending[%d].key.name must be set", i, name, pendingIdx) + if strings.TrimSpace(action.Object.GetAPIVersion()) == "" || 
strings.TrimSpace(action.Object.GetKind()) == "" { + return fmt.Errorf("input[%d] (%s) userInputs[%d].object must set apiVersion and kind", i, name, actionIdx) } } } return nil } + +func isValidUserActionType(op event.OperationType) bool { + switch op { + case event.CREATE, event.UPDATE, event.MARK_FOR_DELETION: + return true + default: + return false + } +} diff --git a/pkg/coverage/inputs_test.go b/pkg/coverage/inputs_test.go index a596684..e804570 100644 --- a/pkg/coverage/inputs_test.go +++ b/pkg/coverage/inputs_test.go @@ -13,7 +13,10 @@ import ( ) func TestLoadInputsOK(t *testing.T) { - inputs := []Input{{Name: "case-1", Objects: []*unstructured.Unstructured{inputObject("v1", "ConfigMap")}}} + inputs := []Input{{ + Name: "case-1", + EnvironmentState: EnvironmentState{Objects: []*unstructured.Unstructured{inputObject("v1", "ConfigMap")}}, + }} path := writeInputsFile(t, inputs) got, err := LoadInputs(path) @@ -23,7 +26,10 @@ func TestLoadInputsOK(t *testing.T) { } func TestLoadInputsTopLevelArrayContract(t *testing.T) { - inputs := []Input{{Name: "case-1", Objects: []*unstructured.Unstructured{inputObject("v1", "ConfigMap")}}} + inputs := []Input{{ + Name: "case-1", + EnvironmentState: EnvironmentState{Objects: []*unstructured.Unstructured{inputObject("v1", "ConfigMap")}}, + }} path := writeInputsFile(t, inputs) got, err := LoadInputs(path) @@ -46,8 +52,8 @@ func TestLoadInputsEmpty(t *testing.T) { func TestLoadInputsDuplicateScenarioName(t *testing.T) { path := writeInputsFile(t, []Input{ - {Name: "case-1", Objects: []*unstructured.Unstructured{inputObject("v1", "ConfigMap")}}, - {Name: "case-1", Objects: []*unstructured.Unstructured{inputObject("v1", "Secret")}}, + {Name: "case-1", EnvironmentState: EnvironmentState{Objects: []*unstructured.Unstructured{inputObject("v1", "ConfigMap")}}}, + {Name: "case-1", EnvironmentState: EnvironmentState{Objects: []*unstructured.Unstructured{inputObject("v1", "Secret")}}}, }) _, err := LoadInputs(path) @@ -57,7 +63,10 @@ 
func TestLoadInputsDuplicateScenarioName(t *testing.T) { func TestLoadInputsMissingScenarioName(t *testing.T) { path := writeInputsFile(t, []Input{ - {Name: " ", Objects: []*unstructured.Unstructured{inputObject("v1", "ConfigMap")}}, + { + Name: " ", + EnvironmentState: EnvironmentState{Objects: []*unstructured.Unstructured{inputObject("v1", "ConfigMap")}}, + }, }) _, err := LoadInputs(path) @@ -66,65 +75,37 @@ func TestLoadInputsMissingScenarioName(t *testing.T) { } func TestLoadInputsMissingObjects(t *testing.T) { - path := writeInputsFile(t, []Input{ - {Name: "case-1"}, - }) + path := writeInputsFile(t, []Input{{Name: "case-1"}}) _, err := LoadInputs(path) require.Error(t, err) - assert.Contains(t, err.Error(), "must include at least one object") + assert.Contains(t, err.Error(), "must include either environmentState.objects or userInputs") } -func TestLoadInputsInvalidPendingKey(t *testing.T) { +func TestLoadInputsInvalidUserInputsType(t *testing.T) { path := writeInputsFile(t, []Input{ { - Name: "case-1", - Objects: []*unstructured.Unstructured{inputObject("v1", "ConfigMap")}, - Pending: []Pending{ - { - ControllerID: "controller-a", - Key: NamespacedName{ - Namespace: "default", - Name: " ", - }, - }, - }, + Name: "case-1", + EnvironmentState: EnvironmentState{Objects: []*unstructured.Unstructured{inputObject("v1", "ConfigMap")}}, + UserInputs: []UserInput{{ + Type: "NOPE", + Object: inputObject("v1", "ConfigMap"), + }}, }, }) _, err := LoadInputs(path) require.Error(t, err) - assert.Contains(t, err.Error(), "pending[0].key.name must be set") + assert.Contains(t, err.Error(), "userInputs[0].type must be CREATE, UPDATE, or DELETE") } func TestLoadInputsInvalidObjectGVK(t *testing.T) { path := writeInputsFile(t, []Input{ { Name: "case-1", - Objects: []*unstructured.Unstructured{ - {Object: map[string]any{"kind": "ConfigMap"}}, - }, - }, - }) - - _, err := LoadInputs(path) - require.Error(t, err) - assert.Contains(t, err.Error(), "object[0]") - assert.True(t, 
strings.Contains(err.Error(), "apiVersion") || strings.Contains(err.Error(), "kind")) -} - -func TestLoadInputsInvalidPendingControllerID(t *testing.T) { - path := writeInputsFile(t, []Input{ - { - Name: "case-1", - Objects: []*unstructured.Unstructured{inputObject("v1", "ConfigMap")}, - Pending: []Pending{ - { - ControllerID: " ", - Key: NamespacedName{ - Namespace: "default", - Name: "obj", - }, + EnvironmentState: EnvironmentState{ + Objects: []*unstructured.Unstructured{ + {Object: map[string]any{"kind": "ConfigMap"}}, }, }, }, @@ -132,7 +113,8 @@ func TestLoadInputsInvalidPendingControllerID(t *testing.T) { _, err := LoadInputs(path) require.Error(t, err) - assert.Contains(t, err.Error(), "pending[0].controllerId must be set") + assert.Contains(t, err.Error(), "environmentState.objects[0]") + assert.True(t, strings.Contains(err.Error(), "apiVersion") || strings.Contains(err.Error(), "kind")) } func writeInputsFile(t *testing.T, inputs []Input) string { diff --git a/pkg/coverage/translate.go b/pkg/coverage/translate.go index 01a879e..02ff160 100644 --- a/pkg/coverage/translate.go +++ b/pkg/coverage/translate.go @@ -2,11 +2,11 @@ package coverage import ( "fmt" - "log" "sort" "strings" "github.com/tgoodwin/kamera/pkg/analyze" + "github.com/tgoodwin/kamera/pkg/event" "github.com/tgoodwin/kamera/pkg/util" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" ) @@ -28,8 +28,7 @@ func TranslateHotspots(graph *analyze.Graph, hotspots []analyze.HotspotInstance, resourceIDs := collectResourceIDs(hotspot) resourceList := sortedNodeIDs(resourceIDs) - objects := make([]*unstructured.Unstructured, 0, len(resourceList)) - gvkToObject := make(map[string]*unstructured.Unstructured) + userInputs := make([]UserInput, 0, len(resourceList)) for _, nodeID := range resourceList { node, ok := graph.Nodes[nodeID] @@ -56,21 +55,18 @@ func TranslateHotspots(graph *analyze.Graph, hotspots []analyze.HotspotInstance, name := normalizedName(hotspot.Type, i, node.Resource.Kind) ns := 
namespaceForTemplate(templates[0].Object) obj := NormalizeTemplate(templates[0].Object, name, ns) - gvkToObject[gvk] = obj - objects = append(objects, obj) - } - - pending, err := buildPending(graph, hotspot.Controllers, gvkToObject) - if err != nil { - return nil, err + userInputs = append(userInputs, UserInput{ + ID: fmt.Sprintf("user-input-%d", len(userInputs)), + Type: event.CREATE, + Object: obj, + }) } tuning := buildTuning(hotspot, graph) input := Input{ - Name: fmt.Sprintf("hotspot-%s-%d", hotspot.Type, i), - Objects: objects, - Pending: pending, - Tuning: tuning, + Name: fmt.Sprintf("hotspot-%s-%d", hotspot.Type, i), + UserInputs: userInputs, + Tuning: tuning, } out = append(out, input) } @@ -139,60 +135,6 @@ func namespaceForTemplate(obj *unstructured.Unstructured) string { return defaultNamespace } -func buildPending(graph *analyze.Graph, controllers []analyze.NodeID, gvkToObject map[string]*unstructured.Unstructured) ([]Pending, error) { - pending := make([]Pending, 0, len(controllers)) - for _, controllerID := range controllers { - node, ok := graph.Nodes[controllerID] - if !ok { - return nil, fmt.Errorf("controller node %s not found in graph", controllerID) - } - if node.Kind != analyze.NodeController { - return nil, fmt.Errorf("node %s is not a controller", controllerID) - } - - targets := reconcilesTargets(graph, controllerID) - if len(targets) == 0 { - return nil, fmt.Errorf("controller %s has no reconciles target", node.Controller.Name) - } - if len(targets) > 1 { - log.Printf("warning: controller %s reconciles multiple kinds; using %s", node.Controller.Name, targets[0]) - } - gvk := targets[0] - obj, ok := gvkToObject[gvk] - if !ok || obj == nil { - return nil, fmt.Errorf("no object for reconciles target %s", gvk) - } - pending = append(pending, Pending{ - ControllerID: node.Controller.Name, - Key: NamespacedName{ - Namespace: obj.GetNamespace(), - Name: obj.GetName(), - }, - }) - } - return pending, nil -} - -func reconcilesTargets(graph 
*analyze.Graph, controllerID analyze.NodeID) []string { - set := make(map[string]struct{}) - for _, edge := range graph.Edges { - if edge.Kind != analyze.EdgeReconciles || edge.From != controllerID { - continue - } - node, ok := graph.Nodes[edge.To] - if !ok || node.Kind != analyze.NodeResource { - continue - } - set[gvkString(node.Resource)] = struct{}{} - } - out := make([]string, 0, len(set)) - for gvk := range set { - out = append(out, gvk) - } - sort.Strings(out) - return out -} - func buildTuning(hotspot analyze.HotspotInstance, graph *analyze.Graph) InputTuning { tuning := InputTuning{ StaleReads: make(map[string][]string), diff --git a/pkg/coverage/translate_test.go b/pkg/coverage/translate_test.go index 99966a8..74e25a1 100644 --- a/pkg/coverage/translate_test.go +++ b/pkg/coverage/translate_test.go @@ -59,8 +59,10 @@ func TestTranslateHotspots(t *testing.T) { require.Len(t, inputs, len(multi)) input := inputs[0] - require.Len(t, input.Objects, 1) - obj := input.Objects[0] + require.Len(t, input.UserInputs, 1) + inputAction := input.UserInputs[0] + require.Equal(t, "CREATE", string(inputAction.Type)) + obj := inputAction.Object require.Equal(t, "hs-multi_writer-0-service", obj.GetName()) require.Equal(t, "default", obj.GetNamespace()) @@ -69,9 +71,4 @@ func TestTranslateHotspots(t *testing.T) { require.False(t, found) require.ElementsMatch(t, []string{"WriterA", "WriterB"}, input.Tuning.PermuteControllers) - require.Len(t, input.Pending, 2) - for _, pending := range input.Pending { - require.Equal(t, obj.GetName(), pending.Key.Name) - require.Equal(t, obj.GetNamespace(), pending.Key.Namespace) - } } diff --git a/pkg/coverage/types.go b/pkg/coverage/types.go index 8a07c21..5f9ee26 100644 --- a/pkg/coverage/types.go +++ b/pkg/coverage/types.go @@ -1,25 +1,28 @@ package coverage -import "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" +import ( + "github.com/tgoodwin/kamera/pkg/event" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" +) -// Input 
captures the concrete objects and pending reconciles for a single scenario seed. +// Input captures the starting cluster state and declarative inputs for a scenario. type Input struct { - Name string `json:"name"` - Objects []*unstructured.Unstructured `json:"objects"` - Pending []Pending `json:"pending"` - Tuning InputTuning `json:"tuning"` + Name string `json:"name"` + EnvironmentState EnvironmentState `json:"environmentState"` + UserInputs []UserInput `json:"userInputs"` + Tuning InputTuning `json:"tuning"` } -// Pending is a controller + namespaced key pair to enqueue for reconciliation. -type Pending struct { - ControllerID string `json:"controllerId"` - Key NamespacedName `json:"key"` +// EnvironmentState captures baseline objects present before user actions. +type EnvironmentState struct { + Objects []*unstructured.Unstructured `json:"objects"` } -// NamespacedName is a minimal namespaced object identity. -type NamespacedName struct { - Namespace string `json:"namespace"` - Name string `json:"name"` +// UserInput models a declarative state change performed by the user +type UserInput struct { + ID string `json:"id"` + Type event.OperationType `json:"type"` + Object *unstructured.Unstructured `json:"object"` } // InputTuning carries compact hints for later ExploreConfig construction. diff --git a/pkg/explore/parallel_runner.go b/pkg/explore/parallel_runner.go index 5ed59c5..b3fffea 100644 --- a/pkg/explore/parallel_runner.go +++ b/pkg/explore/parallel_runner.go @@ -82,6 +82,8 @@ func (r *ParallelRunner) RunAll(ctx context.Context, scenarios []Scenario, opts if ParallelChildIndex() >= 0 && !ParallelProcessesEnabled() { return nil, fmt.Errorf("--parallel-child-index requires --parallel-processes") } + // the simclock package is not thread-safe, so for use cases that rely on simclock, we support + // a process-isolation mode that launches separate executions for each scenario and aggregates results via disk. 
if ParallelProcessesEnabled() { if strings.TrimSpace(InputsPath()) == "" { return nil, fmt.Errorf("--parallel-processes requires explicit --inputs ") @@ -94,6 +96,8 @@ func (r *ParallelRunner) RunAll(ctx context.Context, scenarios []Scenario, opts return r.runInProcess(ctx, scenarios, opts) } +// runSupervisorMode launches child processes for each scenario and aggregates results. +// it expects the child processes to write results to disk and does not enforce any ordering guarantees on completion. func (r *ParallelRunner) runSupervisorMode(ctx context.Context, scenarios []Scenario, opts ParallelOptions) ([]ScenarioResult, error) { if err := ensureParallelOutputDirs(opts); err != nil { return nil, err @@ -211,6 +215,7 @@ func (r *ParallelRunner) runSupervisorMode(ctx context.Context, scenarios []Scen return results, nil } +// runChildMode executes a single scenario based on the child index func (r *ParallelRunner) runChildMode(ctx context.Context, scenarios []Scenario, opts ParallelOptions) ([]ScenarioResult, error) { opts = childParallelOptions(opts) if err := ensureParallelOutputDirs(opts); err != nil { @@ -262,6 +267,7 @@ func (r *ParallelRunner) runChildMode(ctx context.Context, scenarios []Scenario, return []ScenarioResult{result}, nil } +// runInProcess executes all scenarios concurrently within the same process using goroutines. 
func (r *ParallelRunner) runInProcess(ctx context.Context, scenarios []Scenario, opts ParallelOptions) ([]ScenarioResult, error) { if err := ensureParallelOutputDirs(opts); err != nil { return nil, err @@ -318,7 +324,7 @@ func (r *ParallelRunner) runScenario(ctx context.Context, scenario Scenario, opt return result } - seed, err := r.builder.BuildRestartSeed(scenario.InitialState) + seed, err := r.builder.BuildRestartSeed(scenario.EnvironmentState) if err != nil { result.Err = fmt.Errorf("build restart seed: %w", err) return result @@ -407,6 +413,7 @@ func (r *ParallelRunner) runScenarioPhase( phase.Err = fmt.Errorf("fork builder: nil") return phase } + fork.WithUserActions(cloneUserActions(scenario.UserInputs)) fork.SetConfig(cfg) if len(prefix) > 0 && prefixResolver != nil { if err := fork.PrimeVersionStoreFromHistory(prefix, prefixResolver); err != nil { diff --git a/pkg/explore/parallel_runner_test.go b/pkg/explore/parallel_runner_test.go index f00a20e..2753b06 100644 --- a/pkg/explore/parallel_runner_test.go +++ b/pkg/explore/parallel_runner_test.go @@ -60,12 +60,12 @@ func TestParallelRunnerDoesNotLeakConfig(t *testing.T) { scenarios := []Scenario{ { Name: "max-depth-low", - InitialState: state.Clone(), + EnvironmentState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 1}, }, { Name: "max-depth-normal", - InitialState: state.Clone(), + EnvironmentState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 5}, }, } @@ -108,7 +108,7 @@ func TestParallelRunnerWritesDump(t *testing.T) { scenarios := []Scenario{ { Name: "Foo Scenario", - InitialState: state.Clone(), + EnvironmentState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 5}, Context: ScenarioContext{ Workflow: "smoke-workflow", @@ -183,7 +183,7 @@ func TestParallelRunnerCapturesInvariantError(t *testing.T) { scenarios := []Scenario{ { Name: "invariant-fails", - InitialState: state.Clone(), + EnvironmentState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 5}, 
Invariant: func(tracecheck.StateNode) error { return errors.New("invariant failed") @@ -216,7 +216,7 @@ func TestParallelRunnerClosedLoopRunsReferenceThenRerunPerScenario(t *testing.T) scenarios := []Scenario{ { Name: "closed-loop", - InitialState: state.Clone(), + EnvironmentState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 1}, ClosedLoop: &ClosedLoopSpec{ Plan: func(reference ScenarioPhaseResult) ([]ScenarioPhasePlan, error) { @@ -272,7 +272,7 @@ func TestParallelRunnerClosedLoopWritesPhaseDumps(t *testing.T) { scenarios := []Scenario{ { Name: "closed-loop-dump", - InitialState: state.Clone(), + EnvironmentState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 1}, ClosedLoop: &ClosedLoopSpec{ Plan: func(reference ScenarioPhaseResult) ([]ScenarioPhasePlan, error) { @@ -317,7 +317,7 @@ func TestParallelRunnerClosedLoopPrefixHistoryHashesAreDumpable(t *testing.T) { scenarios := []Scenario{ { Name: "closed-loop-prefix-hashes", - InitialState: state.Clone(), + EnvironmentState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 5}, ClosedLoop: &ClosedLoopSpec{ Plan: func(reference ScenarioPhaseResult) ([]ScenarioPhasePlan, error) { @@ -417,7 +417,7 @@ func TestParallelRunnerProcessModeRequiresInputsFile(t *testing.T) { scenarios := []Scenario{ { Name: "x", - InitialState: state.Clone(), + EnvironmentState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 3}, }, } @@ -442,17 +442,17 @@ func TestParallelRunnerChildModeFailsWhenSelectedInputMapsToMultipleScenarios(t scenarios := []Scenario{ { Name: "alpha/base", - InitialState: state.Clone(), + EnvironmentState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 3}, }, { Name: "alpha/single/foo", - InitialState: state.Clone(), + EnvironmentState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 3}, }, { Name: "beta", - InitialState: state.Clone(), + EnvironmentState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 3}, }, } @@ -536,9 +536,9 @@ func 
TestParallelRunnerSupervisorRunsAllChildrenAndAggregatesFailures(t *testing } scenarios := []Scenario{ - {Name: "a", InitialState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 1}}, - {Name: "b", InitialState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 1}}, - {Name: "c", InitialState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 1}}, + {Name: "a", EnvironmentState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 1}}, + {Name: "b", EnvironmentState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 1}}, + {Name: "c", EnvironmentState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 1}}, } results, runErr := runner.RunAll(context.Background(), scenarios, ParallelOptions{MaxParallel: 2}) @@ -591,14 +591,16 @@ func writeInputNamesFile(t *testing.T, names ...string) string { for _, name := range names { inputs = append(inputs, coverage.Input{ Name: name, - Objects: []*unstructured.Unstructured{ - { - Object: map[string]interface{}{ - "apiVersion": "v1", - "kind": "ConfigMap", - "metadata": map[string]interface{}{ - "name": name + "-cm", - "namespace": "default", + EnvironmentState: coverage.EnvironmentState{ + Objects: []*unstructured.Unstructured{ + { + Object: map[string]interface{}{ + "apiVersion": "v1", + "kind": "ConfigMap", + "metadata": map[string]interface{}{ + "name": name + "-cm", + "namespace": "default", + }, }, }, }, diff --git a/pkg/explore/runner.go b/pkg/explore/runner.go index 2cec050..c3f6ac7 100644 --- a/pkg/explore/runner.go +++ b/pkg/explore/runner.go @@ -66,6 +66,15 @@ type Runner struct { builder *tracecheck.ExplorerBuilder } +// RunInput defines a single runner invocation. +// +// EnvironmentState is the user-provided baseline state. +// UserActions represent declarative workload actions applied at boundaries during exploration. 
+type RunInput struct { + EnvironmentState tracecheck.StateNode + UserActions []tracecheck.UserAction +} + // NewRunner constructs a Runner from a configured ExplorerBuilder. func NewRunner(builder *tracecheck.ExplorerBuilder) (*Runner, error) { if builder == nil { @@ -77,13 +86,14 @@ func NewRunner(builder *tracecheck.ExplorerBuilder) (*Runner, error) { } // Run executes the initial exploration and, if interactive is enabled, loops handling restart requests. -func (r *Runner) Run(ctx context.Context, initialState tracecheck.StateNode) error { +func (r *Runner) Run(ctx context.Context, input RunInput) error { if r == nil || r.builder == nil { return fmt.Errorf("explore runner: builder is required") } currentConfig := r.builder.Config() - baseline := initialState.Clone() + baseline := input.EnvironmentState.Clone() + userActions := cloneUserActions(input.UserActions) mergeStates := func(existing, additions []tracecheck.ResultState) []tracecheck.ResultState { out := make([]tracecheck.ResultState, 0, len(existing)+len(additions)) @@ -108,6 +118,7 @@ func (r *Runner) Run(ctx context.Context, initialState tracecheck.StateNode) err runOnce := func(ctx context.Context, state tracecheck.StateNode) (*tracecheck.Result, tracecheck.VersionManager, *tracecheck.ExploreStats, error) { r.builder.SetConfig(currentConfig) + r.builder.WithUserActions(cloneUserActions(userActions)) // get a fresh explorer for each run explorer, err := r.builder.Build("standalone") if err != nil { diff --git a/pkg/explore/runner_test.go b/pkg/explore/runner_test.go index 479b5c5..21112c0 100644 --- a/pkg/explore/runner_test.go +++ b/pkg/explore/runner_test.go @@ -32,7 +32,7 @@ func TestRunnerWritesDumpContext(t *testing.T) { t.Fatalf("new runner: %v", err) } - if err := runner.Run(context.Background(), state); err != nil { + if err := runner.Run(context.Background(), RunInput{EnvironmentState: state}); err != nil { t.Fatalf("run runner: %v", err) } diff --git a/pkg/explore/scenario.go 
b/pkg/explore/scenario.go index cb12636..0f77ec6 100644 --- a/pkg/explore/scenario.go +++ b/pkg/explore/scenario.go @@ -10,11 +10,12 @@ type ScenarioContext struct { // Scenario is the unit produced by input generators and consumed by runners. type Scenario struct { - Name string - InitialState tracecheck.StateNode - Config tracecheck.ExploreConfig - Invariant func(tracecheck.StateNode) error - Context ScenarioContext + Name string + EnvironmentState tracecheck.StateNode + UserInputs []tracecheck.UserAction + Config tracecheck.ExploreConfig + Invariant func(tracecheck.StateNode) error + Context ScenarioContext // ClosedLoop enables per-scenario reference->analyze->rerun pipelines. // The scenario's Config is used for the initial reference phase. ClosedLoop *ClosedLoopSpec diff --git a/pkg/explore/user_actions.go b/pkg/explore/user_actions.go new file mode 100644 index 0000000..6cf0bee --- /dev/null +++ b/pkg/explore/user_actions.go @@ -0,0 +1,20 @@ +package explore + +import "github.com/tgoodwin/kamera/pkg/tracecheck" + +func cloneUserActions(actions []tracecheck.UserAction) []tracecheck.UserAction { + if len(actions) == 0 { + return nil + } + + cloned := make([]tracecheck.UserAction, 0, len(actions)) + for _, action := range actions { + copyAction := action + if action.Payload != nil { + copyAction.Payload = action.Payload.DeepCopyObject() + } + cloned = append(cloned, copyAction) + } + + return cloned +} diff --git a/pkg/tracecheck/async_enqueue_integration_test.go b/pkg/tracecheck/async_enqueue_integration_test.go index 07a6710..05b0ccb 100644 --- a/pkg/tracecheck/async_enqueue_integration_test.go +++ b/pkg/tracecheck/async_enqueue_integration_test.go @@ -197,7 +197,7 @@ func runReconcileStep(t *testing.T, explorer *Explorer, state StateNode, step in nextState, nextSequences, nextEvents := explorer.applyEffects(stepLogger, state, reconcileResult) - newPending := explorer.determineNewPendingReconciles(ctx, state, pr, reconcileResult) + newPending := 
explorer.determineNewPendingReconciles(ctx, state, &pr, reconcileResult) reconcileResult.StateAfter = nextState reconcileResult.KindSeqAfter = nextSequences diff --git a/pkg/tracecheck/explore.go b/pkg/tracecheck/explore.go index da01ece..a6a2c11 100644 --- a/pkg/tracecheck/explore.go +++ b/pkg/tracecheck/explore.go @@ -752,6 +752,53 @@ func (e *Explorer) explore( "NextUserActionIdx", currentState.nextUserActionIdx, ).Info("user action scheduler selected next action") } + if shouldApplyUserAction { + stateView := currentState + stepLogger := logger.WithValues( + "Depth", stateView.depth, + "# Distinct States", e.stats.UniqueNodeVisits, + "Total States", e.stats.TotalNodeVisits, + "Resource States", e.stats.UniqueResourceStates, + ) + stepCtx := log.IntoContext(ctx, stepLogger) + + stepLogger.WithValues("ActionIdx", stateView.nextUserActionIdx).Info("Taking user action step") + stepResult, err := e.takeUserActionStep(stepCtx, stateView) + if err != nil { + stepLogger.Error(err, "error taking user action step; abandoning branch") + failurePath := stateView.ExecutionHistory + if stepResult != nil { + failurePath = append(slices.Clone(stateView.ExecutionHistory), stepResult) + } + if e.emitAbortedState(ctx, abortedStatesCh, stateView, executionPathsToState, failurePath, err) { + return nil + } + continue + } + if stepResult == nil { + stepLogger.Error(nil, "user action step returned nil result") + if e.emitAbortedState(ctx, abortedStatesCh, stateView, executionPathsToState, stateView.ExecutionHistory, errors.New("nil user action result")) { + return nil + } + continue + } + newState, triggeredByStep := e.materializeNextState(stepLogger, stepCtx, stateView, stepResult, nil, stateView.nextUserActionIdx+1) + newState.depth = stateView.depth + 1 + if _, seenDepth := seenDepths[newState.depth]; !seenDepth { + seenDepths[newState.depth] = true + } + + stack = e.enqueueNextStates( + stack, + newState, + triggeredByStep, + subtreeTracker, + useSubtreeCompletion, + 
logOrderingPrune, + orderPruneUseOrderHash, + ) + continue + } // A state is considered converged if: // 1. There are no pending reconciles, OR @@ -840,7 +887,7 @@ func (e *Explorer) explore( } } - // Determine stateView and pendingReconcile based on whether this is a stale view entry + // Determine stateView and pendingReconcile for this reconcile step. var stateView StateNode var pendingReconcile PendingReconcile @@ -918,15 +965,9 @@ func (e *Explorer) explore( stateView = possibleViews[0] } - // Process single stateView (no longer inside a for loop) + // Process single stateView reconcilerID := pendingReconcile.ReconcilerID { - if logger.V(2).Enabled() { - logger.V(2).WithValues("Reconciler", reconcilerID, "StateKey", stateView.Hash(), "OrderKey", stateView.OrderHash(), "Request", pendingReconcile.Request).Info("BEFORE") - stateView.Contents.DumpContents() - stateView.DumpPending() - } - stepLogger := logger.WithValues( "Depth", stateView.depth, "# Distinct States", e.stats.UniqueNodeVisits, @@ -964,42 +1005,14 @@ func (e *Explorer) explore( } } - stepResult.StateBefore = maps.Clone(stateView.Objects()) - stepResult.KindSeqBefore = maps.Clone(stateView.Contents.KindSequences) - - // update the state with the new object versions. - // note that we are updating the "global state" here, - // which may be separate from what the controller saw upon reconciling. 
- newContents, newSequences, newStateEvents := e.applyEffects(stepLogger, stateView, stepResult) - - triggeredByStep := e.getTriggeredReconcilers(stepResult.Changes) - - newPendingReconciles := e.determineNewPendingReconciles(ctx, stateView, pendingReconcile, stepResult) - stepLogger.V(1).WithValues( - "Depth", stateView.depth, - "Count", len(newPendingReconciles), - "Items", newPendingReconciles, - ).Info("final pending reconciles after step") - - // make a copy of the current execution history - currHistory := slices.Clone(stateView.ExecutionHistory) - - stepResult.StateAfter = newContents - stepResult.KindSeqAfter = newSequences - stepResult.PendingReconciles = newPendingReconciles - - newState := StateNode{ - Contents: NewStateSnapshot(newContents, newSequences, newStateEvents), - PendingReconciles: newPendingReconciles, - parent: &stateView, - action: stepResult, - // inherit divergence point from the parent - divergenceKey: stateView.divergenceKey, - stuckReconcilerPositions: maps.Clone(stateView.stuckReconcilerPositions), - ExecutionHistory: append(currHistory, stepResult), - nextUserActionIdx: stateView.nextUserActionIdx, - } - newState.ID = string(newState.Hash()) + newState, triggeredByStep := e.materializeNextState( + stepLogger, + stepCtx, + stateView, + stepResult, + &pendingReconcile, + stateView.nextUserActionIdx, + ) // Track whether this was a no-op (used by ordering optimization) wasNoOp := err == nil && stepResult.wasNoOp() @@ -1031,11 +1044,6 @@ func (e *Explorer) explore( continue } logger.V(1).WithValues("Depth", currentState.depth, "NewPendingReconciles", newState.PendingReconciles).Info("reconcile step completed") - if logger.V(2).Enabled() { - logger.V(2).WithValues("Reconciler", reconcilerID, "StateKey", newState.Hash(), "Request", pendingReconcile.Request).Info("AFTER") - newState.Contents.DumpContents() - newState.DumpPending() - } newState.depth = currentState.depth + 1 if _, seenDepth := seenDepths[newState.depth]; !seenDepth { @@ 
-1056,137 +1064,15 @@ func (e *Explorer) explore( continue } - // Deduplication: Skip exploring paths that reach the same state via equivalent mutations. - ///Skipped - // Key invariant: Same pending list = Same future possibilities = Safe to skip. - // - // ContentsHash includes both object state AND pending reconciles. Two paths only - // match when they have identical pending lists. If the pending lists are identical, - // then the future exploration from both paths would be identical - same controllers - // to run, same state to observe - so exploring both would be redundant. - // - // Importantly, by the time we reach this check, we've already queued all ordering - // variants for the pending list (via expandStateByReconcileOrder at lines 388-410). - // Skipping here doesn't mean "we don't care about orderings" - it means "we've - // already scheduled those orderings to be explored, no need to schedule them again." - // - // At intermediate states, different orderings naturally yield different pending - // lists because whichever reconcile just ran gets removed: - // - // Path A: ...→ Foo@1 → State X, Pending=[Bar] (Foo removed) - // Path B: ...→ Bar@1 → State X, Pending=[Foo] (Bar removed) - // - // Different pending lists → different ContentsHashes → both fully explored. - // - // Pruning typically only occurs at convergence (Pending=[]) where all paths - // collapse to empty pending lists. The paths that get pruned differ only in - // no-op orderings, which by definition cannot produce different outcomes. - - ContentsHash := newState.Hash() - // Deduplication based on completion status: - // Only skip if we've COMPLETED exploration of this (state, history) pair. - // This prevents skipping paths that are still in-flight and might fail, - // which would cause us to miss valid convergence paths. 
- normalizedHistory := newState.ExecutionHistory.UniqueKey() - if e.optimizations != nil && e.optimizations.pathCompleted(ContentsHash, normalizedHistory) { - logger.V(1).WithValues( - "ContentsHash", ContentsHash, - "PathSignature", normalizedHistory, - ).Info("skipping - path already completed exploration") - e.stats.SkippedPaths++ - continue - } - - if e.optimizations != nil { - e.optimizations.markVisited(ContentsHash, normalizedHistory) - e.optimizations.markLogicalState(newState.ContentsHash(), newState.PendingReconciles, normalizedHistory, newState.stuckPositionsSignature()) - } - - // Also track in exploredLogicalStates for cache prediction - - // branch on order of subsequent reconciles that were triggered by this state change step - // Use enqueueWithMarker to track subtree completion for all ordering variants together. - branchStateKey := newState.Hash() - if orderPruneUseOrderHash { - branchStateKey = NodeHash(newState.OrderHash()) - } - var statesToEnqueue []StateNode - - if len(newState.PendingReconciles) > 1 { - // When ordering pruning is enabled, we only expand once per logical state. - alreadyExpanded := e.optimizations != nil && e.optimizations.branchAlreadyExpanded(branchStateKey, triggeredByStep, e.Config.Perturbations.PermuteOrder) - if !alreadyExpanded { - expandedStates := e.expandStateByReconcileOrder(newState, triggeredByStep) - - // Diagnostic logging for non-determinism investigation: - // Log the ordering variants generated to detect if different runs produce different variant orders. 
- if logger.V(2).Enabled() { - if len(expandedStates) > 0 { - variantFirstReconcilers := lo.Map(expandedStates, func(s StateNode, _ int) string { - if len(s.PendingReconciles) > 0 { - pr := s.PendingReconciles[0] - return fmt.Sprintf("%s:%s/%s", pr.ReconcilerID, pr.Request.Namespace, pr.Request.Name) - } - return "empty" - }) - pendingBefore := lo.Map(newState.PendingReconciles, func(pr PendingReconcile, _ int) string { - return fmt.Sprintf("%s:%s/%s", pr.ReconcilerID, pr.Request.Namespace, pr.Request.Name) - }) - logger.V(2).Info("ORDERING_VARIANTS_DIAGNOSTIC", - "depth", newState.depth, - "numVariants", len(expandedStates), - "pendingBefore", pendingBefore, - "variantFirstReconcilers", variantFirstReconcilers, - ) - } - } - - for _, orderVariant := range expandedStates { - // skip orderVariants whose first reconcile are known no-ops - if e.optimizations != nil && e.optimizations.noOpOrderingSkipEnabled() { - fst := orderVariant.PendingReconciles[0] - noOpKey := fmt.Sprintf("%s:%s:%s", orderVariant.ContentsHash(), fst.ReconcilerID, fst.Request.NamespacedName.String()) - if isNoOp, known := e.optimizations.isKnownNoOp(noOpKey); known && isNoOp { - e.stats.SkippedNoOpOrderings++ - continue - } - } - statesToEnqueue = append(statesToEnqueue, orderVariant) - } - if e.optimizations != nil { - e.optimizations.markBranchExpanded(branchStateKey, triggeredByStep, e.Config.Perturbations.PermuteOrder) - } - } else if e.optimizations != nil { - if logOrderingPrune { - pendingIDs := make([]string, len(newState.PendingReconciles)) - for i, pr := range newState.PendingReconciles { - pendingIDs[i] = fmt.Sprintf("%s:%s/%s", pr.ReconcilerID, pr.Request.Namespace, pr.Request.Name) - } - triggeredIDs := make([]string, len(triggeredByStep)) - for i, pr := range triggeredByStep { - triggeredIDs[i] = fmt.Sprintf("%s:%s/%s", pr.ReconcilerID, pr.Request.Namespace, pr.Request.Name) - } - logger.Info("ordering pruning: skip expansion for already-expanded state", - "depth", newState.depth, - 
"branchKey", branchStateKey, - "nodeHash", newState.Hash(), - "orderHash", newState.OrderHash(), - "orderSensitiveKey", orderPruneUseOrderHash, - "contentsHash", newState.ContentsHash(), - "permuteSignature", e.optimizations.permuteSignature(triggeredByStep, e.Config.Perturbations.PermuteOrder), - "pending", pendingIDs, - "triggeredByStep", triggeredIDs, - ) - } - e.stats.SkippedOrderExpansions++ - } - } - - // Always include the base state - statesToEnqueue = append(statesToEnqueue, newState) - - // Enqueue all variants together with a single marker for the logical state - stack, _ = e.enqueueStates(stack, subtreeTracker, statesToEnqueue, useSubtreeCompletion) + stack = e.enqueueNextStates( + stack, + newState, + triggeredByStep, + subtreeTracker, + useSubtreeCompletion, + logOrderingPrune, + orderPruneUseOrderHash, + ) } } @@ -1204,6 +1090,165 @@ func (e *Explorer) explore( return nil } +// enqueueNextStates handles post-step deduplication and pending-order expansion. +func (e *Explorer) enqueueNextStates( + stack []stackEntry, + newState StateNode, + triggeredByStep []PendingReconcile, + subtreeTracker *subtreeTracker, + useSubtreeCompletion bool, + logOrderingPrune bool, + orderPruneUseOrderHash bool, +) []stackEntry { + // Deduplication: skip only when this (state, history) has already completed. + // + // Key invariant: same pending list means same future possibilities. + // ContentsHash includes object state + pending reconciles, so two paths only + // collide when they have identical pending lists, which implies identical future + // exploration from that point. + // + // This is safe with ordering expansion because we queue ordering variants before + // this dedup would hide anything. At intermediate states, different run orders + // naturally produce different pending lists (the consumed reconcile differs), so + // they remain distinct and continue to be explored. 
+ contentsHash := newState.Hash() + normalizedHistory := newState.ExecutionHistory.UniqueKey() + if e.optimizations != nil && e.optimizations.pathCompleted(contentsHash, normalizedHistory) { + logger.V(1).WithValues( + "ContentsHash", contentsHash, + "PathSignature", normalizedHistory, + ).Info("skipping - path already completed exploration") + e.stats.SkippedPaths++ + return stack + } + + if e.optimizations != nil { + e.optimizations.markVisited(contentsHash, normalizedHistory) + e.optimizations.markLogicalState( + newState.ContentsHash(), + newState.PendingReconciles, + normalizedHistory, + newState.stuckPositionsSignature(), + ) + } + + branchStateKey := newState.Hash() + if orderPruneUseOrderHash { + branchStateKey = NodeHash(newState.OrderHash()) + } + statesToEnqueue := make([]StateNode, 0, 1) + + if len(newState.PendingReconciles) > 1 { + // Order expansion is per logical state branch. If already expanded, we skip + // duplicative expansion work for equivalent branches. + alreadyExpanded := e.optimizations != nil && + e.optimizations.branchAlreadyExpanded(branchStateKey, triggeredByStep, e.Config.Perturbations.PermuteOrder) + if !alreadyExpanded { + expandedStates := e.expandStateByReconcileOrder(newState, triggeredByStep) + + if logger.V(2).Enabled() && len(expandedStates) > 0 { + variantFirstReconcilers := lo.Map(expandedStates, func(s StateNode, _ int) string { + if len(s.PendingReconciles) > 0 { + pr := s.PendingReconciles[0] + return fmt.Sprintf("%s:%s/%s", pr.ReconcilerID, pr.Request.Namespace, pr.Request.Name) + } + return "empty" + }) + pendingBefore := lo.Map(newState.PendingReconciles, func(pr PendingReconcile, _ int) string { + return fmt.Sprintf("%s:%s/%s", pr.ReconcilerID, pr.Request.Namespace, pr.Request.Name) + }) + logger.V(2).Info("ORDERING_VARIANTS_DIAGNOSTIC", + "depth", newState.depth, + "numVariants", len(expandedStates), + "pendingBefore", pendingBefore, + "variantFirstReconcilers", variantFirstReconcilers, + ) + } + + for _, 
orderVariant := range expandedStates { + if e.optimizations != nil && e.optimizations.noOpOrderingSkipEnabled() { + fst := orderVariant.PendingReconciles[0] + noOpKey := fmt.Sprintf("%s:%s:%s", orderVariant.ContentsHash(), fst.ReconcilerID, fst.Request.NamespacedName.String()) + if isNoOp, known := e.optimizations.isKnownNoOp(noOpKey); known && isNoOp { + e.stats.SkippedNoOpOrderings++ + continue + } + } + statesToEnqueue = append(statesToEnqueue, orderVariant) + } + if e.optimizations != nil { + e.optimizations.markBranchExpanded(branchStateKey, triggeredByStep, e.Config.Perturbations.PermuteOrder) + } + } else if e.optimizations != nil { + if logOrderingPrune { + pendingIDs := make([]string, len(newState.PendingReconciles)) + for i, pr := range newState.PendingReconciles { + pendingIDs[i] = fmt.Sprintf("%s:%s/%s", pr.ReconcilerID, pr.Request.Namespace, pr.Request.Name) + } + triggeredIDs := make([]string, len(triggeredByStep)) + for i, pr := range triggeredByStep { + triggeredIDs[i] = fmt.Sprintf("%s:%s/%s", pr.ReconcilerID, pr.Request.Namespace, pr.Request.Name) + } + logger.Info("ordering pruning: skip expansion for already-expanded state", + "depth", newState.depth, + "branchKey", branchStateKey, + "nodeHash", newState.Hash(), + "orderHash", newState.OrderHash(), + "orderSensitiveKey", orderPruneUseOrderHash, + "contentsHash", newState.ContentsHash(), + "permuteSignature", e.optimizations.permuteSignature(triggeredByStep, e.Config.Perturbations.PermuteOrder), + "pending", pendingIDs, + "triggeredByStep", triggeredIDs, + ) + } + e.stats.SkippedOrderExpansions++ + } + } + + statesToEnqueue = append(statesToEnqueue, newState) + stack, _ = e.enqueueStates(stack, subtreeTracker, statesToEnqueue, useSubtreeCompletion) + return stack +} + +func (e *Explorer) materializeNextState( + stepLogger logr.Logger, + stepCtx context.Context, + stateView StateNode, + stepResult *ReconcileResult, + consumed *PendingReconcile, + nextUserActionIdx int, +) (StateNode, 
[]PendingReconcile) { + stepResult.StateBefore = maps.Clone(stateView.Objects()) + stepResult.KindSeqBefore = maps.Clone(stateView.Contents.KindSequences) + + newContents, newSequences, newStateEvents := e.applyEffects(stepLogger, stateView, stepResult) + triggeredByStep := e.getTriggeredReconcilers(stepResult.Changes) + newPendingReconciles := e.determineNewPendingReconciles(stepCtx, stateView, consumed, stepResult) + stepLogger.V(1).WithValues( + "Depth", stateView.depth, + "Count", len(newPendingReconciles), + "Items", newPendingReconciles, + ).Info("final pending reconciles after step") + + stepResult.StateAfter = newContents + stepResult.KindSeqAfter = newSequences + stepResult.PendingReconciles = newPendingReconciles + + newState := StateNode{ + Contents: NewStateSnapshot(newContents, newSequences, newStateEvents), + PendingReconciles: newPendingReconciles, + parent: &stateView, + action: stepResult, + divergenceKey: stateView.divergenceKey, + stuckReconcilerPositions: maps.Clone(stateView.stuckReconcilerPositions), + ExecutionHistory: append(slices.Clone(stateView.ExecutionHistory), stepResult), + nextUserActionIdx: nextUserActionIdx, + } + newState.ID = string(newState.Hash()) + + return newState, triggeredByStep +} + // emitAbortedState records an aborted exploration branch and attempts to send it on the channel. // Returns true if the context was cancelled before the send completed. func (e *Explorer) emitAbortedState( @@ -1468,6 +1513,42 @@ func (e *Explorer) takeReconcileStep(ctx context.Context, state StateNode, pr Pe return reconcileResult, nil } +// takeUserActionStep executes the next user action for this branch directly +// without routing it through a synthetic pending reconcile. 
+func (e *Explorer) takeUserActionStep(ctx context.Context, state StateNode) (*ReconcileResult, error) { + stepLog := log.FromContext(ctx) + startWall := time.Now() + defer func() { + if e.stats != nil && e.Config != nil && e.Config.RecordPerfStats { + e.stats.RecordStep(UserControllerID, time.Since(startWall)) + } + }() + + if e.userController == nil { + return nil, fmt.Errorf("user controller is nil") + } + + restoreClock := simclock.SetDepth(state.depth) + defer restoreClock() + + stepLog.WithValues("ActionIdx", state.nextUserActionIdx).V(2).Info("about to execute user action") + + result, err := e.userController.ExecuteNextAction(ctx, state.Objects(), state.nextUserActionIdx) + if err != nil { + if result == nil { + result = &ReconcileResult{ + ControllerID: UserControllerID, + FrameType: FrameTypeExplore, + Error: err.Error(), + } + } + stepLog.WithValues("ActionIdx", state.nextUserActionIdx).Error(err, "error executing user action") + return result, err + } + + return result, nil +} + func (e *Explorer) getNewPendingReconciles(currPending, triggered []PendingReconcile) []PendingReconcile { // Ordering: existing pending first, then newly triggered. // This prevents reconcilers that frequently requeue/trigger from starving others. 
@@ -1628,30 +1709,36 @@ func dumpQueue(queue []StateNode) []string { return queueStr } -func (e *Explorer) determineNewPendingReconciles(ctx context.Context, state StateNode, reconcileInput PendingReconcile, result *ReconcileResult) []PendingReconcile { +func (e *Explorer) determineNewPendingReconciles(ctx context.Context, state StateNode, consumed *PendingReconcile, result *ReconcileResult) []PendingReconcile { stepLog := log.FromContext(ctx) - // INVARIANT 3: The reconciler taking the step should be present in the previous state's pending reconciles - reconcilerWasPending := false - for _, pr := range state.PendingReconciles { - if pr.ReconcilerID == reconcileInput.ReconcilerID && - pr.Request.NamespacedName == reconcileInput.Request.NamespacedName { - reconcilerWasPending = true - break + stillPending := slices.Clone(state.PendingReconciles) + reconcilerID := "UserController" + // consumed is nil for user action steps, and non-nil for reconcile steps which were once pending. + if consumed != nil { + reconcilerID = string(consumed.ReconcilerID) + // INVARIANT 3: The reconciler taking the step should be present in the previous state's pending reconciles. 
+ reconcilerWasPending := false + for _, pr := range state.PendingReconciles { + if pr.ReconcilerID == consumed.ReconcilerID && + pr.Request.NamespacedName == consumed.Request.NamespacedName { + reconcilerWasPending = true + break + } + } + if !reconcilerWasPending { + stepLog.Error(nil, "INVARIANT VIOLATION: reconciler took step but was not in pending queue", + "reconcilerID", consumed.ReconcilerID, + "request", consumed.Request.NamespacedName, + "pendingCount", len(state.PendingReconciles), + "depth", state.depth) } - } - if !reconcilerWasPending { - stepLog.Error(nil, "INVARIANT VIOLATION: reconciler took step but was not in pending queue", - "reconcilerID", reconcileInput.ReconcilerID, - "request", reconcileInput.Request.NamespacedName, - "pendingCount", len(state.PendingReconciles), - "depth", state.depth) - } - // remove the current reconcile from the pending reconciles list because it has just been processed - stillPending := lo.Filter(state.PendingReconciles, func(pending PendingReconcile, _ int) bool { - return pending != reconcileInput - }) + // Remove the reconcile that just ran. + stillPending = lo.Filter(state.PendingReconciles, func(pending PendingReconcile, _ int) bool { + return pending != *consumed + }) + } // Read captured enqueues from the global collector (from Watch callbacks during reconcile). // Get() automatically clears the collector after returning, so it's ready for the next step. 
@@ -1661,7 +1748,7 @@ func (e *Explorer) determineNewPendingReconciles(ctx context.Context, state Stat stepLog.V(1).Info("captured async enqueues from tickers", "count", len(capturedPending), "depth", state.depth, - "reconciler", reconcileInput.ReconcilerID, + "ReconcilerID", reconcilerID, "enqueues", capturedPending) } @@ -1675,7 +1762,7 @@ func (e *Explorer) determineNewPendingReconciles(ctx context.Context, state Stat return pr.String() }) logger.WithValues( - "ReconcilerID", reconcileInput.ReconcilerID, + "ReconcilerID", reconcilerID, "TriggeredReconcilers", triggeredIDs, "NumChanges", len(result.Changes.ObjectVersions), ).V(1).Info("reconcilers triggered by changes") @@ -1707,10 +1794,10 @@ func (e *Explorer) determineNewPendingReconciles(ctx context.Context, state Stat // if the controller returned a response with Requeue = true, // we need to requeue the original request, no matter what. - if result.ctrlRes.Requeue { + if consumed != nil && result.ctrlRes.Requeue { requeued := PendingReconcile{ - ReconcilerID: reconcileInput.ReconcilerID, - Request: reconcileInput.Request, + ReconcilerID: consumed.ReconcilerID, + Request: consumed.Request, Source: SourceRequeue, } triggeredByChanges = append(triggeredByChanges, requeued) @@ -1721,11 +1808,11 @@ func (e *Explorer) determineNewPendingReconciles(ctx context.Context, state Stat // INVARIANT 1: If the step had no writes, no new pending reconciles should have "State Change" source wasNoOp := result.wasNoOp() - if wasNoOp { + if consumed != nil && wasNoOp { for _, triggered := range triggeredByChanges { if triggered.Source == SourceStateChange { stepLog.Error(nil, "INVARIANT VIOLATION: no-op step triggered State Change reconcile", - "reconcilerID", reconcileInput.ReconcilerID, + "reconcilerID", consumed.ReconcilerID, "triggeredReconciler", triggered.ReconcilerID, "triggeredRequest", triggered.Request.NamespacedName, "depth", state.depth, @@ -1737,25 +1824,28 @@ func (e *Explorer) determineNewPendingReconciles(ctx 
context.Context, state Stat // INVARIANT 2: Only the reconciler that just took the step should be removed. // Check that all items in stillPending that were in the original pending are still present, // and the only thing removed is the reconcileInput. - for _, originalPending := range state.PendingReconciles { - if originalPending == reconcileInput { - continue // This one should be removed - } - // Check if it's still pending (either in stillPending or re-added through triggeredByChanges) - foundInStillPending := false - for _, sp := range stillPending { - if sp.ReconcilerID == originalPending.ReconcilerID && - sp.Request.NamespacedName == originalPending.Request.NamespacedName { - foundInStillPending = true - break + // TODO move this out of the hot path and under unit test coverage. + if consumed != nil { + for _, originalPending := range state.PendingReconciles { + if originalPending == *consumed { + continue // This one should be removed + } + // Check if it's still pending (either in stillPending or re-added through triggeredByChanges) + foundInStillPending := false + for _, sp := range stillPending { + if sp.ReconcilerID == originalPending.ReconcilerID && + sp.Request.NamespacedName == originalPending.Request.NamespacedName { + foundInStillPending = true + break + } + } + if !foundInStillPending { + stepLog.Error(nil, "INVARIANT VIOLATION: pending reconcile mysteriously removed (not the one that took the step)", + "removedReconciler", originalPending.ReconcilerID, + "removedRequest", originalPending.Request.NamespacedName, + "stepTakenBy", consumed.ReconcilerID, + "depth", state.depth) } - } - if !foundInStillPending { - stepLog.Error(nil, "INVARIANT VIOLATION: pending reconcile mysteriously removed (not the one that took the step)", - "removedReconciler", originalPending.ReconcilerID, - "removedRequest", originalPending.Request.NamespacedName, - "stepTakenBy", reconcileInput.ReconcilerID, - "depth", state.depth) } } diff --git 
a/pkg/tracecheck/explore_test.go b/pkg/tracecheck/explore_test.go index ecdb488..4858d09 100644 --- a/pkg/tracecheck/explore_test.go +++ b/pkg/tracecheck/explore_test.go @@ -384,7 +384,7 @@ func Test_determineNewPendingReconciles(t *testing.T) { stuckReconcilerPositions: tt.stuckReconcilerPositions, } - actual := e.determineNewPendingReconciles(context.Background(), state, tt.pendingReconcile, tt.result) + actual := e.determineNewPendingReconciles(context.Background(), state, &tt.pendingReconcile, tt.result) assert.Equal(t, tt.expected, actual) }) } From 7ea3e57f56fa73c5ad91a7ff2f7cae0c61b9b3b1 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Mon, 23 Feb 2026 17:54:57 -0800 Subject: [PATCH 07/29] cleaning --- pkg/tracecheck/explore.go | 11 ++--------- pkg/tracecheck/user_controller.go | 2 +- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/pkg/tracecheck/explore.go b/pkg/tracecheck/explore.go index a6a2c11..3fe4b1a 100644 --- a/pkg/tracecheck/explore.go +++ b/pkg/tracecheck/explore.go @@ -1702,21 +1702,14 @@ func (e *Explorer) getPossibleViewsForReconcile(currState StateNode, reconcilerI return filtered, nil } -func dumpQueue(queue []StateNode) []string { - queueStr := lo.Map(queue, func(sn StateNode, _ int) string { - return string(sn.OrderHash()) - }) - return queueStr -} - func (e *Explorer) determineNewPendingReconciles(ctx context.Context, state StateNode, consumed *PendingReconcile, result *ReconcileResult) []PendingReconcile { stepLog := log.FromContext(ctx) stillPending := slices.Clone(state.PendingReconciles) - reconcilerID := "UserController" + reconcilerID := UserControllerID // consumed is nil for user action steps, and non-nil for reconcile steps which were once pending. if consumed != nil { - reconcilerID = string(consumed.ReconcilerID) + reconcilerID = consumed.ReconcilerID // INVARIANT 3: The reconciler taking the step should be present in the previous state's pending reconciles. 
reconcilerWasPending := false for _, pr := range state.PendingReconciles { diff --git a/pkg/tracecheck/user_controller.go b/pkg/tracecheck/user_controller.go index 7284582..6a89f75 100644 --- a/pkg/tracecheck/user_controller.go +++ b/pkg/tracecheck/user_controller.go @@ -13,7 +13,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" ) -const UserControllerID ReconcilerID = "UserController" +const UserControllerID ReconcilerID = "External User" type UserAction struct { ID string `json:"id"` From 62e663cda6b3fa82ef79674acff92d6ad6440d68 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Mon, 23 Feb 2026 17:55:36 -0800 Subject: [PATCH 08/29] check in knative examples --- examples/knative-serving/inputs-example.json | 2610 ++++++++--------- examples/knative-serving/one-input.json | 49 + .../knative-serving/two-step-workflow.json | 82 + 3 files changed, 1404 insertions(+), 1337 deletions(-) create mode 100644 examples/knative-serving/one-input.json create mode 100644 examples/knative-serving/two-step-workflow.json diff --git a/examples/knative-serving/inputs-example.json b/examples/knative-serving/inputs-example.json index 6929979..e3c487e 100644 --- a/examples/knative-serving/inputs-example.json +++ b/examples/knative-serving/inputs-example.json @@ -1,1625 +1,1561 @@ [ { "name": "knative-default/base", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "0" - }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] - } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" - } - } - ], + 
"environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/single/image-v2", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": "dev.local/test:v2", - "name": "", - "resources": {} - } - ] + "spec": { + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/single/image-v2", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/single/image-v3", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - 
"autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": "dev.local/test:v3", - "name": "", - "resources": {} - } - ] + "spec": { + "containers": [ + { + "image": "dev.local/test:v2", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/single/image-v3", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/single/min-scale-1", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "1" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + 
"autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containers": [ + { + "image": "dev.local/test:v3", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/single/min-scale-1", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/single/min-scale-2", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "2" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "1" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + 
{ + "name": "knative-default/single/min-scale-2", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/single/max-scale-1", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/max-scale": "1", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "2" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/single/max-scale-1", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/single/max-scale-5", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - 
"annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/max-scale": "5", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/max-scale": "1", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/single/max-scale-5", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/single/max-scale-10", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/max-scale": "10", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" 
+ }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/max-scale": "5", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/single/max-scale-10", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/single/concurrency-0", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/max-scale": "10", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containerConcurrency": 0, - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} 
- } - ] + "spec": { + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/single/concurrency-0", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/single/concurrency-1", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containerConcurrency": 1, - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containerConcurrency": 0, + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/single/concurrency-1", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, 
"permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/single/concurrency-10", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containerConcurrency": 10, - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containerConcurrency": 1, + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/single/concurrency-10", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/single/concurrency-100", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - 
"autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containerConcurrency": 100, - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containerConcurrency": 10, + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/single/concurrency-100", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-001/max-scale-10", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/max-scale": "10", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": 
"kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containerConcurrency": 100, + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-001/max-scale-10", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-002/image-v2", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/max-scale": "10", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": "dev.local/test:v2", - "name": "", - "resources": {} - } - ] + "spec": { + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": 
{} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-002/image-v2", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-003/concurrency-1", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containerConcurrency": 1, - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containers": [ + { + "image": "dev.local/test:v2", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-003/concurrency-1", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-004/concurrency-100+min-scale-2", - "objects": [ - 
{ - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "2" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containerConcurrency": 100, - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containerConcurrency": 1, + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-004/concurrency-100+min-scale-2", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-005/concurrency-0", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + 
"object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "2" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containerConcurrency": 0, - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containerConcurrency": 100, + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-005/concurrency-0", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-006/image-v2+max-scale-10", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/max-scale": "10", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": 
"kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": "dev.local/test:v2", - "name": "", - "resources": {} - } - ] + "spec": { + "containerConcurrency": 0, + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-006/image-v2+max-scale-10", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-007/concurrency-10+min-scale-1", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "1" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/max-scale": "10", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containerConcurrency": 10, - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containers": [ + { + "image": "dev.local/test:v2", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": 
{ - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-007/concurrency-10+min-scale-1", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-008/concurrency-0+min-scale-1", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "1" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "1" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containerConcurrency": 0, - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containerConcurrency": 10, + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-008/concurrency-0+min-scale-1", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-009/max-scale-1", - "objects": [ - { - "apiVersion": 
"serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/max-scale": "1", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "1" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containerConcurrency": 0, + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-009/max-scale-1", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-010/min-scale-2", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "2" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": 
"serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/max-scale": "1", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-010/min-scale-2", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-011/min-scale-1", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "1" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "2" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": 
"dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-011/min-scale-1", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-012/concurrency-1+max-scale-10", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/max-scale": "10", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "1" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containerConcurrency": 1, - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": 
"knative-default/sampled-012/concurrency-1+max-scale-10", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-013/concurrency-10", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/max-scale": "10", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containerConcurrency": 10, - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containerConcurrency": 1, + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-013/concurrency-10", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-014/max-scale-5", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", 
- "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/max-scale": "5", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containerConcurrency": 10, + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-014/max-scale-5", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-015/image-v3+min-scale-2", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "2" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + 
"name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/max-scale": "5", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": "dev.local/test:v3", - "name": "", - "resources": {} - } - ] + "spec": { + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-015/image-v3+min-scale-2", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-016/image-v3", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "2" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": "dev.local/test:v3", - "name": "", - "resources": {} - } - ] + 
"spec": { + "containers": [ + { + "image": "dev.local/test:v3", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-016/image-v3", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-017/concurrency-1+image-v3", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containerConcurrency": 1, - "containers": [ - { - "image": "dev.local/test:v3", - "name": "", - "resources": {} - } - ] + "spec": { + "containers": [ + { + "image": "dev.local/test:v3", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-017/concurrency-1+image-v3", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, 
"permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-018/concurrency-1+min-scale-2", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "2" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containerConcurrency": 1, - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containerConcurrency": 1, + "containers": [ + { + "image": "dev.local/test:v3", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-018/concurrency-1+min-scale-2", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-019/image-v2+min-scale-1", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": 
"kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/min-scale": "1" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "2" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containers": [ - { - "image": "dev.local/test:v2", - "name": "", - "resources": {} - } - ] + "spec": { + "containerConcurrency": 1, + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-019/image-v2+min-scale-1", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } - }, - { - "name": "knative-default/sampled-020/concurrency-1+max-scale-1", - "objects": [ - { - "apiVersion": "serving.knative.dev/v1", - "kind": "Service", - "metadata": { - "name": "demo", - "namespace": "default" - }, - "spec": { - "template": { - "metadata": { - "annotations": { - "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", - "autoscaling.knative.dev/initial-scale": "1", - "autoscaling.knative.dev/max-scale": "1", - "autoscaling.knative.dev/min-scale": "0" + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + 
"autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "1" + }, + "name": "kamera-test" }, - "name": "kamera-test" - }, - "spec": { - "containerConcurrency": 1, - "containers": [ - { - "image": "dev.local/test", - "name": "", - "resources": {} - } - ] + "spec": { + "containers": [ + { + "image": "dev.local/test:v2", + "name": "", + "resources": {} + } + ] + } } - } - }, - "status": {} - } - ], - "pending": [ - { - "controllerId": "ServiceReconciler", - "key": { - "namespace": "default", - "name": "demo" + }, + "status": {} } } - ], + ] + }, + { + "name": "knative-default/sampled-020/concurrency-1+max-scale-1", + "environmentState": { + "objects": [] + }, "tuning": { "maxDepth": 0, "permuteControllers": null, "staleReads": null, "staleLookback": null - } + }, + "userInputs": [ + { + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/max-scale": "1", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" + }, + "spec": { + "containerConcurrency": 1, + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } + } + }, + "status": {} + } + } + ] } -] \ No newline at end of file +] diff --git a/examples/knative-serving/one-input.json b/examples/knative-serving/one-input.json new file mode 100644 index 0000000..13f6629 --- /dev/null +++ b/examples/knative-serving/one-input.json @@ -0,0 +1,49 @@ +[ + { + "name": "knative-default/base", + "environmentState": { + "objects": [] + }, + "tuning": { + "maxDepth": 0, + "permuteControllers": null, + "staleReads": null, + "staleLookback": null + }, + "userInputs": [ + 
{ + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" + }, + "spec": { + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } + } + } + } + } + ] + } +] diff --git a/examples/knative-serving/two-step-workflow.json b/examples/knative-serving/two-step-workflow.json new file mode 100644 index 0000000..d3377b8 --- /dev/null +++ b/examples/knative-serving/two-step-workflow.json @@ -0,0 +1,82 @@ +[ + { + "name": "knative-default/two-step-workflow", + "environmentState": { + "objects": [] + }, + "tuning": { + "maxDepth": 0, + "permuteControllers": null, + "staleReads": null, + "staleLookback": null + }, + "userInputs": [ + { + "id": "initial create", + "type": "CREATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "1", + "autoscaling.knative.dev/min-scale": "0" + }, + "name": "kamera-test" + }, + "spec": { + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } + } + } + } + }, + { + "id": "update to 2 replicas", + "type": "UPDATE", + "object": { + "apiVersion": "serving.knative.dev/v1", + "kind": "Service", + "metadata": { + "name": "demo", + "namespace": "default" + }, + "spec": { + "template": { + "metadata": { + "annotations": { + "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", + "autoscaling.knative.dev/initial-scale": "2", + 
"autoscaling.knative.dev/min-scale": "2" + }, + "name": "kamera-test" + }, + "spec": { + "containers": [ + { + "image": "dev.local/test", + "name": "", + "resources": {} + } + ] + } + } + } + } + } + ] + } +] \ No newline at end of file From 08a0b79e49e298212a32724c2f8cac5f1540b871 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Mon, 23 Feb 2026 17:55:55 -0800 Subject: [PATCH 09/29] update old design docs to be consistent with current approach --- ...026-01-30-coverage-hotspot-input-design.md | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/docs/plans/2026-01-30-coverage-hotspot-input-design.md b/docs/plans/2026-01-30-coverage-hotspot-input-design.md index 76ffbdf..7db9c05 100644 --- a/docs/plans/2026-01-30-coverage-hotspot-input-design.md +++ b/docs/plans/2026-01-30-coverage-hotspot-input-design.md @@ -4,7 +4,7 @@ Define a concrete, deterministic translation from a `HotspotInstance` (from static dependency graphs) to a simple `Input` representation suitable for later conversion into a `Scenario`. This step intentionally excludes “dimensions of variation,” which will be handled later. ## Scope -- **In:** Hotspot → Input translation, object materialization from the input map, pending reconcile construction, and tuning fields. +- **In:** Hotspot → Input translation, object materialization from the input map, and tuning fields. - **Out:** Dimension expansion (Input → []Input) and any tracecheck-specific configuration details. ## Data Types (pkg/coverage) @@ -12,20 +12,20 @@ Keep the translation layer independent of `tracecheck`. 
``` type Input struct { - Name string - Objects []*unstructured.Unstructured - Pending []Pending - Tuning InputTuning + Name string + EnvironmentState EnvironmentState + UserInputs []UserInput + Tuning InputTuning } -type Pending struct { - ControllerID string - Key NamespacedName +type EnvironmentState struct { + Objects []*unstructured.Unstructured } -type NamespacedName struct { - Namespace string - Name string +type UserInput struct { + ID string + Type string + Object *unstructured.Unstructured } type InputTuning struct { @@ -48,7 +48,6 @@ type InputTemplate struct { ``` Notes: -- `Pending` is explicit and deterministic; conversion to `tracecheck.PendingReconcile` happens later. - `StaleReads` is keyed by controller ID; `StaleLookback` is keyed by canonical GroupKind. ## Input Map Assumption @@ -78,10 +77,8 @@ Inputs: `HotspotInstance`, dependency graph lookup, input map. - Deduplicate by GVK. 2. **Resolve objects:** - For each GVK, look up a template, deep-copy, normalize, and store. -3. **Build Pending:** - - For each hotspot controller, find its **primary reconciles target** in the graph. - - If multiple reconciles targets exist, emit a **warning** and choose deterministically (lexicographic GVK). - - Create a `Pending` using the normalized object’s name/namespace for that GVK. +3. **Build user inputs:** + - For each resolved resource, create a `UserInput` with `type=CREATE` and the normalized object. 4. **Set Tuning (compact hints):** - **Multi-writer / Diamond / Feedback cycle:** `PermuteControllers = hotspot.Controllers`. - **Missing trigger / Reducer:** `StaleReads[reader] = []groupKind{...}` for referenced inputs. 
From 1b1c66c2aedb59c4fdd6e577c31b5a10aabf3388 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Mon, 23 Feb 2026 18:01:07 -0800 Subject: [PATCH 10/29] update knative harness --- examples/knative-serving/main.go | 20 +- examples/knative-serving/main_batch_test.go | 18 +- examples/knative-serving/scenario.go | 214 ++++++++++++++------ examples/knative-serving/scenario_test.go | 38 ++-- 4 files changed, 187 insertions(+), 103 deletions(-) diff --git a/examples/knative-serving/main.go b/examples/knative-serving/main.go index 5c283e6..22eba4d 100644 --- a/examples/knative-serving/main.go +++ b/examples/knative-serving/main.go @@ -175,14 +175,24 @@ func main() { } opts := explore.ParallelOptions{DumpDir: explore.DumpPath(), StatsDir: explore.DumpStatsPath()} - fmt.Fprintln(os.Stderr, "closed-loop scaffold: running per-input reference->rerun pipelines") - scenarios, err := scenariosFromInputsWithClosedLoop(builder, inputs) + if explore.PerturbEnabled() { + fmt.Fprintln(os.Stderr, "closed-loop scaffold: running per-input reference->rerun pipelines") + } else { + fmt.Fprintln(os.Stderr, "single-pass scaffold: running per-input reference-only simulation") + } + + builderFn := scenariosFromInputsWithClosedLoop + if !explore.PerturbEnabled() { + builderFn = scenariosFromInputs + } + + scenarios, err := builderFn(builder, inputs) if err != nil { - fmt.Fprintf(os.Stderr, "build closed-loop scenarios: %v\n", err) + fmt.Fprintf(os.Stderr, "build scenarios: %v\n", err) os.Exit(1) } if _, err := runner.RunAll(ctx, scenarios, opts); err != nil { - fmt.Fprintf(os.Stderr, "closed-loop batch run error: %v\n", err) + fmt.Fprintf(os.Stderr, "batch run error: %v\n", err) os.Exit(1) } return @@ -193,7 +203,7 @@ func main() { fmt.Fprintf(os.Stderr, "runner setup error: %v\n", err) os.Exit(1) } - if err := runner.Run(ctx, initialState); err != nil { + if err := runner.Run(ctx, explore.RunInput{EnvironmentState: initialState}); err != nil { fmt.Fprintf(os.Stderr, "session error: %v\n", err) 
os.Exit(1) } diff --git a/examples/knative-serving/main_batch_test.go b/examples/knative-serving/main_batch_test.go index ca3ad49..d1a3221 100644 --- a/examples/knative-serving/main_batch_test.go +++ b/examples/knative-serving/main_batch_test.go @@ -36,7 +36,7 @@ func TestBatchInputsForRunParallelWithoutInputsUsesDefaults(t *testing.T) { if inputs[0].Name != "knative-default/base" { t.Fatalf("expected first default scenario name, got %#v", inputs[0].Name) } - if len(inputs[0].Objects) == 0 { + if len(inputs[0].EnvironmentState.Objects) == 0 { t.Fatalf("expected default input objects, got %#v", inputs[0]) } } @@ -146,13 +146,15 @@ func validInputs(name string) []coverage.Input { return []coverage.Input{ { Name: name, - Objects: []*unstructured.Unstructured{ - { - Object: map[string]any{ - "apiVersion": "v1", - "kind": "ConfigMap", - "metadata": map[string]any{ - "name": "sample", + EnvironmentState: coverage.EnvironmentState{ + Objects: []*unstructured.Unstructured{ + { + Object: map[string]any{ + "apiVersion": "v1", + "kind": "ConfigMap", + "metadata": map[string]any{ + "name": "sample", + }, }, }, }, diff --git a/examples/knative-serving/scenario.go b/examples/knative-serving/scenario.go index 98066fc..fee23e1 100644 --- a/examples/knative-serving/scenario.go +++ b/examples/knative-serving/scenario.go @@ -13,6 +13,7 @@ import ( knativeharness "github.com/tgoodwin/kamera/examples/knative-serving/knative" "github.com/tgoodwin/kamera/pkg/coverage" + "github.com/tgoodwin/kamera/pkg/event" "github.com/tgoodwin/kamera/pkg/explore" "github.com/tgoodwin/kamera/pkg/replay" "github.com/tgoodwin/kamera/pkg/simclock" @@ -236,16 +237,21 @@ func scenariosFromInputsForPhase( includePerturbations := phase != scenarioPhaseReference scenarios := make([]explore.Scenario, 0, len(inputs)) for idx, input := range inputs { - state, err := buildStateFromCoverageInput(builder, input) + state, seededObjects, err := buildStateFromCoverageInput(builder, input) if err != nil { return nil, 
fmt.Errorf("build start state for input %d (%s): %w", idx, input.Name, err) } + userInputs, err := buildUserActionsFromCoverageInput(input, seededObjects) + if err != nil { + return nil, fmt.Errorf("build user actions for input %d (%s): %w", idx, input.Name, err) + } scenarios = append(scenarios, explore.Scenario{ - Name: input.Name, - InitialState: state, - Config: applyInputTuningForPhase(baseCfg, input.Tuning, includePerturbations), - Context: scenarioContextForInput(input, phase), + Name: input.Name, + EnvironmentState: state, + UserInputs: userInputs, + Config: applyInputTuningForPhase(baseCfg, input.Tuning, includePerturbations), + Context: scenarioContextForInput(input, phase), }) } @@ -269,20 +275,25 @@ func scenariosFromInputsWithClosedLoop( referenceBase := withoutPerturbations(builder.Config()) scenarios := make([]explore.Scenario, 0, len(inputs)) for idx, input := range inputs { - state, err := buildStateFromCoverageInput(builder, input) + state, seededObjects, err := buildStateFromCoverageInput(builder, input) if err != nil { return nil, fmt.Errorf("build start state for input %d (%s): %w", idx, input.Name, err) } + userInputs, err := buildUserActionsFromCoverageInput(input, seededObjects) + if err != nil { + return nil, fmt.Errorf("build user actions for input %d (%s): %w", idx, input.Name, err) + } referenceCfg := applyInputTuningForPhase(referenceBase, input.Tuning, false) rerunPlanCfg := referenceCfg.Clone() rerunContext := scenarioContextForInput(input, scenarioPhaseRerun) scenarios = append(scenarios, explore.Scenario{ - Name: input.Name, - InitialState: state, - Config: referenceCfg, - Context: scenarioContextForInput(input, scenarioPhaseReference), + Name: input.Name, + EnvironmentState: state, + UserInputs: userInputs, + Config: referenceCfg, + Context: scenarioContextForInput(input, scenarioPhaseReference), ClosedLoop: &explore.ClosedLoopSpec{ Plan: func(reference explore.ScenarioPhaseResult) ([]explore.ScenarioPhasePlan, error) { plan, ok, 
err := buildV0CheckpointRerunPlan(reference, rerunPlanCfg, rerunContext) @@ -436,12 +447,12 @@ func expandKnativeParameterizedInput(input coverage.Input, fuzzCases int, fuzzSe base := cloneCoverageInput(input) base.Name = baseName + "/base" - serviceIdx := findKnativeService(base.Objects) + serviceIdx := findKnativeServiceInUserInputs(base.UserInputs) if serviceIdx < 0 { return []coverage.Input{base}, nil } - templateSvc, err := unstructuredToService(base.Objects[serviceIdx]) + templateSvc, err := unstructuredToService(base.UserInputs[serviceIdx].Object) if err != nil { return nil, err } @@ -502,7 +513,12 @@ func knativeParamCatalog() []knativeParamSpec { } } -func expandKnativeSingleParamVariants(input coverage.Input, baseName string, serviceIdx int, templateSvc *v1.Service) ([]coverage.Input, error) { +func expandKnativeSingleParamVariants( + input coverage.Input, + baseName string, + serviceIdx int, + templateSvc *v1.Service, +) ([]coverage.Input, error) { variants := make([]coverage.Input, 0) for _, spec := range knativeParamCatalog() { for _, option := range spec.options { @@ -520,7 +536,14 @@ func expandKnativeSingleParamVariants(input coverage.Input, baseName string, ser return variants, nil } -func expandKnativeSampledParamVariants(input coverage.Input, baseName string, serviceIdx int, templateSvc *v1.Service, cases int, seed int64) ([]coverage.Input, error) { +func expandKnativeSampledParamVariants( + input coverage.Input, + baseName string, + serviceIdx int, + templateSvc *v1.Service, + cases int, + seed int64, +) ([]coverage.Input, error) { if cases <= 0 { return nil, nil } @@ -566,7 +589,13 @@ func expandKnativeSampledParamVariants(input coverage.Input, baseName string, se return variants, nil } -func buildKnativeVariantInput(input coverage.Input, serviceIdx int, templateSvc *v1.Service, name string, options []knativeParamOption) (coverage.Input, error) { +func buildKnativeVariantInput( + input coverage.Input, + serviceIdx int, + templateSvc 
*v1.Service, + name string, + options []knativeParamOption, +) (coverage.Input, error) { updated := cloneCoverageInput(input) updated.Name = name @@ -583,7 +612,10 @@ func buildKnativeVariantInput(input coverage.Input, serviceIdx int, templateSvc if err != nil { return coverage.Input{}, fmt.Errorf("convert parameterized service for %q: %w", name, err) } - updated.Objects[serviceIdx] = serviceObj + if serviceIdx >= len(updated.UserInputs) || updated.UserInputs[serviceIdx].Object == nil { + return coverage.Input{}, fmt.Errorf("service user input missing for %q", name) + } + updated.UserInputs[serviceIdx].Object = serviceObj return updated, nil } @@ -702,55 +734,29 @@ func parseScaleAnnotation(svc *v1.Service, key string) (int64, bool, error) { return value, true, nil } -func buildStateFromCoverageInput(builder *tracecheck.ExplorerBuilder, input coverage.Input) (tracecheck.StateNode, error) { - if len(input.Objects) == 0 { - return tracecheck.StateNode{}, fmt.Errorf("input has no objects") - } - - objects := make([]client.Object, 0, len(input.Objects)) - for idx, obj := range input.Objects { +func buildStateFromCoverageInput(builder *tracecheck.ExplorerBuilder, input coverage.Input) (tracecheck.StateNode, []client.Object, error) { + objects := make([]client.Object, 0, len(input.EnvironmentState.Objects)) + for _, obj := range input.EnvironmentState.Objects { if obj == nil { - return tracecheck.StateNode{}, fmt.Errorf("input object %d is nil", idx) + continue } clone := obj.DeepCopy() - tag.AddSleeveObjectID(clone) objects = append(objects, clone) } - - pending := make([]tracecheck.PendingReconcile, 0, len(input.Pending)) - for _, p := range input.Pending { - pending = append(pending, tracecheck.PendingReconcile{ - ReconcilerID: tracecheck.ReconcilerID(p.ControllerID), - Request: reconcile.Request{ - NamespacedName: client.ObjectKey{ - Namespace: p.Key.Namespace, - Name: p.Key.Name, - }, - }, - Source: tracecheck.SourceStateChange, - }) + if len(objects) == 0 { + 
baseSvc := buildBaselineService() + objects = append(objects, baseSvc) } - if len(pending) == 0 { - for _, obj := range objects { - if isKnativeService(obj) { - pending = append(pending, tracecheck.PendingReconcile{ - ReconcilerID: "ServiceReconciler", - Request: reconcile.Request{ - NamespacedName: client.ObjectKey{ - Namespace: obj.GetNamespace(), - Name: obj.GetName(), - }, - }, - Source: tracecheck.SourceStateChange, - }) - } - } - } - if len(pending) == 0 { - return tracecheck.StateNode{}, fmt.Errorf("input has no pending reconciles") + if len(objects) == 0 { + return tracecheck.StateNode{}, nil, fmt.Errorf("input has no objects") } - return builder.BuildStartStateFromObjects(objects, pending) + pending := make([]tracecheck.PendingReconcile, 0) + state, err := builder.BuildStartStateFromObjects(objects, pending) + if err != nil { + return tracecheck.StateNode{}, nil, err + } + return state, objects, nil } func applyInputTuning(base tracecheck.ExploreConfig, tuning coverage.InputTuning) tracecheck.ExploreConfig { @@ -820,16 +826,16 @@ func withoutPerturbations(cfg tracecheck.ExploreConfig) tracecheck.ExploreConfig } func cloneCoverageInput(input coverage.Input) coverage.Input { - objects := make([]*unstructured.Unstructured, 0, len(input.Objects)) - for _, obj := range input.Objects { + objects := make([]*unstructured.Unstructured, 0, len(input.EnvironmentState.Objects)) + for _, obj := range input.EnvironmentState.Objects { if obj == nil { objects = append(objects, nil) continue } objects = append(objects, obj.DeepCopy()) } + userInputs := cloneUserInputs(input.UserInputs) - pending := append([]coverage.Pending(nil), input.Pending...) 
tuning := coverage.InputTuning{ MaxDepth: input.Tuning.MaxDepth, PermuteControllers: append([]string(nil), input.Tuning.PermuteControllers...), @@ -837,11 +843,79 @@ func cloneCoverageInput(input coverage.Input) coverage.Input { StaleLookback: cloneIntMap(input.Tuning.StaleLookback), } return coverage.Input{ - Name: input.Name, - Objects: objects, - Pending: pending, - Tuning: tuning, + Name: input.Name, + EnvironmentState: coverage.EnvironmentState{ + Objects: objects, + }, + UserInputs: userInputs, + Tuning: tuning, + } +} + +func buildUserActionsFromCoverageInput(input coverage.Input, seededObjects []client.Object) ([]tracecheck.UserAction, error) { + actions := make([]tracecheck.UserAction, 0, len(input.UserInputs)) + for idx, action := range input.UserInputs { + if action.Object == nil { + return nil, fmt.Errorf("input user input %d has nil object", idx) + } + id := strings.TrimSpace(action.ID) + if id == "" { + id = fmt.Sprintf("user-input-%d", idx) + } + opType := action.Type + if opType == event.CREATE && isInputObjectSeeded(action.Object, seededObjects) { + opType = event.UPDATE + } + actions = append(actions, tracecheck.UserAction{ + ID: id, + OpType: opType, + Payload: action.Object.DeepCopy(), + }) } + return actions, nil +} + +func isInputObjectSeeded(object client.Object, seededObjects []client.Object) bool { + if object == nil { + return false + } + for _, seeded := range seededObjects { + if sameObjectIdentity(seeded, object) { + return true + } + } + return false +} + +func sameObjectIdentity(a, b client.Object) bool { + if a == nil || b == nil { + return false + } + aGVK := a.GetObjectKind().GroupVersionKind() + bGVK := b.GetObjectKind().GroupVersionKind() + if aGVK.Group != bGVK.Group || aGVK.Kind != bGVK.Kind { + return false + } + return a.GetNamespace() == b.GetNamespace() && a.GetName() == b.GetName() +} + +func cloneUserInputs(inputs []coverage.UserInput) []coverage.UserInput { + if len(inputs) == 0 { + return nil + } + out := 
make([]coverage.UserInput, 0, len(inputs)) + for _, input := range inputs { + clone := coverage.UserInput{ + ID: input.ID, + Type: input.Type, + Object: nil, + } + if input.Object != nil { + clone.Object = input.Object.DeepCopy() + } + out = append(out, clone) + } + return out } func cloneStringSliceMap(in map[string][]string) map[string][]string { @@ -878,6 +952,18 @@ func findKnativeService(objects []*unstructured.Unstructured) int { return -1 } +func findKnativeServiceInUserInputs(userInputs []coverage.UserInput) int { + for idx, input := range userInputs { + if input.Object == nil { + continue + } + if isKnativeService(input.Object) { + return idx + } + } + return -1 +} + func isKnativeService(obj client.Object) bool { if obj == nil { return false diff --git a/examples/knative-serving/scenario_test.go b/examples/knative-serving/scenario_test.go index 2aba4cf..99e1520 100644 --- a/examples/knative-serving/scenario_test.go +++ b/examples/knative-serving/scenario_test.go @@ -28,23 +28,18 @@ func TestScenariosFromInputsUsesInputsAsFinalScenarioUnits(t *testing.T) { inputA := coverage.Input{ Name: "knative-a", - Objects: []*unstructured.Unstructured{ - mustServiceAsUnstructured(t), - }, - Pending: []coverage.Pending{ - { - ControllerID: "ServiceReconciler", - Key: coverage.NamespacedName{ - Namespace: "default", - Name: "demo", - }, + EnvironmentState: coverage.EnvironmentState{ + Objects: []*unstructured.Unstructured{ + mustServiceAsUnstructured(t), }, }, } inputB := coverage.Input{ Name: "knative-b", - Objects: []*unstructured.Unstructured{ - mustServiceAsUnstructured(t), + EnvironmentState: coverage.EnvironmentState{ + Objects: []*unstructured.Unstructured{ + mustServiceAsUnstructured(t), + }, }, } @@ -58,12 +53,10 @@ func TestScenariosFromInputsUsesInputsAsFinalScenarioUnits(t *testing.T) { seen := map[string]bool{} for _, sc := range scenarios { - if len(sc.InitialState.Objects()) == 0 { + if len(sc.EnvironmentState.Objects()) == 0 { t.Fatalf("scenario %q has 
empty state objects", sc.Name) } - if len(sc.InitialState.PendingReconciles) == 0 { - t.Fatalf("scenario %q has no pending reconciles", sc.Name) - } + // scenarios with zero userInputs should remain valid and run as no-op setups seen[sc.Name] = true } if !seen["knative-a"] || !seen["knative-b"] { @@ -175,16 +168,9 @@ func TestScenariosFromInputsWithClosedLoopBuildsReferenceAndRerunPlans(t *testin builder := newKnativeExplorerBuilder() input := coverage.Input{ Name: "knative-a", - Objects: []*unstructured.Unstructured{ - mustServiceAsUnstructured(t), - }, - Pending: []coverage.Pending{ - { - ControllerID: "ServiceReconciler", - Key: coverage.NamespacedName{ - Namespace: "default", - Name: "demo", - }, + EnvironmentState: coverage.EnvironmentState{ + Objects: []*unstructured.Unstructured{ + mustServiceAsUnstructured(t), }, }, Tuning: coverage.InputTuning{ From 6705985ca48de1b71819cdddbd3f914e72741bf0 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Mon, 23 Feb 2026 18:08:51 -0800 Subject: [PATCH 11/29] bugfix: increment generation for UPDATE and PATCH events in addition to APPLY events --- pkg/tracecheck/explore.go | 6 ++- pkg/tracecheck/explore_apply_test.go | 57 ++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 2 deletions(-) diff --git a/pkg/tracecheck/explore.go b/pkg/tracecheck/explore.go index 3fe4b1a..4f679c7 100644 --- a/pkg/tracecheck/explore.go +++ b/pkg/tracecheck/explore.go @@ -1321,11 +1321,12 @@ func (e *Explorer) applyEffects(stepLogger logr.Logger, stateView StateNode, ste // going through. If it does go through, we should panic cause something broke. 
panic("update effect object not found in prev state: " + effect.Key.String()) } + oldVersion := nextState[existingKey] if exists && existingKey != effect.Key { delete(nextState, existingKey) } // Mimic APIServer behavior: increment Generation on spec updates (not status-only updates) - oldObj := e.versionManager.Resolve(nextState[existingKey]) + oldObj := e.versionManager.Resolve(oldVersion) newObj := e.versionManager.Resolve(changes[effect.Key]) if oldObj != nil && newObj != nil { // Compare specs to determine if Generation should be incremented @@ -1376,7 +1377,8 @@ func (e *Explorer) applyEffects(stepLogger logr.Logger, stateView StateNode, ste } // Mimic APIServer behavior: increment Generation on spec updates (not status-only updates) - oldObj := e.versionManager.Resolve(nextState[existingKey]) + oldVersion := nextState[existingKey] + oldObj := e.versionManager.Resolve(oldVersion) newObj := e.versionManager.Resolve(changes[effect.Key]) if oldObj != nil && newObj != nil { // Compare specs to determine if Generation should be incremented diff --git a/pkg/tracecheck/explore_apply_test.go b/pkg/tracecheck/explore_apply_test.go index bd8aea2..840781b 100644 --- a/pkg/tracecheck/explore_apply_test.go +++ b/pkg/tracecheck/explore_apply_test.go @@ -51,3 +51,60 @@ func TestApplyEffects_ApplyCreatesObject(t *testing.T) { require.NotNil(t, applied, "expected applied object to resolve") require.Equal(t, int64(1), applied.GetGeneration(), "expected generation to be set on apply create") } + +func TestApplyEffects_UpdateIncrementsGenerationWhenKeyIdentityChanges(t *testing.T) { + store := snapshot.NewStore() + vs := NewVersionStore(store, nil) + explorer := &Explorer{versionManager: vs} + + oldObj := &unstructured.Unstructured{} + oldObj.SetAPIVersion("v1") + oldObj.SetKind("ConfigMap") + oldObj.SetNamespace("default") + oldObj.SetName("example") + oldObj.SetGeneration(3) + oldObj.Object["spec"] = map[string]any{"message": "old"} + + oldKey := 
snapshot.NewCompositeKeyWithGroup("", "ConfigMap", "default", "example", "obj-old") + oldHash := vs.Publish(oldObj) + + newObj := oldObj.DeepCopy() + newObj.SetGeneration(0) + newObj.Object["spec"] = map[string]any{"message": "new"} + + newKey := snapshot.NewCompositeKeyWithGroup("", "ConfigMap", "default", "example", "obj-new") + newHash := vs.Publish(newObj) + + stepResult := &ReconcileResult{ + Changes: Changes{ + ObjectVersions: ObjectVersions{newKey: newHash}, + Effects: []Effect{ + { + OpType: event.UPDATE, + Key: newKey, + Version: newHash, + }, + }, + }, + } + + state := StateNode{ + Contents: NewStateSnapshot( + ObjectVersions{oldKey: oldHash}, + KindSequences{oldKey.CanonicalGroupKind(): 1}, + nil, + ), + } + + nextState, _, _ := explorer.applyEffects(logr.Discard(), state, stepResult) + + _, oldStillPresent := nextState[oldKey] + require.False(t, oldStillPresent, "expected old key to be replaced") + + updatedHash, ok := nextState[newKey] + require.True(t, ok, "expected updated object to exist under new key") + updated := vs.Resolve(updatedHash) + require.NotNil(t, updated, "expected updated object to resolve") + require.Equal(t, int64(4), updated.GetGeneration(), "expected generation increment from old object") + require.Equal(t, "new", updated.Object["spec"].(map[string]any)["message"]) +} From 77d3e8e0de8ba97347fdc16c38a79f9380372714 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Mon, 23 Feb 2026 18:30:54 -0800 Subject: [PATCH 12/29] make generation incrementation logic more robust to weird JSON edge cases --- pkg/tracecheck/explore.go | 12 +-- pkg/tracecheck/explore_apply_test.go | 121 +++++++++++++++++++++++++++ pkg/tracecheck/state.go | 2 +- pkg/tracecheck/trigger.go | 4 +- pkg/util/object_generation.go | 55 ++++++++++++ 5 files changed, 185 insertions(+), 9 deletions(-) create mode 100644 pkg/util/object_generation.go diff --git a/pkg/tracecheck/explore.go b/pkg/tracecheck/explore.go index 4f679c7..9411fc0 100644 --- a/pkg/tracecheck/explore.go 
+++ b/pkg/tracecheck/explore.go @@ -148,7 +148,7 @@ func (e *Explorer) shouldApplyNextUserAction(state StateNode) bool { return false } // policy here is to apply the next user action after the current state has converged - return len(state.PendingReconciles) == 0 || allPendingIgnorableForConvergence(state.PendingReconciles) + return len(state.PendingReconciles) == 0 || AllPendingIgnorableForConvergence(state.PendingReconciles) } // Objects resolves and returns all objects for the provided ResultState, skipping any that cannot be resolved. @@ -809,7 +809,7 @@ func (e *Explorer) explore( // NOTE: If a state has ANY SourceStateChange pending reconciles, it should NOT be // considered converged. The allPendingIgnorableForConvergence function returns false // if any pending has SourceStateChange. - if len(currentState.PendingReconciles) == 0 || allPendingIgnorableForConvergence(currentState.PendingReconciles) { + if len(currentState.PendingReconciles) == 0 || AllPendingIgnorableForConvergence(currentState.PendingReconciles) { convergenceKey := currentState.ConvergenceHash() reason := "no pending reconciles" if len(currentState.PendingReconciles) > 0 { @@ -1305,7 +1305,7 @@ func (e *Explorer) applyEffects(stepLogger logr.Logger, stateView StateNode, ste // Mimic APIServer behavior: set Generation to 1 on CREATE if not already set newObj := e.versionManager.Resolve(changes[effect.Key]) if newObj != nil { - gen := newObj.GetGeneration() + gen := util.GetObjectGeneration(newObj) if gen == 0 { newObj.SetGeneration(1) // Update the version hash after modifying Generation @@ -1340,7 +1340,7 @@ func (e *Explorer) applyEffects(stepLogger logr.Logger, stateView StateNode, ste specChanged = true } if specChanged { - oldGen := oldObj.GetGeneration() + oldGen := util.GetObjectGeneration(oldObj) if oldGen == 0 { // If Generation is 0, set it to 1 (shouldn't happen in real K8s, but handle gracefully) // This can happen if the state snapshot has objects with Generation=0 @@ -1361,7 
+1361,7 @@ func (e *Explorer) applyEffects(stepLogger logr.Logger, stateView StateNode, ste // Apply has upsert semantics; if the object doesn't exist, treat as CREATE. newObj := e.versionManager.Resolve(changes[effect.Key]) if newObj != nil { - gen := newObj.GetGeneration() + gen := util.GetObjectGeneration(newObj) if gen == 0 { newObj.SetGeneration(1) // Update the version hash after modifying Generation @@ -1392,7 +1392,7 @@ func (e *Explorer) applyEffects(stepLogger logr.Logger, stateView StateNode, ste specChanged = true } if specChanged { - oldGen := oldObj.GetGeneration() + oldGen := util.GetObjectGeneration(oldObj) if oldGen == 0 { // If Generation is 0, set it to 1 (shouldn't happen in real K8s, but handle gracefully) // This can happen if the state snapshot has objects with Generation=0 diff --git a/pkg/tracecheck/explore_apply_test.go b/pkg/tracecheck/explore_apply_test.go index 840781b..7c52e39 100644 --- a/pkg/tracecheck/explore_apply_test.go +++ b/pkg/tracecheck/explore_apply_test.go @@ -108,3 +108,124 @@ func TestApplyEffects_UpdateIncrementsGenerationWhenKeyIdentityChanges(t *testin require.Equal(t, int64(4), updated.GetGeneration(), "expected generation increment from old object") require.Equal(t, "new", updated.Object["spec"].(map[string]any)["message"]) } + +func TestApplyEffects_UpdateIncrementsGenerationFromNumericMetadataField(t *testing.T) { + store := snapshot.NewStore() + vs := NewVersionStore(store, nil) + explorer := &Explorer{versionManager: vs} + + key := snapshot.NewCompositeKeyWithGroup("", "ConfigMap", "default", "example", "obj1") + + oldObj := &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "v1", + "kind": "ConfigMap", + "metadata": map[string]any{ + "namespace": "default", + "name": "example", + "generation": float64(5), + }, + "spec": map[string]any{"message": "old"}, + }, + } + oldHash := vs.Publish(oldObj) + + newObj := &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "v1", + "kind": 
"ConfigMap", + "metadata": map[string]any{ + "namespace": "default", + "name": "example", + }, + "spec": map[string]any{"message": "new"}, + }, + } + newHash := vs.Publish(newObj) + + stepResult := &ReconcileResult{ + Changes: Changes{ + ObjectVersions: ObjectVersions{key: newHash}, + Effects: []Effect{ + { + OpType: event.UPDATE, + Key: key, + Version: newHash, + }, + }, + }, + } + + state := StateNode{ + Contents: NewStateSnapshot( + ObjectVersions{key: oldHash}, + KindSequences{key.CanonicalGroupKind(): 1}, + nil, + ), + } + + nextState, _, _ := explorer.applyEffects(logr.Discard(), state, stepResult) + + updatedHash, ok := nextState[key] + require.True(t, ok, "expected updated object to exist") + updated := vs.Resolve(updatedHash) + require.NotNil(t, updated, "expected updated object to resolve") + require.Equal(t, int64(6), updated.GetGeneration(), "expected generation increment from numeric metadata field") +} + +func TestApplyEffects_ApplyIncrementsGenerationWhenKeyIdentityChanges(t *testing.T) { + store := snapshot.NewStore() + vs := NewVersionStore(store, nil) + explorer := &Explorer{versionManager: vs} + + oldObj := &unstructured.Unstructured{} + oldObj.SetAPIVersion("v1") + oldObj.SetKind("ConfigMap") + oldObj.SetNamespace("default") + oldObj.SetName("example") + oldObj.SetGeneration(3) + oldObj.Object["spec"] = map[string]any{"message": "old"} + + oldKey := snapshot.NewCompositeKeyWithGroup("", "ConfigMap", "default", "example", "obj-old") + oldHash := vs.Publish(oldObj) + + newObj := oldObj.DeepCopy() + newObj.SetGeneration(0) + newObj.Object["spec"] = map[string]any{"message": "new"} + + newKey := snapshot.NewCompositeKeyWithGroup("", "ConfigMap", "default", "example", "obj-new") + newHash := vs.Publish(newObj) + + stepResult := &ReconcileResult{ + Changes: Changes{ + ObjectVersions: ObjectVersions{newKey: newHash}, + Effects: []Effect{ + { + OpType: event.APPLY, + Key: newKey, + Version: newHash, + }, + }, + }, + } + + state := StateNode{ + Contents: 
NewStateSnapshot( + ObjectVersions{oldKey: oldHash}, + KindSequences{oldKey.CanonicalGroupKind(): 1}, + nil, + ), + } + + nextState, _, _ := explorer.applyEffects(logr.Discard(), state, stepResult) + + _, oldStillPresent := nextState[oldKey] + require.False(t, oldStillPresent, "expected old key to be replaced") + + updatedHash, ok := nextState[newKey] + require.True(t, ok, "expected applied object to exist under new key") + updated := vs.Resolve(updatedHash) + require.NotNil(t, updated, "expected applied object to resolve") + require.Equal(t, int64(4), updated.GetGeneration(), "expected generation increment from old object") + require.Equal(t, "new", updated.Object["spec"].(map[string]any)["message"]) +} diff --git a/pkg/tracecheck/state.go b/pkg/tracecheck/state.go index 1fed68d..b910912 100644 --- a/pkg/tracecheck/state.go +++ b/pkg/tracecheck/state.go @@ -113,7 +113,7 @@ func (eh ExecutionHistory) UniqueKey() string { if len(eh) > 0 { lastStep := eh[len(eh)-1] originalConverged = len(lastStep.PendingReconciles) == 0 || - allPendingIgnorableForConvergence(lastStep.PendingReconciles) + AllPendingIgnorableForConvergence(lastStep.PendingReconciles) } // Filter out no-ops (steps with no changes and no errors) diff --git a/pkg/tracecheck/trigger.go b/pkg/tracecheck/trigger.go index 210cf09..70d8fbf 100644 --- a/pkg/tracecheck/trigger.go +++ b/pkg/tracecheck/trigger.go @@ -93,7 +93,7 @@ func (pr PendingReconcile) String() string { return fmt.Sprintf("%s:%s/%s", pr.ReconcilerID, pr.Request.Namespace, pr.Request.Name) } -// allPendingIgnorableForConvergence returns true if all pending reconciles are from +// AllPendingIgnorableForConvergence returns true if all pending reconciles are from // sources that don't indicate state changes (async enqueues from tickers, or requeues // from controllers that always re-enqueue). 
This is used to determine convergence: // if the only remaining work is time-based re-enqueues or poll-based requeues, @@ -101,7 +101,7 @@ func (pr PendingReconcile) String() string { // // IMPORTANT: Returns false if ANY pending has SourceStateChange, which means // the state should NOT be considered converged. -func allPendingIgnorableForConvergence(pending []PendingReconcile) bool { +func AllPendingIgnorableForConvergence(pending []PendingReconcile) bool { if len(pending) == 0 { return false // empty list should not be considered "all ignorable" } diff --git a/pkg/util/object_generation.go b/pkg/util/object_generation.go new file mode 100644 index 0000000..45ce9ef --- /dev/null +++ b/pkg/util/object_generation.go @@ -0,0 +1,55 @@ +package util + +import ( + "encoding/json" + "strconv" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" +) + +// GetObjectGeneration returns an object's generation. +// +// It prioritizes the generated accessor and falls back to raw metadata fields to +// handle test fixtures where generation may be represented as a float or string. 
+func GetObjectGeneration(obj *unstructured.Unstructured) int64 { + if obj == nil { + return 0 + } + if gen := obj.GetGeneration(); gen != 0 { + return gen + } + + raw, found, err := unstructured.NestedFieldNoCopy(obj.Object, "metadata", "generation") + if err != nil || !found || raw == nil { + return 0 + } + + switch v := raw.(type) { + case int64: + return v + case int32: + return int64(v) + case int: + return int64(v) + case float64: + return int64(v) + case float32: + return int64(v) + case json.Number: + if i, err := v.Int64(); err == nil { + return i + } + if f, err := v.Float64(); err == nil { + return int64(f) + } + case string: + if i, err := strconv.ParseInt(v, 10, 64); err == nil { + return i + } + if f, err := strconv.ParseFloat(v, 64); err == nil { + return int64(f) + } + } + + return 0 +} From a0812733ff1c71ad73ecefef6e7fede14370f9b9 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Mon, 23 Feb 2026 18:34:40 -0800 Subject: [PATCH 13/29] fix broken test --- pkg/tracecheck/explore.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/tracecheck/explore.go b/pkg/tracecheck/explore.go index 9411fc0..144a233 100644 --- a/pkg/tracecheck/explore.go +++ b/pkg/tracecheck/explore.go @@ -1372,12 +1372,13 @@ func (e *Explorer) applyEffects(stepLogger logr.Logger, stateView StateNode, ste break } + // Capture the existing version before any key replacement. 
+ oldVersion := nextState[existingKey] if exists && existingKey != effect.Key { delete(nextState, existingKey) } // Mimic APIServer behavior: increment Generation on spec updates (not status-only updates) - oldVersion := nextState[existingKey] oldObj := e.versionManager.Resolve(oldVersion) newObj := e.versionManager.Resolve(changes[effect.Key]) if oldObj != nil && newObj != nil { From b7f498230f6e90fcda7cc16266be67c7ee2a93be Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Mon, 23 Feb 2026 18:39:27 -0800 Subject: [PATCH 14/29] fix TUI display logic bug --- pkg/interactive/inspector_tview.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pkg/interactive/inspector_tview.go b/pkg/interactive/inspector_tview.go index ae647c9..eb8f16b 100644 --- a/pkg/interactive/inspector_tview.go +++ b/pkg/interactive/inspector_tview.go @@ -2041,18 +2041,16 @@ func formatPathSummary(state tracecheck.ResultState, pathIdx int) string { b.WriteString("\nPending Reconciles:\n") for idx, pr := range state.State.PendingReconciles { req := pr.Request.NamespacedName - fmt.Fprintf(&b, " [%d] %s %s/%s\n", idx, pr.ReconcilerID, req.Namespace, req.Name) + fmt.Fprintf(&b, " [%d] %s %s/%s (%s)\n", idx, pr.ReconcilerID, req.Namespace, req.Name, pr.Source) } } b.WriteString("\nOutcome:\n") - if len(state.State.PendingReconciles) == 0 && state.Error == nil { + if state.State.IsConverged() && state.Error == nil { b.WriteString(" Converged\n") } else { b.WriteString(" Aborted\n") - if state.Error != nil { - fmt.Fprintf(&b, " Error: %s\n", state.Error.Error()) - } + fmt.Fprintf(&b, " Error: %v\n", state.Error) } return b.String() } From 01fe607114621518cc4fb525eed3b1276e8e48cf Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Mon, 23 Feb 2026 18:40:45 -0800 Subject: [PATCH 15/29] revert method promotion --- pkg/tracecheck/explore.go | 4 ++-- pkg/tracecheck/state.go | 4 ++-- pkg/tracecheck/trigger.go | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git 
a/pkg/tracecheck/explore.go b/pkg/tracecheck/explore.go index 144a233..0e26675 100644 --- a/pkg/tracecheck/explore.go +++ b/pkg/tracecheck/explore.go @@ -148,7 +148,7 @@ func (e *Explorer) shouldApplyNextUserAction(state StateNode) bool { return false } // policy here is to apply the next user action after the current state has converged - return len(state.PendingReconciles) == 0 || AllPendingIgnorableForConvergence(state.PendingReconciles) + return len(state.PendingReconciles) == 0 || allPendingIgnorableForConvergence(state.PendingReconciles) } // Objects resolves and returns all objects for the provided ResultState, skipping any that cannot be resolved. @@ -809,7 +809,7 @@ func (e *Explorer) explore( // NOTE: If a state has ANY SourceStateChange pending reconciles, it should NOT be // considered converged. The allPendingIgnorableForConvergence function returns false // if any pending has SourceStateChange. - if len(currentState.PendingReconciles) == 0 || AllPendingIgnorableForConvergence(currentState.PendingReconciles) { + if len(currentState.PendingReconciles) == 0 || allPendingIgnorableForConvergence(currentState.PendingReconciles) { convergenceKey := currentState.ConvergenceHash() reason := "no pending reconciles" if len(currentState.PendingReconciles) > 0 { diff --git a/pkg/tracecheck/state.go b/pkg/tracecheck/state.go index b910912..db8eb60 100644 --- a/pkg/tracecheck/state.go +++ b/pkg/tracecheck/state.go @@ -113,7 +113,7 @@ func (eh ExecutionHistory) UniqueKey() string { if len(eh) > 0 { lastStep := eh[len(eh)-1] originalConverged = len(lastStep.PendingReconciles) == 0 || - AllPendingIgnorableForConvergence(lastStep.PendingReconciles) + allPendingIgnorableForConvergence(lastStep.PendingReconciles) } // Filter out no-ops (steps with no changes and no errors) @@ -270,7 +270,7 @@ func (sn StateNode) DumpPending() { } func (sn StateNode) IsConverged() bool { - return len(sn.PendingReconciles) == 0 + return len(sn.PendingReconciles) == 0 || 
allPendingIgnorableForConvergence(sn.PendingReconciles) } func (sn StateNode) Objects() ObjectVersions { diff --git a/pkg/tracecheck/trigger.go b/pkg/tracecheck/trigger.go index 70d8fbf..210cf09 100644 --- a/pkg/tracecheck/trigger.go +++ b/pkg/tracecheck/trigger.go @@ -93,7 +93,7 @@ func (pr PendingReconcile) String() string { return fmt.Sprintf("%s:%s/%s", pr.ReconcilerID, pr.Request.Namespace, pr.Request.Name) } -// AllPendingIgnorableForConvergence returns true if all pending reconciles are from +// allPendingIgnorableForConvergence returns true if all pending reconciles are from // sources that don't indicate state changes (async enqueues from tickers, or requeues // from controllers that always re-enqueue). This is used to determine convergence: // if the only remaining work is time-based re-enqueues or poll-based requeues, @@ -101,7 +101,7 @@ func (pr PendingReconcile) String() string { // // IMPORTANT: Returns false if ANY pending has SourceStateChange, which means // the state should NOT be considered converged. 
-func AllPendingIgnorableForConvergence(pending []PendingReconcile) bool { +func allPendingIgnorableForConvergence(pending []PendingReconcile) bool { if len(pending) == 0 { return false // empty list should not be considered "all ignorable" } From 49f931de2deb1e871a607bddc72a8ca8c7e97cb5 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Mon, 23 Feb 2026 18:41:58 -0800 Subject: [PATCH 16/29] add flag to disable the analyze+perturb+rerun phase so users can just run plain simulations --- examples/knative-serving/main.go | 9 ++--- pkg/explore/flags.go | 6 ++++ pkg/explore/flags_test.go | 6 ++++ pkg/explore/parallel_runner.go | 2 +- pkg/explore/parallel_runner_test.go | 55 +++++++++++++++++++++++++++++ 5 files changed, 70 insertions(+), 8 deletions(-) diff --git a/examples/knative-serving/main.go b/examples/knative-serving/main.go index 22eba4d..e678082 100644 --- a/examples/knative-serving/main.go +++ b/examples/knative-serving/main.go @@ -178,15 +178,10 @@ func main() { if explore.PerturbEnabled() { fmt.Fprintln(os.Stderr, "closed-loop scaffold: running per-input reference->rerun pipelines") } else { - fmt.Fprintln(os.Stderr, "single-pass scaffold: running per-input reference-only simulation") + fmt.Fprintln(os.Stderr, "closed-loop scaffold: running per-input reference-only simulation (--perturb=false)") } - builderFn := scenariosFromInputsWithClosedLoop - if !explore.PerturbEnabled() { - builderFn = scenariosFromInputs - } - - scenarios, err := builderFn(builder, inputs) + scenarios, err := scenariosFromInputsWithClosedLoop(builder, inputs) if err != nil { fmt.Fprintf(os.Stderr, "build scenarios: %v\n", err) os.Exit(1) diff --git a/pkg/explore/flags.go b/pkg/explore/flags.go index 1abfa12..e0b6573 100644 --- a/pkg/explore/flags.go +++ b/pkg/explore/flags.go @@ -8,6 +8,7 @@ var ( configPathFlag = flag.String("explore-config", "", "optional JSON file to configure exploration") dumpStatsPath = flag.String("dump-stats", "", "optional path to write exploration stats 
(JSON)") inputsPathFlag = flag.String("inputs", "", `path to input JSON file`) + perturbFlag = flag.Bool("perturb", true, "enable closed-loop rerun pipeline for batch inputs when supported by scenario generation") parallelProcessesFlag = flag.Bool("parallel-processes", false, "run batch mode using process-isolated child executions") parallelChildIndexFlag = flag.Int( "parallel-child-index", @@ -41,6 +42,11 @@ func InputsPath() string { return *inputsPathFlag } +// PerturbEnabled returns the parsed value for the perturb flag. +func PerturbEnabled() bool { + return *perturbFlag +} + // ParallelProcessesEnabled reports whether process-isolated parallel mode is enabled. func ParallelProcessesEnabled() bool { return *parallelProcessesFlag diff --git a/pkg/explore/flags_test.go b/pkg/explore/flags_test.go index dcbebf4..2dc6acf 100644 --- a/pkg/explore/flags_test.go +++ b/pkg/explore/flags_test.go @@ -8,6 +8,12 @@ func TestInputsPathDefault(t *testing.T) { } } +func TestPerturbEnabledDefault(t *testing.T) { + if !PerturbEnabled() { + t.Fatalf("expected perturb enabled by default") + } +} + func TestParallelProcessesDefault(t *testing.T) { if ParallelProcessesEnabled() { t.Fatalf("expected parallel-processes to be disabled by default") diff --git a/pkg/explore/parallel_runner.go b/pkg/explore/parallel_runner.go index b3fffea..a1bc97b 100644 --- a/pkg/explore/parallel_runner.go +++ b/pkg/explore/parallel_runner.go @@ -330,7 +330,7 @@ func (r *ParallelRunner) runScenario(ctx context.Context, scenario Scenario, opt return result } - if scenario.ClosedLoop == nil { + if scenario.ClosedLoop == nil || !PerturbEnabled() { phase := r.runScenarioPhase(ctx, scenario, opts, idx, "", scenario.Config, seed, nil, nil, scenario.Context) result.Phases = []ScenarioPhaseResult{phase} applyPhaseSummary(&result, phase) diff --git a/pkg/explore/parallel_runner_test.go b/pkg/explore/parallel_runner_test.go index 2753b06..cdba091 100644 --- a/pkg/explore/parallel_runner_test.go +++ 
b/pkg/explore/parallel_runner_test.go @@ -259,6 +259,61 @@ func TestParallelRunnerClosedLoopRunsReferenceThenRerunPerScenario(t *testing.T) } } +func TestParallelRunnerClosedLoopDisablesRerunWhenPerturbDisabled(t *testing.T) { + ctx := context.Background() + builder, state := newTestBuilder(t) + + runner, err := NewParallelRunner(builder) + if err != nil { + t.Fatalf("new runner: %v", err) + } + + withPerturbFlag(t, false) + plannerCalls := 0 + scenarios := []Scenario{ + { + Name: "closed-loop-disabled", + EnvironmentState: state.Clone(), + Config: tracecheck.ExploreConfig{MaxDepth: 1}, + ClosedLoop: &ClosedLoopSpec{ + Plan: func(reference ScenarioPhaseResult) ([]ScenarioPhasePlan, error) { + plannerCalls++ + return []ScenarioPhasePlan{ + {Name: "rerun", Config: tracecheck.ExploreConfig{MaxDepth: 5}}, + }, nil + }, + }, + }, + } + + results, err := runner.RunAll(ctx, scenarios, ParallelOptions{MaxParallel: 1}) + if err != nil { + t.Fatalf("run all: %v", err) + } + if plannerCalls != 0 { + t.Fatalf("expected planner to be skipped when perturb is disabled, got %d calls", plannerCalls) + } + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + if len(results[0].Phases) != 1 { + t.Fatalf("expected only one phase when perturb disabled, got %d", len(results[0].Phases)) + } + if results[0].Phases[0].Name != "run" { + t.Fatalf("expected single phase name to be run, got %q", results[0].Phases[0].Name) + } +} + +func withPerturbFlag(t *testing.T, enabled bool) { + t.Helper() + + oldValue := *perturbFlag + *perturbFlag = enabled + t.Cleanup(func() { + *perturbFlag = oldValue + }) +} + func TestParallelRunnerClosedLoopWritesPhaseDumps(t *testing.T) { ctx := context.Background() builder, state := newTestBuilder(t) From a096df8b72822e2aac8420d83aed98c33da73f8e Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Mon, 23 Feb 2026 18:42:19 -0800 Subject: [PATCH 17/29] drop dead code --- pkg/interactive/inspector_tview.go | 19 ------------------- 1 file 
changed, 19 deletions(-) diff --git a/pkg/interactive/inspector_tview.go b/pkg/interactive/inspector_tview.go index eb8f16b..8a5ae7b 100644 --- a/pkg/interactive/inspector_tview.go +++ b/pkg/interactive/inspector_tview.go @@ -2069,25 +2069,6 @@ func truncateString(s string, max int) string { return string(runes[:max-3]) + "..." } -func formatObjectVersions(objects tracecheck.ObjectVersions, indent string) string { - if len(objects) == 0 { - return indent + "(none)\n" - } - keys := make([]snapshot.CompositeKey, 0, len(objects)) - for key := range objects { - keys = append(keys, key) - } - sort.Slice(keys, func(i, j int) bool { - return keys[i].String() < keys[j].String() - }) - - var b strings.Builder - for _, key := range keys { - fmt.Fprintf(&b, "%s%s => %s\n", indent, key.String(), objects[key].Value) - } - return b.String() -} - func formatResolverUnavailable(hash snapshot.VersionHash) string { return fmt.Sprintf("object content unavailable; strategy=%s hash=%s (%s)", hash.Strategy, util.ShortenHash(hash.Value), hash.Value) } From dafcae6c882407bf68d11ef4e406cb4c944bc334 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Mon, 23 Feb 2026 18:48:55 -0800 Subject: [PATCH 18/29] API compatibility changes for existing projects --- examples/crossplane/main.go | 2 +- examples/karpenter/inputs.json | 287 +++++++++++++--------------- examples/karpenter/main.go | 2 +- examples/karpenter/scenario.go | 215 ++++++++++++--------- examples/karpenter/scenario_test.go | 41 +--- examples/kratix/main.go | 2 +- 6 files changed, 274 insertions(+), 275 deletions(-) diff --git a/examples/crossplane/main.go b/examples/crossplane/main.go index 0402c25..298fc77 100644 --- a/examples/crossplane/main.go +++ b/examples/crossplane/main.go @@ -57,7 +57,7 @@ func main() { fmt.Fprintf(os.Stderr, "runner setup error: %v\n", err) os.Exit(1) } - if err := runner.Run(ctx, initialState); err != nil { + if err := runner.Run(ctx, explore.RunInput{EnvironmentState: initialState}); err != nil { 
fmt.Fprintf(os.Stderr, "session error: %v\n", err) os.Exit(1) } diff --git a/examples/karpenter/inputs.json b/examples/karpenter/inputs.json index a051a50..b901b13 100644 --- a/examples/karpenter/inputs.json +++ b/examples/karpenter/inputs.json @@ -1,172 +1,157 @@ [ { "name": "karpenter-default", - "objects": [ + "environmentState": { + "objects": [] + }, + "tuning": { + "maxDepth": 0, + "permuteControllers": null, + "staleReads": null, + "staleLookback": null + }, + "userInputs": [ { - "apiVersion": "karpenter.test.sh/v1alpha1", - "kind": "TestNodeClass", - "metadata": { - "creationTimestamp": "2026-02-18T04:02:02Z", - "generation": 1, - "labels": { - "discrete.events/sleeve-object-id": "2cxsqzm7", - "testing/cluster": "unspecified" + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "karpenter.test.sh/v1alpha1", + "kind": "TestNodeClass", + "metadata": { + "creationTimestamp": "2026-02-18T04:02:02Z", + "generation": 1, + "labels": { + "discrete.events/sleeve-object-id": "2cxsqzm7", + "testing/cluster": "unspecified" + }, + "name": "default", + "uid": "testnodeclass-uid" }, - "name": "default", - "uid": "testnodeclass-uid" - }, - "spec": {}, - "status": { - "conditions": [ - { - "lastTransitionTime": "2026-02-18T04:02:02Z", - "message": "", - "reason": "Ready", - "status": "True", - "type": "Ready" - } - ] + "spec": {}, + "status": { + "conditions": [ + { + "lastTransitionTime": "2026-02-18T04:02:02Z", + "message": "", + "reason": "Ready", + "status": "True", + "type": "Ready" + } + ] + } } }, { - "apiVersion": "karpenter.sh/v1", - "kind": "NodePool", - "metadata": { - "creationTimestamp": "2026-02-18T04:02:02Z", - "generation": 1, - "labels": { - "discrete.events/sleeve-object-id": "1jdj3czp", - "testing/cluster": "unspecified" - }, - "name": "default", - "uid": "nodepool-uid" - }, - "spec": { - "disruption": { - "consolidateAfter": "Never" - }, - "limits": { - "cpu": "2k" + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "karpenter.sh/v1", 
+ "kind": "NodePool", + "metadata": { + "creationTimestamp": "2026-02-18T04:02:02Z", + "generation": 1, + "labels": { + "discrete.events/sleeve-object-id": "1jdj3czp", + "testing/cluster": "unspecified" + }, + "name": "default", + "uid": "nodepool-uid" }, - "template": { - "metadata": { - "labels": { - "testing/cluster": "unspecified" - } + "spec": { + "disruption": { + "consolidateAfter": "Never" + }, + "limits": { + "cpu": "2k" }, - "spec": { - "expireAfter": "Never", - "nodeClassRef": { - "group": "karpenter.test.sh", - "kind": "TestNodeClass", - "name": "default" + "template": { + "metadata": { + "labels": { + "testing/cluster": "unspecified" + } }, - "requirements": [] + "spec": { + "expireAfter": "Never", + "nodeClassRef": { + "group": "karpenter.test.sh", + "kind": "TestNodeClass", + "name": "default" + }, + "requirements": [] + } } + }, + "status": { + "conditions": [ + { + "lastTransitionTime": "2026-02-18T04:02:02Z", + "message": "", + "reason": "ValidationSucceeded", + "status": "True", + "type": "ValidationSucceeded" + }, + { + "lastTransitionTime": "2026-02-18T04:02:02Z", + "message": "", + "reason": "NodeClassReady", + "status": "True", + "type": "NodeClassReady" + }, + { + "lastTransitionTime": "2026-02-18T04:02:02Z", + "message": "object is awaiting reconciliation", + "reason": "AwaitingReconciliation", + "status": "Unknown", + "type": "NodeRegistrationHealthy" + }, + { + "lastTransitionTime": "2026-02-18T04:02:02Z", + "message": "", + "reason": "Ready", + "status": "True", + "type": "Ready" + } + ], + "nodes": null } - }, - "status": { - "conditions": [ - { - "lastTransitionTime": "2026-02-18T04:02:02Z", - "message": "", - "reason": "ValidationSucceeded", - "status": "True", - "type": "ValidationSucceeded" - }, - { - "lastTransitionTime": "2026-02-18T04:02:02Z", - "message": "", - "reason": "NodeClassReady", - "status": "True", - "type": "NodeClassReady" - }, - { - "lastTransitionTime": "2026-02-18T04:02:02Z", - "message": "object is awaiting 
reconciliation", - "reason": "AwaitingReconciliation", - "status": "Unknown", - "type": "NodeRegistrationHealthy" - }, - { - "lastTransitionTime": "2026-02-18T04:02:02Z", - "message": "", - "reason": "Ready", - "status": "True", - "type": "Ready" - } - ], - "nodes": null } }, { - "apiVersion": "v1", - "kind": "Pod", - "metadata": { - "labels": { - "discrete.events/sleeve-object-id": "rygyqven" + "id": "", + "type": "CREATE", + "object": { + "apiVersion": "v1", + "kind": "Pod", + "metadata": { + "labels": { + "discrete.events/sleeve-object-id": "rygyqven" + }, + "name": "pending", + "namespace": "default", + "uid": "pod-uid" }, - "name": "pending", - "namespace": "default", - "uid": "pod-uid" - }, - "spec": { - "containers": [ - { - "image": "pause", - "name": "c", - "resources": {} - } - ] - }, - "status": { - "conditions": [ - { - "lastProbeTime": null, - "lastTransitionTime": null, - "reason": "Unschedulable", - "status": "False", - "type": "PodScheduled" - } - ] - } - } - ], - "pending": [ - { - "controllerId": "state.pod", - "key": { - "namespace": "default", - "name": "pending" - } - }, - { - "controllerId": "provisioner.trigger.pod", - "key": { - "namespace": "default", - "name": "pending" - } - }, - { - "controllerId": "provisioner", - "key": { - "namespace": "", - "name": "singleton" - } - }, - { - "controllerId": "state.nodepool", - "key": { - "namespace": "", - "name": "default" + "spec": { + "containers": [ + { + "image": "pause", + "name": "c", + "resources": {} + } + ] + }, + "status": { + "conditions": [ + { + "lastProbeTime": null, + "lastTransitionTime": null, + "reason": "Unschedulable", + "status": "False", + "type": "PodScheduled" + } + ] + } } } - ], - "tuning": { - "maxDepth": 0, - "permuteControllers": null, - "staleReads": null, - "staleLookback": null - } + ] } ] diff --git a/examples/karpenter/main.go b/examples/karpenter/main.go index ea9b45a..52ddb00 100644 --- a/examples/karpenter/main.go +++ b/examples/karpenter/main.go @@ -60,7 +60,7 
@@ func main() { fmt.Fprintf(os.Stderr, "runner setup error: %v\n", err) os.Exit(1) } - if err := runner.Run(ctx, initialState); err != nil { + if err := runner.Run(ctx, explore.RunInput{EnvironmentState: initialState}); err != nil { fmt.Fprintf(os.Stderr, "session error: %v\n", err) os.Exit(1) } diff --git a/examples/karpenter/scenario.go b/examples/karpenter/scenario.go index 76c37d6..88352a8 100644 --- a/examples/karpenter/scenario.go +++ b/examples/karpenter/scenario.go @@ -7,6 +7,7 @@ import ( "strings" "github.com/tgoodwin/kamera/pkg/coverage" + "github.com/tgoodwin/kamera/pkg/event" "github.com/tgoodwin/kamera/pkg/explore" "github.com/tgoodwin/kamera/pkg/tag" "github.com/tgoodwin/kamera/pkg/tracecheck" @@ -18,7 +19,6 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/reconcile" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/test" @@ -85,15 +85,20 @@ func scenariosFromInputs(builder *tracecheck.ExplorerBuilder, inputs []coverage. 
} for _, variant := range variants { - state, err := buildStateFromCoverageInput(builder, variant) + state, seededObjects, err := buildStateFromCoverageInput(builder, variant) if err != nil { return nil, fmt.Errorf("build start state for %s: %w", variant.Name, err) } + userActions, err := buildUserActionsFromCoverageInput(variant, seededObjects) + if err != nil { + return nil, fmt.Errorf("build user actions for %s: %w", variant.Name, err) + } scenarios = append(scenarios, explore.Scenario{ - Name: variant.Name, - InitialState: state, - Config: applyInputTuning(baseCfg, variant.Tuning), + Name: variant.Name, + EnvironmentState: state, + UserInputs: userActions, + Config: applyInputTuning(baseCfg, variant.Tuning), }) } } @@ -123,17 +128,17 @@ func expandKarpenterParameterizedInput(input coverage.Input, fuzzCases int, fuzz base := cloneCoverageInput(input) base.Name = baseName + "/base" - podIdx := findKarpenterPod(base.Objects) - nodePoolIdx := findKarpenterNodePool(base.Objects) + podIdx := findKarpenterPodInUserInputs(base.UserInputs) + nodePoolIdx := findKarpenterNodePoolInUserInputs(base.UserInputs) if podIdx < 0 || nodePoolIdx < 0 { return []coverage.Input{base}, nil } - templatePod, err := unstructuredToPod(base.Objects[podIdx]) + templatePod, err := unstructuredToPod(base.UserInputs[podIdx].Object) if err != nil { return nil, err } - templateNodePool, err := unstructuredToNodePool(base.Objects[nodePoolIdx]) + templateNodePool, err := unstructuredToNodePool(base.UserInputs[nodePoolIdx].Object) if err != nil { return nil, err } @@ -280,8 +285,8 @@ func buildKarpenterVariantInput(input coverage.Input, podIdx int, nodePoolIdx in if err != nil { return coverage.Input{}, fmt.Errorf("convert nodepool for %q: %w", name, err) } - updated.Objects[podIdx] = podObj - updated.Objects[nodePoolIdx] = nodePoolObj + updated.UserInputs[podIdx].Object = podObj + updated.UserInputs[nodePoolIdx].Object = nodePoolObj return updated, nil } @@ -347,88 +352,35 @@ func 
karpenterSelectionKey(selection map[int]int) string { return strings.Join(parts, ",") } -func buildStateFromCoverageInput(builder *tracecheck.ExplorerBuilder, input coverage.Input) (tracecheck.StateNode, error) { +func buildStateFromCoverageInput(builder *tracecheck.ExplorerBuilder, input coverage.Input) (tracecheck.StateNode, []client.Object, error) { if builder == nil { - return tracecheck.StateNode{}, fmt.Errorf("builder is nil") + return tracecheck.StateNode{}, nil, fmt.Errorf("builder is nil") } - objects := make([]client.Object, 0, len(input.Objects)) - for _, obj := range input.Objects { + objects := make([]client.Object, 0, len(input.EnvironmentState.Objects)) + for _, obj := range input.EnvironmentState.Objects { if obj == nil { continue } objects = append(objects, obj.DeepCopy()) } if len(objects) == 0 { - return tracecheck.StateNode{}, fmt.Errorf("input has no objects") - } - - pending := make([]tracecheck.PendingReconcile, 0, len(input.Pending)) - for _, p := range input.Pending { - pending = append(pending, tracecheck.PendingReconcile{ - ReconcilerID: tracecheck.ReconcilerID(p.ControllerID), - Request: reconcile.Request{ - NamespacedName: client.ObjectKey{ - Namespace: p.Key.Namespace, - Name: p.Key.Name, - }, - }, - Source: tracecheck.SourceStateChange, - }) - } - if len(pending) == 0 { - pending = defaultKarpenterPending(objects) + fallback, err := newScenarioObjects() + if err != nil { + return tracecheck.StateNode{}, nil, err + } + objects = append(objects, fallback...) 
} - if len(pending) == 0 { - return tracecheck.StateNode{}, fmt.Errorf("input has no pending reconciles") + if len(objects) == 0 { + return tracecheck.StateNode{}, nil, fmt.Errorf("input has no objects") } - return builder.BuildStartStateFromObjects(objects, pending) -} - -func defaultKarpenterPending(objects []client.Object) []tracecheck.PendingReconcile { pending := make([]tracecheck.PendingReconcile, 0) - hasPod := false - for _, obj := range objects { - if isKarpenterPod(obj) { - hasPod = true - pending = append(pending, - tracecheck.PendingReconcile{ - ReconcilerID: "state.pod", - Request: reconcile.Request{ - NamespacedName: client.ObjectKey{Namespace: obj.GetNamespace(), Name: obj.GetName()}, - }, - Source: tracecheck.SourceStateChange, - }, - tracecheck.PendingReconcile{ - ReconcilerID: "provisioner.trigger.pod", - Request: reconcile.Request{ - NamespacedName: client.ObjectKey{Namespace: obj.GetNamespace(), Name: obj.GetName()}, - }, - Source: tracecheck.SourceStateChange, - }, - ) - } - if isKarpenterNodePool(obj) { - pending = append(pending, tracecheck.PendingReconcile{ - ReconcilerID: "state.nodepool", - Request: reconcile.Request{ - NamespacedName: client.ObjectKey{Namespace: obj.GetNamespace(), Name: obj.GetName()}, - }, - Source: tracecheck.SourceStateChange, - }) - } - } - if hasPod { - pending = append(pending, tracecheck.PendingReconcile{ - ReconcilerID: "provisioner", - Request: reconcile.Request{ - NamespacedName: client.ObjectKey{Name: "singleton"}, - }, - Source: tracecheck.SourceStateChange, - }) + state, err := builder.BuildStartStateFromObjects(objects, pending) + if err != nil { + return tracecheck.StateNode{}, nil, err } - return pending + return state, objects, nil } func applyInputTuning(base tracecheck.ExploreConfig, tuning coverage.InputTuning) tracecheck.ExploreConfig { @@ -448,8 +400,8 @@ func applyInputTuning(base tracecheck.ExploreConfig, tuning coverage.InputTuning } func cloneCoverageInput(input coverage.Input) coverage.Input { 
- objects := make([]*unstructured.Unstructured, 0, len(input.Objects)) - for _, obj := range input.Objects { + objects := make([]*unstructured.Unstructured, 0, len(input.EnvironmentState.Objects)) + for _, obj := range input.EnvironmentState.Objects { if obj == nil { objects = append(objects, nil) continue @@ -457,7 +409,7 @@ func cloneCoverageInput(input coverage.Input) coverage.Input { objects = append(objects, obj.DeepCopy()) } - pending := append([]coverage.Pending(nil), input.Pending...) + userInputs := cloneUserInputs(input.UserInputs) tuning := coverage.InputTuning{ MaxDepth: input.Tuning.MaxDepth, PermuteControllers: append([]string(nil), input.Tuning.PermuteControllers...), @@ -465,11 +417,78 @@ func cloneCoverageInput(input coverage.Input) coverage.Input { StaleLookback: cloneIntMap(input.Tuning.StaleLookback), } return coverage.Input{ - Name: input.Name, - Objects: objects, - Pending: pending, - Tuning: tuning, + Name: input.Name, + EnvironmentState: coverage.EnvironmentState{Objects: objects}, + UserInputs: userInputs, + Tuning: tuning, + } +} + +func buildUserActionsFromCoverageInput(input coverage.Input, seededObjects []client.Object) ([]tracecheck.UserAction, error) { + actions := make([]tracecheck.UserAction, 0, len(input.UserInputs)) + for idx, userInput := range input.UserInputs { + if userInput.Object == nil { + return nil, fmt.Errorf("user input %d (%s) missing object", idx, input.Name) + } + opType := userInput.Type + if opType == event.CREATE && isInputObjectSeeded(userInput.Object, seededObjects) { + opType = event.UPDATE + } + actions = append(actions, tracecheck.UserAction{ + ID: strings.TrimSpace(userInput.ID), + OpType: opType, + Payload: userInput.Object.DeepCopy(), + }) + } + for idx := range actions { + if strings.TrimSpace(actions[idx].ID) == "" { + actions[idx].ID = fmt.Sprintf("user-input-%d", idx) + } + } + return actions, nil +} + +func isInputObjectSeeded(object client.Object, seededObjects []client.Object) bool { + if object == 
nil { + return false } + for _, seeded := range seededObjects { + if sameObjectIdentity(seeded, object) { + return true + } + } + return false +} + +func sameObjectIdentity(a, b client.Object) bool { + if a == nil || b == nil { + return false + } + aGVK := a.GetObjectKind().GroupVersionKind() + bGVK := b.GetObjectKind().GroupVersionKind() + if aGVK.Group != bGVK.Group || aGVK.Kind != bGVK.Kind { + return false + } + return a.GetNamespace() == b.GetNamespace() && a.GetName() == b.GetName() +} + +func cloneUserInputs(inputs []coverage.UserInput) []coverage.UserInput { + if len(inputs) == 0 { + return nil + } + + out := make([]coverage.UserInput, 0, len(inputs)) + for _, input := range inputs { + cloned := coverage.UserInput{ + ID: input.ID, + Type: input.Type, + } + if input.Object != nil { + cloned.Object = input.Object.DeepCopy() + } + out = append(out, cloned) + } + return out } func cloneStringSliceMap(in map[string][]string) map[string][]string { @@ -506,6 +525,30 @@ func findKarpenterPod(objects []*unstructured.Unstructured) int { return -1 } +func findKarpenterPodInUserInputs(userInputs []coverage.UserInput) int { + for idx, input := range userInputs { + if input.Object == nil { + continue + } + if isKarpenterPod(input.Object) { + return idx + } + } + return -1 +} + +func findKarpenterNodePoolInUserInputs(userInputs []coverage.UserInput) int { + for idx, input := range userInputs { + if input.Object == nil { + continue + } + if isKarpenterNodePool(input.Object) { + return idx + } + } + return -1 +} + func findKarpenterNodePool(objects []*unstructured.Unstructured) int { for idx, obj := range objects { if obj == nil { diff --git a/examples/karpenter/scenario_test.go b/examples/karpenter/scenario_test.go index b85427a..82eb2b8 100644 --- a/examples/karpenter/scenario_test.go +++ b/examples/karpenter/scenario_test.go @@ -39,12 +39,9 @@ func TestScenariosFromInputsGeneratesWorkflowVariants(t *testing.T) { seenNoFitVariant := false seenSampledVariant := false for 
_, sc := range scenarios { - if len(sc.InitialState.Objects()) == 0 { + if len(sc.EnvironmentState.Objects()) == 0 { t.Fatalf("scenario %q has empty state objects", sc.Name) } - if len(sc.InitialState.PendingReconciles) == 0 { - t.Fatalf("scenario %q has no pending reconciles", sc.Name) - } if strings.Contains(sc.Name, "/single/pod-") { seenPodVariant = true } @@ -102,11 +99,11 @@ func TestExpandKarpenterParameterizedInputAddsNoFitNodeSelectorVariant(t *testin if variant.Name != target { continue } - podIdx := findKarpenterPod(variant.Objects) + podIdx := findKarpenterPod(variant.EnvironmentState.Objects) if podIdx < 0 { t.Fatalf("variant %q missing pod object", target) } - pod, err := unstructuredToPod(variant.Objects[podIdx]) + pod, err := unstructuredToPod(variant.EnvironmentState.Objects[podIdx]) if err != nil { t.Fatalf("convert pod variant: %v", err) } @@ -136,35 +133,9 @@ func mustKarpenterInput(t *testing.T, name string) coverage.Input { } return coverage.Input{ - Name: name, - Objects: objects, - Pending: []coverage.Pending{ - { - ControllerID: "state.pod", - Key: coverage.NamespacedName{ - Namespace: "default", - Name: "pending", - }, - }, - { - ControllerID: "provisioner.trigger.pod", - Key: coverage.NamespacedName{ - Namespace: "default", - Name: "pending", - }, - }, - { - ControllerID: "provisioner", - Key: coverage.NamespacedName{ - Name: "singleton", - }, - }, - { - ControllerID: "state.nodepool", - Key: coverage.NamespacedName{ - Name: "default", - }, - }, + Name: name, + EnvironmentState: coverage.EnvironmentState{ + Objects: objects, }, } } diff --git a/examples/kratix/main.go b/examples/kratix/main.go index 1aeafce..9d358ca 100644 --- a/examples/kratix/main.go +++ b/examples/kratix/main.go @@ -77,7 +77,7 @@ func main() { os.Exit(1) } - if err := runner.Run(ctx, initialState); err != nil { + if err := runner.Run(ctx, explore.RunInput{EnvironmentState: initialState}); err != nil { fmt.Fprintf(os.Stderr, "session error: %v\n", err) os.Exit(1) } From 
1c9c92bbd2d78894b716f762b4d11a0ab7ca4163 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Mon, 23 Feb 2026 18:49:18 -0800 Subject: [PATCH 19/29] update docs --- README.md | 2 +- examples/knative-serving/AGENTS.md | 1 + examples/knative-serving/README.md | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ea35bae..f6299b8 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,7 @@ go run . } ``` - `Runner` honors the standard `-interactive` and `-dump-output` flags (see `pkg/explore/flags.go`) so you can disable the inspector or persist results when scripting. + `Runner` honors the standard `-interactive`, `-perturb`, and `-dump-output` flags (see `pkg/explore/flags.go`) so you can disable the inspector, skip closed-loop analysis reruns (`--perturb=false`), or persist results when scripting. That’s enough to start evaluating how your controllers interact across different interleavings. diff --git a/examples/knative-serving/AGENTS.md b/examples/knative-serving/AGENTS.md index 2a20e6e..62f7acf 100644 --- a/examples/knative-serving/AGENTS.md +++ b/examples/knative-serving/AGENTS.md @@ -7,6 +7,7 @@ This example drives Kamera’s `Explorer` against a minimal Knative Serving setu - `-timeout `: abort exploration after this duration. Accepts Go duration strings (e.g., `10s`, `2m`). Set to `0` to disable. - `-log-level `: `debug`, `info`, `warn`, or `error` (default: `info`). - `-interactive `: launch the TUI inspector (`true` by default). Set `-interactive=false` for headless runs. +- `-perturb `: controls batch input pipeline shape. `true` (default) runs reference+rerun phases for scenarios that define closed-loop plans; `false` forces a single pass. - `-dump-output `: write converged + aborted states to a file (works even when `-interactive=false`). - `-emit-stats`: record and print reconcile performance stats at the end. 
diff --git a/examples/knative-serving/README.md b/examples/knative-serving/README.md index c9c9f81..ac7ac5d 100644 --- a/examples/knative-serving/README.md +++ b/examples/knative-serving/README.md @@ -22,6 +22,7 @@ Behavior: - `--inputs ` enables batch mode, even if `--parallel` is not set. - `--parallel` with no `--inputs` loads the default `inputs-example.json`. - `--timeout` applies per input/scenario run, not as an overall batch timeout. +- `--perturb=false` runs only the first pass for each input (single-pass simulation). ```bash go run . \ @@ -38,5 +39,6 @@ go run . \ --dump-output /tmp/knative-dumps \ --depth 100 \ --timeout 60s \ + --perturb=false \ --dump-stats /tmp/knative-stats ``` From 1e1237a156d5debd1dbcd665742a324438803f6f Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Mon, 23 Feb 2026 18:57:38 -0800 Subject: [PATCH 20/29] remove explicit name from knative template spec --- examples/knative-serving/two-step-workflow.json | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/knative-serving/two-step-workflow.json b/examples/knative-serving/two-step-workflow.json index d3377b8..7f85b50 100644 --- a/examples/knative-serving/two-step-workflow.json +++ b/examples/knative-serving/two-step-workflow.json @@ -28,8 +28,7 @@ "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", "autoscaling.knative.dev/initial-scale": "1", "autoscaling.knative.dev/min-scale": "0" - }, - "name": "kamera-test" + } }, "spec": { "containers": [ @@ -61,8 +60,7 @@ "autoscaling.knative.dev/class": "kpa.autoscaling.knative.dev", "autoscaling.knative.dev/initial-scale": "2", "autoscaling.knative.dev/min-scale": "2" - }, - "name": "kamera-test" + } }, "spec": { "containers": [ @@ -79,4 +77,4 @@ } ] } -] \ No newline at end of file +] From f6c0273d73388fac8d467acc97ca098d319cb41c Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Tue, 24 Feb 2026 15:26:26 -0800 Subject: [PATCH 21/29] fix missing-object diff detection in converged-state analysis --- 
.beads/issues.jsonl | 5 +++-- pkg/analysis/diff.go | 29 +++++++++++++++-------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index 4697825..3230f11 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -1,3 +1,4 @@ +{"id":"kamera-07e","title":"issue","description":"Debug Karpenter example\n","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-23T18:58:07.650966-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-23T18:58:31.451848-08:00"} {"id":"kamera-0cl","title":"audit Apply semantics","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-22T17:53:50.031508-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-22T17:53:50.031508-08:00"} {"id":"kamera-0lr","title":"Expand Karpenter Harness Input Workflows","description":"Apply a Knative-style multi-workflow input expansion to the Karpenter example harness, starting with small deterministic workflow variants.","status":"closed","priority":2,"issue_type":"epic","owner":"timg.goodwin@gmail.com","created_at":"2026-02-17T19:33:05.975494-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-17T19:47:52.010981-08:00","closed_at":"2026-02-17T19:47:52.010981-08:00","close_reason":"Closed"} {"id":"kamera-0lr.1","title":"Add pod constraint variants to Karpenter harness","description":"Implement workflow variants that mutate pod requests/selectors/affinity-like constraints to broaden provisioning paths while keeping scenarios deterministic.","status":"closed","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-17T19:33:16.273655-08:00","created_by":"Tim 
Goodwin","updated_at":"2026-02-17T19:47:52.006845-08:00","closed_at":"2026-02-17T19:47:52.006845-08:00","close_reason":"Closed","dependencies":[{"issue_id":"kamera-0lr.1","depends_on_id":"kamera-0lr","type":"parent-child","created_at":"2026-02-17T19:33:16.274981-08:00","created_by":"Tim Goodwin"}]} @@ -21,13 +22,13 @@ {"id":"kamera-9gz.2","title":"Add UserAction/UserController types in pkg/tracecheck","description":"Add user workflow data model types in `pkg/tracecheck`:\n- `UserAction` with data-only fields (`id`, `type`, `payload`)\n- `UserController` attached to each `Explorer` with ordered action list and branch-index-based execution API\n\n`UserController` should have an internal list of UserActions and each time an action is taken, we apply one of these actions. The apply strategy should use the `replay.Client`\ninfrastructure, similar to how controller-runtime controllers in kamera make\nAPI writes through replay.Client. That way, we get effect tracking for free.\n","acceptance_criteria":"- `UserAction` is data-only and compiles without function/closure fields.\n- `Explorer` owns exactly one `UserController` instance for the run.\n- `UserController` API supports \"execute next action by branch index\" and returns normal change/effect result shape.","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:31:51.464481-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-20T17:20:44.121883-08:00","closed_at":"2026-02-20T17:20:44.121883-08:00","close_reason":"Implemented UserAction/UserController model and Explorer wiring with tracecheck tests","dependencies":[{"issue_id":"kamera-9gz.2","depends_on_id":"kamera-9gz","type":"parent-child","created_at":"2026-02-20T16:31:51.466041-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-9gz.3","title":"Add branch-local nextUserActionIdx state plumbing","description":"Extend `StateNode` with branch-local user workflow progress (`nextUserActionIdx`) and wire 
clone/copy/plumbing so branch state evolves independently.\n","acceptance_criteria":"- `StateNode` includes `nextUserActionIdx int` with zero default.\n- All clone/copy/fork paths preserve and isolate this field per branch.\n- Tests verify incrementing on one branch does not mutate sibling branch progress.","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:32:01.805306-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-20T17:23:40.784199-08:00","closed_at":"2026-02-20T17:23:40.784199-08:00","close_reason":"Added branch-local nextUserActionIdx state plumbing with clone/copy preservation tests","dependencies":[{"issue_id":"kamera-9gz.3","depends_on_id":"kamera-9gz","type":"parent-child","created_at":"2026-02-20T16:32:01.806828-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-9gz.4","title":"Add shouldApplyNextUserAction scheduler abstraction","description":"Introduce scheduler abstraction in `Explorer`:\n- `shouldApplyNextUserAction(state StateNode) bool`\n- initial internal policy: apply when branch is quiescent\n\nKeep outer explore loop stable so future scheduling policies can plug in.\n","acceptance_criteria":"- `shouldApplyNextUserAction` exists and is called from the explore step loop.\n- Quiescence-only v1 policy is encapsulated behind this method.\n- Unit tests validate true/false behavior for quiescent vs non-quiescent states.","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:32:12.07745-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-20T17:25:11.529123-08:00","closed_at":"2026-02-20T17:25:11.529123-08:00","close_reason":"Added shouldApplyNextUserAction quiescence scheduler abstraction, loop call, and unit tests","dependencies":[{"issue_id":"kamera-9gz.4","depends_on_id":"kamera-9gz","type":"parent-child","created_at":"2026-02-20T16:32:12.078869-08:00","created_by":"Tim Goodwin"}]} 
-{"id":"kamera-9gz.5","title":"Integrate user-action step path into explore loop","description":"Integrate a dedicated user-action execution branch into `pkg/tracecheck/explore.go` before terminal classification:\n- execute one user action\n- apply replay/effects through existing pipeline\n- derive triggered reconciles through existing trigger manager\n- update pending reconciles and successor state\n- increment `nextUserActionIdx`\n- enqueue successor and continue\n","acceptance_criteria":"- User action execution happens in the main explore loop before terminal classification.\n- Resulting writes/effects use the same recording/application path as controller reconcile steps.\n- Trigger fanout and pending reconcile updates are produced from existing trigger manager logic.\n- Successor state increments `nextUserActionIdx` exactly once per successful action step.","status":"open","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:32:22.25174-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-20T16:32:22.25174-08:00","dependencies":[{"issue_id":"kamera-9gz.5","depends_on_id":"kamera-9gz","type":"parent-child","created_at":"2026-02-20T16:32:22.253717-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.5","depends_on_id":"kamera-9gz.1","type":"blocks","created_at":"2026-02-20T16:33:03.279095-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.5","depends_on_id":"kamera-9gz.2","type":"blocks","created_at":"2026-02-20T16:33:23.675835-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.5","depends_on_id":"kamera-9gz.3","type":"blocks","created_at":"2026-02-20T16:33:33.940463-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.5","depends_on_id":"kamera-9gz.4","type":"blocks","created_at":"2026-02-20T16:33:44.201368-08:00","created_by":"Tim Goodwin"}]} +{"id":"kamera-9gz.5","title":"Integrate user-action step path into explore loop","description":"Integrate a dedicated user-action execution branch 
into `pkg/tracecheck/explore.go` before terminal classification:\n- execute one user action\n- apply replay/effects through existing pipeline\n- derive triggered reconciles through existing trigger manager\n- update pending reconciles and successor state\n- increment `nextUserActionIdx`\n- enqueue successor and continue\n","acceptance_criteria":"- User action execution happens in the main explore loop before terminal classification.\n- Resulting writes/effects use the same recording/application path as controller reconcile steps.\n- Trigger fanout and pending reconcile updates are produced from existing trigger manager logic.\n- Successor state increments `nextUserActionIdx` exactly once per successful action step.","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:32:22.25174-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-23T18:59:06.618338-08:00","closed_at":"2026-02-23T18:59:06.618338-08:00","close_reason":"Closed","dependencies":[{"issue_id":"kamera-9gz.5","depends_on_id":"kamera-9gz","type":"parent-child","created_at":"2026-02-20T16:32:22.253717-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.5","depends_on_id":"kamera-9gz.1","type":"blocks","created_at":"2026-02-20T16:33:03.279095-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.5","depends_on_id":"kamera-9gz.2","type":"blocks","created_at":"2026-02-20T16:33:23.675835-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.5","depends_on_id":"kamera-9gz.3","type":"blocks","created_at":"2026-02-20T16:33:33.940463-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.5","depends_on_id":"kamera-9gz.4","type":"blocks","created_at":"2026-02-20T16:33:44.201368-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-9gz.6","title":"Require no remaining user actions for terminal convergence","description":"Update convergence classification so a state is terminal only when:\n- no actionable pending reconciles, and\n- no 
remaining user actions for that branch.\n","acceptance_criteria":"- Convergence gate checks both reconcile quiescence and user-action exhaustion.\n- States with remaining user actions are not classified as converged.\n- Regression tests cover both converged and non-converged edge cases.","status":"open","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:32:32.593171-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-20T16:32:32.593171-08:00","dependencies":[{"issue_id":"kamera-9gz.6","depends_on_id":"kamera-9gz","type":"parent-child","created_at":"2026-02-20T16:32:32.59476-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.6","depends_on_id":"kamera-9gz.5","type":"blocks","created_at":"2026-02-20T16:33:54.459417-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-9gz.7","title":"Add user-action history metadata and mutating-action guardrail","description":"Ensure user action observability and guardrails:\n- append synthetic history step with `ControllerID=\"UserController\"`\n- emit required metadata for dumps/inspector explainability\n\nThis should just follow the same pattern as existing controller write paths.\n- e.g. 
`RecordEffect` -- if we just use the `replay.Client` then we get this\n for free.\n","acceptance_criteria":"- `ExecutionHistory` includes user-action steps labeled `ControllerID=\"UserController\"`.\n- Dump/inspector output contains enough metadata to identify which user action ran.\n- Non-mutating user actions are detected and handled per resolved v1 policy with explicit test coverage.","status":"open","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:32:42.793492-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-20T16:43:07.13467-08:00","dependencies":[{"issue_id":"kamera-9gz.7","depends_on_id":"kamera-9gz","type":"parent-child","created_at":"2026-02-20T16:32:42.79491-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.7","depends_on_id":"kamera-9gz.1","type":"blocks","created_at":"2026-02-20T16:33:13.429091-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.7","depends_on_id":"kamera-9gz.5","type":"blocks","created_at":"2026-02-20T16:34:04.717193-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-9gz.8","title":"Add tracecheck tests for user-action scheduling and fanout semantics","description":"Add focused tests covering user workflow semantics in `pkg/tracecheck`:\n- quiescence scheduling behavior\n- user-action-triggered reconcile fanout\n- branch-local index progression\n- mutating-action invariant behavior\n","acceptance_criteria":"- Test suite includes deterministic cases for all four behavior classes listed above.\n- Tests fail on regressions in scheduling, fanout, progression, or no-op invariant handling.\n- `go test ./pkg/tracecheck/...` passes with the new coverage.","status":"open","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:32:53.063728-08:00","created_by":"Tim 
Goodwin","updated_at":"2026-02-20T16:32:53.063728-08:00","dependencies":[{"issue_id":"kamera-9gz.8","depends_on_id":"kamera-9gz","type":"parent-child","created_at":"2026-02-20T16:32:53.065158-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.8","depends_on_id":"kamera-9gz.5","type":"blocks","created_at":"2026-02-20T16:34:14.95259-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.8","depends_on_id":"kamera-9gz.6","type":"blocks","created_at":"2026-02-20T16:34:25.197422-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-9gz.8","depends_on_id":"kamera-9gz.7","type":"blocks","created_at":"2026-02-20T16:34:35.453716-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-aa8","title":"Add reconciler timeline to dump output","description":"Enhance dump.jsonl or analysis tools to produce a reconciler timeline visualization:\n- Side-by-side view of which reconciler runs at each step in each path\n- Highlight where paths diverge in reconciler ordering\n- Show effect counts per step\n- Optional: ASCII art or mermaid diagram output\n\nExample:\nStep | Path 0 | Path 1\n-----|---------------------|---------------------\n 6 | RevisionReconciler | RevisionReconciler\n 7 | RouteReconciler | RouteReconciler\n 12 | KPA | RevisionReconciler \u003c-- diverge","status":"open","priority":3,"issue_type":"feature","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T12:27:35.173667-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T12:27:35.173667-08:00"} {"id":"kamera-ata","title":"Execution-guided input generation (coverage strategy v2)","description":"Redesign the coverage/input generation pipeline from a fully-static approach to an execution-guided (closed-loop) approach. See docs/design/coverage-strategy-v2.md. 
The key change: instead of fabricating all initial state statically, we (1) generate seeds from user-facing CRDs only, (2) derive perturbation profiles statically from hotspot analysis, (3) run a deterministic baseline per seed, (4) scan the baseline trace for interesting checkpoints, and (5) branch from checkpoints with perturbations applied. This replaces the original plan of fabricating intermediate objects and event histories before execution.","status":"tombstone","priority":1,"issue_type":"epic","owner":"timg.goodwin@gmail.com","created_at":"2026-02-11T19:43:46.329542-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-12T10:00:16.435562-08:00"} -{"id":"kamera-b1r","title":"Add StateBefore comparison to analysis + explore readset instrumentation","description":"## Problem\n\nThe last write analysis tells us WHO wrote a differing object, but not WHY they wrote different values. To understand the root cause, we need to compare what each controller SAW (StateBefore) when it made its write.\n\nCurrently this requires using the Go API directly - the CLI doesn't expose StateBefore diffs.\n\n## Short-term: CLI Enhancement\n\nAdd StateBefore comparison to the report output:\n- For each differing object, show what the last-writing controller saw in each path\n- Highlight differences in StateBefore that might explain the divergent writes\n\n## Longer-term: Readset Instrumentation\n\nCurrently we capture writesets via EffectRecorder, but we don't capture readsets (what a controller actually read during reconciliation). 
StateBefore is a proxy - it's the full state available to the controller - but not what it actually accessed.\n\nConsider instrumenting controllers to capture actual reads:\n- Which objects were Get()'d or List()'d\n- Which fields were accessed\n- This would make \"why did this differ?\" analysis much more precise\n\n## Discussion Points\n\n- How invasive would readset instrumentation be?\n- Could we wrap the client.Client interface to intercept reads?\n- Trade-off: precision vs instrumentation complexity\n- Could be useful for other analyses (dependency inference, minimal state for replay)","status":"open","priority":1,"issue_type":"feature","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T17:01:59.480026-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T17:01:59.480026-08:00"} +{"id":"kamera-b1r","title":"Add StateBefore comparison to analysis + explore readset instrumentation","description":"## Problem\n\nThe last write analysis tells us WHO wrote a differing object, but not WHY they wrote different values. To understand the root cause, we need to compare what each controller SAW (StateBefore) when it made its write.\n\nCurrently this requires using the Go API directly - the CLI doesn't expose StateBefore diffs.\n\n## Short-term: CLI Enhancement\n\nAdd StateBefore comparison to the report output:\n- For each differing object, show what the last-writing controller saw in each path\n- Highlight differences in StateBefore that might explain the divergent writes\n\n## Longer-term: Readset Instrumentation\n\nCurrently we capture writesets via EffectRecorder, but we don't capture readsets (what a controller actually read during reconciliation). 
StateBefore is a proxy - it's the full state available to the controller - but not what it actually accessed.\n\nConsider instrumenting controllers to capture actual reads:\n- Which objects were Get()'d or List()'d\n- Which fields were accessed\n- This would make \"why did this differ?\" analysis much more precise\n\n## Discussion Points\n\n- How invasive would readset instrumentation be?\n- Could we wrap the client.Client interface to intercept reads?\n- Trade-off: precision vs instrumentation complexity\n- Could be useful for other analyses (dependency inference, minimal state for replay)","status":"closed","priority":1,"issue_type":"feature","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T17:01:59.480026-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-23T19:01:02.075793-08:00","closed_at":"2026-02-23T19:01:02.075793-08:00","close_reason":"Closed"} {"id":"kamera-bag","title":"Crossplane example hits max depth due to no-op CompositionRevision status updates","description":"Repro: examples/crossplane headless run hits max depth with alternating CompositionReconciler/CompositionRevisionReconciler. Dump shows repeated CompositionRevision UPDATEs with identical semantic hash (no deltas) after initial update. Likely loop caused by Status().Update emitting no-op updates (resourceVersion/managedFields only), which re-enqueue via watch/ownerRef. 
Need to determine if this is a Crossplane behavior in real apiserver or a Kamera modeling artifact.\\n\\nEvidence: /tmp/kamera-crossplane-depth100.jsonl (steps 6+ update same hash).\\n\\nNext steps:\\n- Verify APIServer behavior for no-op Status().Update (does RV change / watch event fire).\\n- If yes, Crossplane bug: add guard to skip Status().Update when conditions unchanged.\\n- If no, adjust Kamera to drop no-op write effects from triggering.\\n","status":"open","priority":0,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-21T18:12:58.827302-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-21T18:12:58.827302-08:00","comments":[{"id":3,"issue_id":"kamera-bag","author":"Tim Goodwin","text":"Repro command:\n GOCACHE=/Users/tgoodwin/projects/kamera/.cache/go-build go run . -depth 100 -interactive=false -dump-output /tmp/kamera-crossplane-depth100.jsonl -log-level info -emit-stats (from examples/crossplane)\nInspect:\n go run ./cmd/inspect --dump /tmp/kamera-crossplane-depth100.jsonl --interactive=false\nEvidence: repeated CompositionRevision UPDATE effects with same hash after first update (steps 6+).","created_at":"2026-01-22T02:25:37Z"}]} {"id":"kamera-bcq","title":"Add terminal state diff to divergence analysis output","description":"When multiple converged states exist, automatically diff the objects that have different hashes to show exactly which fields differ (like ServerlessService.Ready status)","status":"open","priority":2,"issue_type":"feature","owner":"timg.goodwin@gmail.com","created_at":"2026-01-13T22:30:26.294078-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-13T22:30:26.294078-08:00"} {"id":"kamera-bj5","title":"make karpenter provisioner batch timing deterministic in kamera harness","description":"Provisioner batching currently uses wall-clock timers (clock.RealClock plus Batcher.Wait timers), while singleton enqueue uses simclock. 
Update harness wiring so provisioner batch timing is deterministic under exploration depth/time and add regression coverage.","status":"open","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-17T19:14:20.294796-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-17T19:14:20.294796-08:00"} diff --git a/pkg/analysis/diff.go b/pkg/analysis/diff.go index 0d35cd4..13e7acf 100644 --- a/pkg/analysis/diff.go +++ b/pkg/analysis/diff.go @@ -62,32 +62,33 @@ func DiffConvergedStates(dump *Dump) *ConvergedStateDiff { for key := range allKeys { byState := make(map[string]snapshot.VersionHash) - var firstHash snapshot.VersionHash - firstSet := false + var baselineHash snapshot.VersionHash + baselineSet := false differs := false + presentCount := 0 - for stateID, objMap := range stateObjects { + // Iterate states in dump order (not map order) for deterministic behavior. + for _, state := range dump.States { + stateID := state.ID + objMap := stateObjects[stateID] hash, exists := objMap[key] if exists { byState[stateID] = hash - if !firstSet { - firstHash = hash - firstSet = true - } else if hash.Value != firstHash.Value { + presentCount++ + if !baselineSet { + baselineHash = hash + baselineSet = true + } else if hash.Value != baselineHash.Value { differs = true } } else { - // Object missing in this state - represents a difference byState[stateID] = snapshot.VersionHash{} // empty hash - if firstSet { - differs = true - } } } - // If we never set firstHash but have entries, that means all states - // are missing this key (shouldn't happen, but handle it) - if !firstSet && len(byState) > 0 { + // A key is differing if it is missing in at least one state. + // allKeys is built from a union of state objects, so presentCount will be >0. 
+ if presentCount != len(dump.States) { differs = true } From 1721a227bc0af5286c9e4bc7717fde5f7f0c7d05 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Wed, 25 Feb 2026 16:45:06 -0800 Subject: [PATCH 22/29] update karpenter tests --- .beads/issues.jsonl | 1 + examples/karpenter/README.md | 36 ++++++- examples/karpenter/inputs.json | 35 +++---- examples/karpenter/main.go | 91 +++++++++++++++-- examples/karpenter/scenario.go | 145 ++++++++++++++++++++-------- examples/karpenter/scenario_test.go | 92 ++++++++++++++++-- examples/karpenter/smoke_test.go | 4 +- 7 files changed, 323 insertions(+), 81 deletions(-) diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index 3230f11..bb86bfc 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -16,6 +16,7 @@ {"id":"kamera-6xp","title":"Create pending queue analyzer for dump files","description":"Build a tool that analyzes the pending reconcile queue dynamics:\n- Show when each reconciler enters/exits the pending queue\n- Compare queue state across paths at equivalent points\n- Identify reconcilers that are triggered differently between paths\n- Highlight timing differences (e.g., 'EC in queue when Pod Ready' vs not)\n\nThis helps diagnose ordering-dependent divergences.","status":"open","priority":3,"issue_type":"feature","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T12:27:34.838358-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T12:27:34.838358-08:00"} {"id":"kamera-7tl","title":"Backward-trace divergence analysis framework","description":"A composable framework for analyzing divergence in kamera dump files, using backward-trace methodology.\n\n## Design Philosophy\n\nStart from known divergence (final state differences) and trace backwards to root cause. 
This avoids noise from 'diamond' patterns where paths diverge then reconverge.\n\n## Architecture\n\nThree composable modules in pkg/analysis/:\n\n**Module 0: Converged State Diff** (kamera-pwm)\n- Input: dump.jsonl\n- Output: Objects that differ between converged states\n- Answers: 'In what ways do the final states differ?'\n\n**Module 1: Last Write Analysis** (new task)\n- Input: dump.jsonl + differing object keys\n- Output: For each object+path, the step that produced its final value + reconciler's input state\n- Answers: 'What did the reconciler see when it wrote this value?'\n\n**Module 2: Object Lifecycle Analysis** (kamera-gwc)\n- Input: dump.jsonl + path + object + target hash\n- Output: All steps where object had that hash\n- Answers: 'Does this object take on this value at some point in this path?'\n\n## Composition Flow\n\n1. Run Module 0 to identify differing objects\n2. Run Module 1 on each differing object to find last writes and input diffs\n3. If inputs differ, run Module 2 to check 'does the missing state appear later?'\n4. 
If yes, points to watch/trigger issue; if no, points to state dependency\n\n## Package Structure\n\npkg/analysis/\n├── types.go # Shared types (moved from pkg/interactive)\n├── dump.go # Dump loading utilities \n├── diff.go # Module 0\n├── lastwrite.go # Module 1\n├── lifecycle.go # Module 2\n\n## Validation\n\nUse trial-1 dump as test fixture - should identify EndpointsController/Pod.Ready issue.","status":"open","priority":1,"issue_type":"epic","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T14:06:33.625361-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T14:06:33.625361-08:00"} {"id":"kamera-8ro","title":"Add step-level metadata to dump output","description":"Enhance dump.jsonl with additional per-step metadata useful for analysis:\n- Add 'triggeredBy' field showing what caused this reconciler to run\n- Add 'stateHashBefore'/'stateHashAfter' for quick comparison\n- Include pending queue snapshot at each step (currently only at end)\n- Add timing/depth information\n\nThis makes it easier to understand why reconcilers ran in a particular order.","status":"open","priority":3,"issue_type":"feature","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T12:27:48.366206-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T12:27:48.366206-08:00"} +{"id":"kamera-8y6","title":"Audit and fix CleanupReconciler requeue behavior after DELETE-with-finalizer patches","description":"Background\n- Cleanup scheduling currently enqueues CleanupReconciler only on event.MARK_FOR_DELETION paths in pkg/tracecheck/trigger.go.\n- In Kubernetes, DELETE sets deletionTimestamp, and finalizers can block final removal.\n- We are observing a flow where after MARK_FOR_DELETION, a controller emits PATCH/UPDATE that removes finalizers, but CleanupReconciler is not re-invoked and removal does not occur.\n\nObjective\n- Ensure CleanupReconciler is queued again on delete-scope mutations after an object is marked for deletion and finalizers have been removed (or deletion 
can otherwise proceed).\n\nScope\n- Update trigger scheduling semantics for CleanupReconciler requeue conditions.\n- Keep non-deletion object updates from causing cleanup requeues.\n- Add focused tests around getTriggered (or equivalent) capturing delete then finalizer-removal PATCH flow.\n- Validate via existing integration traces that CleanupReconciler emits REMOVE at the expected step.\n\nAcceptance\n- A non-deletion object update does not requeue CleanupReconciler.\n- After MARK_FOR_DELETION, a subsequent PATCH/UPDATE on that object with deletionTimestamp set and cleanup-eligible state triggers a CleanupReconciler pending reconcile for the same object.\n- Regression for this scenario is covered by a test and remains deterministic.\n\nDesign notes\n- Candidate: in pkg/tracecheck/trigger.go, add an explicit path in getTriggered for mutation effects on deletionTimestamped objects that are cleanup-eligible.\n- Preserve existing MARK_FOR_DELETION handling and add this as additive logic.\n\nObserved reference\n- sleevectrl/pkg/controller/sleeve_finalizer_controller.go\n- pkg/tracecheck/trigger.go\n- pkg/tracecheck/explorebuilder.go\n","status":"open","priority":0,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-24T17:20:24.784251-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-24T17:21:08.454411-08:00","labels":["cleanup-reconciler","tracecheck"]} {"id":"kamera-917","title":"Assess if completedPathDedup optimization is subsumed by SubtreeCompletion","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-13T12:57:19.74152-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-13T12:57:19.74152-08:00","dependencies":[{"issue_id":"kamera-917","depends_on_id":"kamera-qrs","type":"parent-child","created_at":"2026-01-13T12:58:51.164052-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-9gz","title":"UserController first-class workflow integration in tracecheck","description":"Implement the 
design in `docs/plans/2026-02-21-user-controller-workflow-design.md` so user workflows become first-class in `pkg/tracecheck/explore.go`.\n\nScope for this epic:\n- model one `UserController` actor per `Explorer`\n- execute ordered user actions interleaved with reconcile expansion\n- track branch-local user progress via `StateNode.nextUserActionIdx`\n- preserve replay/effect semantics and trigger behavior\n- make user steps visible in history/dumps and convergence checks\n","acceptance_criteria":"- All child tasks are merged with tests passing via `go test ./pkg/tracecheck/...` and relevant analyze/inspect packages.\n- Exploration can represent user actions as in-engine steps, not just outer scenario phases.\n- Terminal convergence requires both reconcile quiescence and no remaining user actions.","status":"open","priority":1,"issue_type":"epic","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:31:30.892404-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-20T16:31:30.892404-08:00"} {"id":"kamera-9gz.1","title":"Resolve v1 policy decisions for user-action semantics","description":"Turn the three open questions in the design doc into explicit v1 policies and lock them into implementation guidance:\n1) non-mutating user action behavior,\n2) history frame representation for user steps,\n3) branch behavior when multiple states are quiescent.\n\nDocument decisions in the design doc and align acceptance tests to those decisions.\n","acceptance_criteria":"- `docs/plans/2026-02-21-user-controller-workflow-design.md` contains resolved v1 decisions for all open questions.\n- Chosen policy is reflected as concrete expected behavior statements used by implementation/tests.\n- Any intentionally deferred choices are explicitly labeled `post-v1`.","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T16:31:41.229276-08:00","created_by":"Tim 
Goodwin","updated_at":"2026-02-20T16:49:21.348767-08:00","closed_at":"2026-02-20T16:49:21.348767-08:00","close_reason":"Closed","dependencies":[{"issue_id":"kamera-9gz.1","depends_on_id":"kamera-9gz","type":"parent-child","created_at":"2026-02-20T16:31:41.230387-08:00","created_by":"Tim Goodwin"}]} diff --git a/examples/karpenter/README.md b/examples/karpenter/README.md index bcdde45..17eb5e1 100644 --- a/examples/karpenter/README.md +++ b/examples/karpenter/README.md @@ -17,11 +17,13 @@ go run . -interactive=false -dump-output /tmp/kamera-karpenter.jsonl ## Batch inputs -To run a generated inputs file, pass `--inputs` and set dump directories for per-scenario output. For example: +`--inputs` defaults to `input-example.json` in the example directory. +To run a custom file, pass `--inputs ` and set dump directories for per-scenario output. +For example: ```bash go run . \ - --inputs inputs.json \ + --inputs input-example.json \ --fuzz-cases 12 \ --fuzz-seed 1337 \ --dump-output /tmp/karpenter-dumps \ @@ -31,6 +33,36 @@ go run . \ - `--fuzz-cases` controls how many sampled parameterized variants are generated per input. - `--fuzz-seed` keeps sampled variants deterministic across runs. +## Input model (`environmentState` vs `userInputs`) + +`input-example.json` models this scenario as: +- **Environment state:** a ready `TestNodeClass` and `NodePool` that represent cluster configuration. +- **User action:** creating one unschedulable pending `Pod`. + +This mirrors typical Karpenter behavior where provisioning starts when unschedulable +pods appear and Karpenter computes capacity from existing NodePool/NodeClass +configuration. + +## Startup Semantics (Important) + +In this harness, `environmentState` means "objects already exist in the API state" +but does **not** automatically mean every controller-local cache has already been +hydrated. 
+ +Karpenter uses process-local in-memory state (for example, shared `state.Cluster`) +that is populated by reconcilers such as `state.nodepool`. Because of that: + +- A reconcile can be behaviorally required even if it writes no API objects. +- `state.nodepool` may look like a no-op in trace effects, but it can still be + needed so `provisioner` sees NodePools in local cluster state. +- If startup triggers for environment objects are skipped, `provisioner` can + converge early (for example, "no dynamic nodepools found") even though a + NodePool object exists in `environmentState`. + +Design intent: users should model inputs via `environmentState` + `userInputs` +without hand-authoring pending reconciles. The harness should derive startup +pending work from simulation semantics (subscriptions/watches/dependencies). + ## Observed flow (expected) 1. `state.pod` records the pending pod in cluster state. diff --git a/examples/karpenter/inputs.json b/examples/karpenter/inputs.json index b901b13..55ea467 100644 --- a/examples/karpenter/inputs.json +++ b/examples/karpenter/inputs.json @@ -2,19 +2,8 @@ { "name": "karpenter-default", "environmentState": { - "objects": [] - }, - "tuning": { - "maxDepth": 0, - "permuteControllers": null, - "staleReads": null, - "staleLookback": null - }, - "userInputs": [ - { - "id": "", - "type": "CREATE", - "object": { + "objects": [ + { "apiVersion": "karpenter.test.sh/v1alpha1", "kind": "TestNodeClass", "metadata": { @@ -39,12 +28,8 @@ } ] } - } - }, - { - "id": "", - "type": "CREATE", - "object": { + }, + { "apiVersion": "karpenter.sh/v1", "kind": "NodePool", "metadata": { @@ -115,9 +100,17 @@ "nodes": null } } - }, + ] + }, + "tuning": { + "maxDepth": 50, + "permuteControllers": null, + "staleReads": null, + "staleLookback": null + }, + "userInputs": [ { - "id": "", + "id": "create pending pod", "type": "CREATE", "object": { "apiVersion": "v1", diff --git a/examples/karpenter/main.go b/examples/karpenter/main.go index 52ddb00..43581cd 
100644 --- a/examples/karpenter/main.go +++ b/examples/karpenter/main.go @@ -2,9 +2,12 @@ package main import ( "context" + "errors" "flag" "fmt" "os" + "path/filepath" + "strings" "github.com/tgoodwin/kamera/pkg/coverage" "github.com/tgoodwin/kamera/pkg/explore" @@ -14,6 +17,7 @@ var fuzzCasesFlag = flag.Int("fuzz-cases", 12, "number of sampled parameterized var fuzzSeedFlag = flag.Int64("fuzz-seed", 1337, "seed for deterministic sampled parameterized scenario generation") func main() { + setDefaultKarpenterInputsFlag() flag.Parse() ctx := context.Background() @@ -27,15 +31,23 @@ func main() { builder.SetConfig(loadedCfg) } - if inputsPath := explore.InputsPath(); inputsPath != "" { - if explore.InteractiveEnabled() { - fmt.Fprintln(os.Stderr, "interactive ignored in batch mode") - } - inputs, err := coverage.LoadInputs(inputsPath) - if err != nil { - fmt.Fprintf(os.Stderr, "load inputs: %v\n", err) + inputsPath := strings.TrimSpace(explore.InputsPath()) + inputs, err := loadInputsForRun(inputsPath) + if err != nil { + fmt.Fprintf(os.Stderr, "load inputs: %v\n", err) + os.Exit(1) + } + + if !explore.InteractiveEnabled() { + if len(inputs) == 0 { + if inputsPath != "" { + fmt.Fprintf(os.Stderr, "batch mode requires valid --inputs input file, got %q\n", inputsPath) + } else { + fmt.Fprintf(os.Stderr, "batch mode requires --inputs\n") + } os.Exit(1) } + scenarios, err := scenariosFromInputs(builder, inputs) if err != nil { fmt.Fprintf(os.Stderr, "convert inputs: %v\n", err) @@ -54,14 +66,75 @@ func main() { return } - initialState := buildInitialKarpenterState(builder) + if len(inputs) == 0 { + fmt.Fprintf(os.Stderr, "interactive mode requires one input file: either --inputs or default %q\n", defaultKarpenterInputsSearchPaths[0]) + os.Exit(1) + } + if len(inputs) != 1 { + fmt.Fprintf(os.Stderr, "expected a single input for interactive mode, got %d\n", len(inputs)) + os.Exit(1) + } + + builder.SetConfig(applyInputTuning(builder.Config(), inputs[0].Tuning)) + + 
initialState, seededObjects, err := buildStateFromCoverageInput(builder, inputs[0]) + if err != nil { + fmt.Fprintf(os.Stderr, "build initial state from input: %v\n", err) + os.Exit(1) + } + userActions, err := buildUserActionsFromCoverageInput(inputs[0], seededObjects) + if err != nil { + fmt.Fprintf(os.Stderr, "build user actions from input: %v\n", err) + os.Exit(1) + } + runner, err := explore.NewRunner(builder) if err != nil { fmt.Fprintf(os.Stderr, "runner setup error: %v\n", err) os.Exit(1) } - if err := runner.Run(ctx, explore.RunInput{EnvironmentState: initialState}); err != nil { + + if err := runner.Run(ctx, explore.RunInput{EnvironmentState: initialState, UserActions: userActions}); err != nil { fmt.Fprintf(os.Stderr, "session error: %v\n", err) os.Exit(1) } } + +func setDefaultKarpenterInputsFlag() { + inputsPath, err := defaultKarpenterInputsPath() + if err != nil { + return + } + + _ = flag.CommandLine.Set("inputs", inputsPath) +} + +func loadInputsForRun(inputsPath string) ([]coverage.Input, error) { + if inputsPath == "" { + return nil, nil + } + + inputs, err := coverage.LoadInputs(inputsPath) + if err != nil { + return nil, fmt.Errorf("load inputs from %s: %w", inputsPath, err) + } + return inputs, nil +} + +func defaultKarpenterInputsPath() (string, error) { + for _, path := range defaultKarpenterInputsSearchPaths { + _, err := os.Stat(path) + if err == nil { + return path, nil + } + if !errors.Is(err, os.ErrNotExist) { + return "", fmt.Errorf("check default inputs path %s: %w", path, err) + } + } + return "", fmt.Errorf("default inputs file not found in search paths %v: %w", defaultKarpenterInputsSearchPaths, os.ErrNotExist) +} + +var defaultKarpenterInputsSearchPaths = []string{ + "input-example.json", + filepath.Join("examples", "karpenter", "input-example.json"), +} diff --git a/examples/karpenter/scenario.go b/examples/karpenter/scenario.go index 88352a8..3f0b298 100644 --- a/examples/karpenter/scenario.go +++ 
b/examples/karpenter/scenario.go @@ -25,7 +25,7 @@ import ( "sigs.k8s.io/karpenter/pkg/test/v1alpha1" ) -func newScenarioObjects() ([]client.Object, error) { +func newEnvironmentObjects() ([]client.Object, error) { // TestNodeClass (fake cloud provider) nc := test.NodeClass(v1alpha1.TestNodeClass{ObjectMeta: metav1.ObjectMeta{Name: "default", UID: types.UID("testnodeclass-uid")}}) tag.AddSleeveObjectID(nc) @@ -34,6 +34,10 @@ func newScenarioObjects() ([]client.Object, error) { np := test.NodePool(v1.NodePool{ObjectMeta: metav1.ObjectMeta{Name: "default", UID: types.UID("nodepool-uid")}}) tag.AddSleeveObjectID(np) + return []client.Object{nc, np}, nil +} + +func newPendingPod() *corev1.Pod { // Provisionable Pod (PodScheduled=False, Reason=Unschedulable) pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{Name: "pending", Namespace: "default", UID: types.UID("pod-uid")}, @@ -45,27 +49,30 @@ func newScenarioObjects() ([]client.Object, error) { }}}, } tag.AddSleeveObjectID(pod) + return pod +} - return []client.Object{nc, np, pod}, nil +func newInitialUserActions() []tracecheck.UserAction { + return []tracecheck.UserAction{ + { + ID: "create-pending-pod", + OpType: event.CREATE, + Payload: newPendingPod(), + }, + } } func buildInitialKarpenterState(builder *tracecheck.ExplorerBuilder) tracecheck.StateNode { stateBuilder := builder.NewStateEventBuilder() - objs, _ := newScenarioObjects() + objs, _ := newEnvironmentObjects() nc := objs[0] np := objs[1] - pod := objs[2] - // Trigger pod-related controllers at start. - // NOTE: We explicitly enqueue the provisioner once to simulate the singleton reconcile loop - // firing at least once in the DFS. This approximates the real ticker-driven trigger. 
- podState := stateBuilder.AddTopLevelObject(pod, "state.pod", "provisioner.trigger.pod", "provisioner") poolState := stateBuilder.AddTopLevelObject(np, "state.nodepool") classState := stateBuilder.AddTopLevelObject(nc) - state := tracecheck.MergeStateNodes(podState, poolState) - return tracecheck.MergeStateNodes(state, classState) + return tracecheck.MergeStateNodes(poolState, classState) } func scenariosFromInputs(builder *tracecheck.ExplorerBuilder, inputs []coverage.Input) ([]explore.Scenario, error) { @@ -129,7 +136,7 @@ func expandKarpenterParameterizedInput(input coverage.Input, fuzzCases int, fuzz base.Name = baseName + "/base" podIdx := findKarpenterPodInUserInputs(base.UserInputs) - nodePoolIdx := findKarpenterNodePoolInUserInputs(base.UserInputs) + nodePoolIdx := findKarpenterNodePool(base.EnvironmentState.Objects) if podIdx < 0 || nodePoolIdx < 0 { return []coverage.Input{base}, nil } @@ -138,7 +145,8 @@ func expandKarpenterParameterizedInput(input coverage.Input, fuzzCases int, fuzz if err != nil { return nil, err } - templateNodePool, err := unstructuredToNodePool(base.UserInputs[nodePoolIdx].Object) + nodePoolObject := base.EnvironmentState.Objects[nodePoolIdx] + templateNodePool, err := unstructuredToNodePool(nodePoolObject) if err != nil { return nil, err } @@ -286,7 +294,10 @@ func buildKarpenterVariantInput(input coverage.Input, podIdx int, nodePoolIdx in return coverage.Input{}, fmt.Errorf("convert nodepool for %q: %w", name, err) } updated.UserInputs[podIdx].Object = podObj - updated.UserInputs[nodePoolIdx].Object = nodePoolObj + if nodePoolIdx < 0 || nodePoolIdx >= len(updated.EnvironmentState.Objects) { + return coverage.Input{}, fmt.Errorf("nodepool environment index %d out of range", nodePoolIdx) + } + updated.EnvironmentState.Objects[nodePoolIdx] = nodePoolObj return updated, nil } @@ -364,8 +375,17 @@ func buildStateFromCoverageInput(builder *tracecheck.ExplorerBuilder, input cove } objects = append(objects, obj.DeepCopy()) } + for _, 
userInput := range input.UserInputs { + if userInput.Type != event.CREATE || userInput.Object == nil || !isKarpenterPod(userInput.Object) { + continue + } + if isInputObjectSeeded(userInput.Object, objects) { + continue + } + objects = append(objects, userInput.Object.DeepCopy()) + } if len(objects) == 0 { - fallback, err := newScenarioObjects() + fallback, err := newEnvironmentObjects() if err != nil { return tracecheck.StateNode{}, nil, err } @@ -375,14 +395,79 @@ func buildStateFromCoverageInput(builder *tracecheck.ExplorerBuilder, input cove return tracecheck.StateNode{}, nil, fmt.Errorf("input has no objects") } - pending := make([]tracecheck.PendingReconcile, 0) - state, err := builder.BuildStartStateFromObjects(objects, pending) + state, err := buildStateFromObjects(builder, objects) if err != nil { return tracecheck.StateNode{}, nil, err } return state, objects, nil } +func buildStateFromObjects(builder *tracecheck.ExplorerBuilder, objects []client.Object) (tracecheck.StateNode, error) { + if builder == nil { + return tracecheck.StateNode{}, fmt.Errorf("builder is nil") + } + if len(objects) == 0 { + return tracecheck.StateNode{}, fmt.Errorf("no objects supplied") + } + + stateBuilder := builder.NewStateEventBuilder() + ordered := orderInitialStateObjects(objects) + + var ( + state tracecheck.StateNode + seeded bool + ) + for _, obj := range ordered { + if obj == nil { + continue + } + next := stateBuilder.AddTopLevelObject(obj, initialDependentControllers(obj)...) 
+ if !seeded { + state = next + seeded = true + continue + } + state = tracecheck.MergeStateNodes(state, next) + } + if !seeded { + return tracecheck.StateNode{}, fmt.Errorf("no non-nil objects supplied") + } + return state, nil +} + +func orderInitialStateObjects(objects []client.Object) []client.Object { + ordered := make([]client.Object, 0, len(objects)) + + appendKind := func(match func(client.Object) bool) { + for _, obj := range objects { + if obj == nil || !match(obj) { + continue + } + ordered = append(ordered, obj) + } + } + + appendKind(isKarpenterPod) + appendKind(isKarpenterNodePool) + appendKind(func(obj client.Object) bool { + return !isKarpenterPod(obj) && !isKarpenterNodePool(obj) + }) + return ordered +} + +func initialDependentControllers(obj client.Object) []tracecheck.ReconcilerID { + if obj == nil { + return nil + } + if isKarpenterPod(obj) { + return []tracecheck.ReconcilerID{"state.pod", "provisioner.trigger.pod", "provisioner"} + } + if isKarpenterNodePool(obj) { + return []tracecheck.ReconcilerID{"state.nodepool"} + } + return nil +} + func applyInputTuning(base tracecheck.ExploreConfig, tuning coverage.InputTuning) tracecheck.ExploreConfig { cfg := base.Clone() if tuning.MaxDepth > 0 { @@ -432,6 +517,10 @@ func buildUserActionsFromCoverageInput(input coverage.Input, seededObjects []cli } opType := userInput.Type if opType == event.CREATE && isInputObjectSeeded(userInput.Object, seededObjects) { + if isKarpenterPod(userInput.Object) { + // Seed pod creates into the initial state so Karpenter provisioning runs before pod lifecycle scheduling. 
+ continue + } opType = event.UPDATE } actions = append(actions, tracecheck.UserAction{ @@ -513,18 +602,6 @@ func cloneIntMap(in map[string]int) map[string]int { return out } -func findKarpenterPod(objects []*unstructured.Unstructured) int { - for idx, obj := range objects { - if obj == nil { - continue - } - if isKarpenterPod(obj) { - return idx - } - } - return -1 -} - func findKarpenterPodInUserInputs(userInputs []coverage.UserInput) int { for idx, input := range userInputs { if input.Object == nil { @@ -537,18 +614,6 @@ func findKarpenterPodInUserInputs(userInputs []coverage.UserInput) int { return -1 } -func findKarpenterNodePoolInUserInputs(userInputs []coverage.UserInput) int { - for idx, input := range userInputs { - if input.Object == nil { - continue - } - if isKarpenterNodePool(input.Object) { - return idx - } - } - return -1 -} - func findKarpenterNodePool(objects []*unstructured.Unstructured) int { for idx, obj := range objects { if obj == nil { diff --git a/examples/karpenter/scenario_test.go b/examples/karpenter/scenario_test.go index 82eb2b8..d46c213 100644 --- a/examples/karpenter/scenario_test.go +++ b/examples/karpenter/scenario_test.go @@ -5,6 +5,7 @@ import ( "testing" "github.com/tgoodwin/kamera/pkg/coverage" + "github.com/tgoodwin/kamera/pkg/event" "github.com/tgoodwin/kamera/pkg/explore" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" @@ -99,11 +100,11 @@ func TestExpandKarpenterParameterizedInputAddsNoFitNodeSelectorVariant(t *testin if variant.Name != target { continue } - podIdx := findKarpenterPod(variant.EnvironmentState.Objects) + podIdx := findKarpenterPodInUserInputs(variant.UserInputs) if podIdx < 0 { - t.Fatalf("variant %q missing pod object", target) + t.Fatalf("variant %q missing pod user input", target) } - pod, err := unstructuredToPod(variant.EnvironmentState.Objects[podIdx]) + pod, err := unstructuredToPod(variant.UserInputs[podIdx].Object) if err != nil { t.Fatalf("convert pod variant: %v", 
err) } @@ -116,27 +117,104 @@ func TestExpandKarpenterParameterizedInputAddsNoFitNodeSelectorVariant(t *testin t.Fatalf("expected variant %q, got names=%v", target, coverageInputNames(variants)) } +func TestExpandKarpenterParameterizedInputAddsNoFitNodePoolRequirementVariant(t *testing.T) { + input := mustKarpenterInput(t, "karpenter-params") + variants, err := expandKarpenterParameterizedInput(input, 0, 99) + if err != nil { + t.Fatalf("expandKarpenterParameterizedInput error = %v", err) + } + + target := "karpenter-params/single/no-fit-nodepool-requirement-unmatched" + for _, variant := range variants { + if variant.Name != target { + continue + } + nodePoolIdx := findKarpenterNodePool(variant.EnvironmentState.Objects) + if nodePoolIdx < 0 { + t.Fatalf("variant %q missing nodepool environment object", target) + } + nodePool, err := unstructuredToNodePool(variant.EnvironmentState.Objects[nodePoolIdx]) + if err != nil { + t.Fatalf("convert nodepool variant: %v", err) + } + for _, req := range nodePool.Spec.Template.Spec.Requirements { + if req.Key == "karpenter.sh/nonexistent-capability" { + return + } + } + t.Fatalf("expected unmatched requirement in %q, got requirements=%v", target, nodePool.Spec.Template.Spec.Requirements) + } + + t.Fatalf("expected variant %q, got names=%v", target, coverageInputNames(variants)) +} + +func TestBuildStateFromCoverageInputSeedsNodePoolPendingReconcile(t *testing.T) { + builder := newKarpenterExplorerBuilder() + input := mustKarpenterInput(t, "karpenter-state-seed") + + state, _, err := buildStateFromCoverageInput(builder, input) + if err != nil { + t.Fatalf("buildStateFromCoverageInput error = %v", err) + } + + for _, pending := range state.PendingReconciles { + if pending.ReconcilerID == "state.nodepool" && pending.Request.Name == "default" { + return + } + } + + t.Fatalf("expected state.nodepool pending reconcile for default NodePool, got pending=%v", state.PendingReconciles) +} + +func 
TestBuildUserActionsFromCoverageInputSkipsSeededPodCreate(t *testing.T) { + builder := newKarpenterExplorerBuilder() + input := mustKarpenterInput(t, "karpenter-user-actions") + + _, seeded, err := buildStateFromCoverageInput(builder, input) + if err != nil { + t.Fatalf("buildStateFromCoverageInput error = %v", err) + } + actions, err := buildUserActionsFromCoverageInput(input, seeded) + if err != nil { + t.Fatalf("buildUserActionsFromCoverageInput error = %v", err) + } + if len(actions) != 0 { + t.Fatalf("expected no user actions after seeding pod create into initial state, got %d actions: %#v", len(actions), actions) + } +} + func mustKarpenterInput(t *testing.T, name string) coverage.Input { t.Helper() - objs, err := newScenarioObjects() + envObjs, err := newEnvironmentObjects() if err != nil { - t.Fatalf("newScenarioObjects: %v", err) + t.Fatalf("newEnvironmentObjects: %v", err) } - objects := make([]*unstructured.Unstructured, 0, len(objs)) - for _, obj := range objs { + objects := make([]*unstructured.Unstructured, 0, len(envObjs)) + for _, obj := range envObjs { u, err := objectToUnstructured(obj) if err != nil { t.Fatalf("convert object %T: %v", obj, err) } objects = append(objects, u) } + podObj, err := objectToUnstructured(newPendingPod()) + if err != nil { + t.Fatalf("convert pod object: %v", err) + } return coverage.Input{ Name: name, EnvironmentState: coverage.EnvironmentState{ Objects: objects, }, + UserInputs: []coverage.UserInput{ + { + ID: "create-pending-pod", + Type: event.CREATE, + Object: podObj, + }, + }, } } diff --git a/examples/karpenter/smoke_test.go b/examples/karpenter/smoke_test.go index 85b2cfb..bf04ca8 100644 --- a/examples/karpenter/smoke_test.go +++ b/examples/karpenter/smoke_test.go @@ -3,7 +3,7 @@ package main import "testing" func TestScenarioBuilds(t *testing.T) { - if _, err := newScenarioObjects(); err != nil { - t.Fatalf("expected scenario objects to build: %v", err) + if _, err := newEnvironmentObjects(); err != nil { + 
t.Fatalf("expected environment objects to build: %v", err) } } From db80faeb238844dc24abd5a8eaaa10f58f2038e0 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Wed, 25 Feb 2026 18:00:54 -0800 Subject: [PATCH 23/29] update crossplane harness to follow user action input flow --- .beads/issues.jsonl | 8 +- examples/crossplane/go.mod | 32 +-- examples/crossplane/go.sum | 61 ++-- examples/crossplane/scenario.go | 166 ++++++++++- examples/crossplane/scenario_test.go | 148 +++++++++- examples/crossplane/two-step-workflow.json | 306 +++++++++++++++++++++ 6 files changed, 658 insertions(+), 63 deletions(-) create mode 100644 examples/crossplane/two-step-workflow.json diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index bb86bfc..e7b856a 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -1,4 +1,4 @@ -{"id":"kamera-07e","title":"issue","description":"Debug Karpenter example\n","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-23T18:58:07.650966-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-23T18:58:31.451848-08:00"} +{"id":"kamera-07e","title":"issue","description":"Debug Karpenter example\n","status":"closed","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-23T18:58:07.650966-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-25T17:43:56.737489-08:00","closed_at":"2026-02-25T17:43:56.737489-08:00","close_reason":"Closed"} {"id":"kamera-0cl","title":"audit Apply semantics","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-22T17:53:50.031508-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-22T17:53:50.031508-08:00"} {"id":"kamera-0lr","title":"Expand Karpenter Harness Input Workflows","description":"Apply a Knative-style multi-workflow input expansion to the Karpenter example harness, starting with small deterministic workflow 
variants.","status":"closed","priority":2,"issue_type":"epic","owner":"timg.goodwin@gmail.com","created_at":"2026-02-17T19:33:05.975494-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-17T19:47:52.010981-08:00","closed_at":"2026-02-17T19:47:52.010981-08:00","close_reason":"Closed"} {"id":"kamera-0lr.1","title":"Add pod constraint variants to Karpenter harness","description":"Implement workflow variants that mutate pod requests/selectors/affinity-like constraints to broaden provisioning paths while keeping scenarios deterministic.","status":"closed","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-17T19:33:16.273655-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-17T19:47:52.006845-08:00","closed_at":"2026-02-17T19:47:52.006845-08:00","close_reason":"Closed","dependencies":[{"issue_id":"kamera-0lr.1","depends_on_id":"kamera-0lr","type":"parent-child","created_at":"2026-02-17T19:33:16.274981-08:00","created_by":"Tim Goodwin"}]} @@ -6,8 +6,10 @@ {"id":"kamera-189","title":"Move dump types from pkg/interactive to pkg/analysis","description":"**Prerequisite for backward-trace framework (kamera-7tl)**\n\nMove dump-related types from pkg/interactive to pkg/analysis:\n\n## Types to Move\n- DumpOutput\n- ConvergedStateDump \n- PathStep\n- ObjectVersion\n- Related helper types\n\n## Rationale\npkg/interactive should be purely about powering the TUI. The dump types are analysis data structures that belong in pkg/analysis.\n\n## Changes Required\n1. Create pkg/analysis/types.go with moved types\n2. Create pkg/analysis/dump.go with dump loading utilities\n3. Update pkg/interactive to import from pkg/analysis\n4. 
Update any other code that imports these types","status":"closed","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T14:08:11.201209-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T15:58:30.284307-08:00","closed_at":"2026-01-14T15:58:30.284307-08:00","close_reason":"Implemented in commits 6fae4a5 through a5a4a0a"} {"id":"kamera-1r1","title":"Implement checkpoint scanner","description":"Implement a checkpoint scanner that takes a baseline trace plus []PerturbationProfile and emits checkpoint candidates (checkpoint state + profile + context).\n\nScanner behavior:\n- Ordering profiles: identify states where profiled controllers are simultaneously pending (or otherwise form a meaningful reorder fork).\n- Staleness profiles: identify steps where profiled reads can be served from concrete prior versions observed in the same baseline trace.\n- A single profile may produce multiple checkpoints per trace.\n\nRequirements:\n- Consume the trace contract formalized in kamera-fv4.\n- Make predicate logic explicit and testable for edge cases (no-op steps, repeated mutations, irrelevant profiles).\n","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-11T22:06:59.145429-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-12T14:19:49.260661-08:00","dependencies":[{"issue_id":"kamera-1r1","depends_on_id":"kamera-kse","type":"blocks","created_at":"2026-02-11T22:07:30.15809-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-1r1","depends_on_id":"kamera-fv4","type":"blocks","created_at":"2026-02-11T22:07:30.170662-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-1r1","depends_on_id":"kamera-yqn","type":"parent-child","created_at":"2026-02-11T22:08:33.726299-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-25g","title":"closed-loop: fork reference version store into restart/rerun phases","description":"## Context\nClosed-loop restart/rerun execution hit a state-store 
mismatch when history prefixing was enabled.\n\nObserved behavior before the fix:\n- Reference run completed and wrote a dump.\n- Rerun often failed to write a dump (especially in `--parallel-processes` mode) with:\n `unable to resolve object for hash ...`\n\nRepro command used:\n`go run . --inputs /tmp/knative-one.json --dump-output /tmp/knative-dumps --depth 100 --timeout 60s --dump-stats /tmp/knative-stats`\n\nWhat was happening:\n- Rerun phases used `HistoryPrefix` from reference execution.\n- Prefix steps referenced historical object hashes that existed in the reference run's version store.\n- The rerun phase store initially contained only seeded checkpoint/current state, not full prefix history hashes.\n- Dump writing resolves hashes for full path history and failed on missing prefix hashes.\n\nCurrent stopgap behavior:\n- We added priming logic that materializes referenced prefix hashes into rerun phase stores so dumps succeed.\n- This fixes the immediate failure but preserves an indirect coupling path.\n\n## Proposed Fix (Option 1)\nAdopt an explicit \"store fork\" model for restart/rerun phases:\n- Build rerun phases from a cloned/forked snapshot of the reference run version store.\n- Treat restart runs as a child continuation of the reference store lineage, instead of reconstructing needed hashes ad hoc.\n- Keep `RestartSeed`/`HistoryPrefix` semantics unchanged, but make backing object resolution guaranteed by construction.\n\nWhy this is cleaner:\n- Removes implicit dependency on post-hoc hash priming.\n- Makes state lineage explicit and easier to reason about.\n- Reduces risk of future missing-hash failures when prefix behavior evolves.\n\n## Scope\n- Introduce a first-class store fork/clone handoff at reference -\u003e rerun boundary.\n- Use forked store for all prefixed restart phases.\n- Retain existing closed-loop v0 behavior and APIs.\n- Keep output artifacts unchanged (reference/rerun dumps + stats).\n\n## Acceptance Criteria\n- Repro command 
above consistently writes both `reference` and `rerun` dumps.\n- `--parallel-processes` mode also writes both dumps without hash resolution failures.\n- `RestartSeed` + `HistoryPrefix` path exercises continue to work with no priming-specific code path required.\n- Regression test(s) cover prefixed rerun dumpability under forked-store design.\n- No external CLI/API changes required for users.\n\n## Notes\nThis task is architectural cleanup/simplification after validating closed-loop v0 behavior.\n","status":"open","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T09:40:35.605268-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-20T09:40:35.605268-08:00","labels":["analysis","closed-loop","restart","trace-store"]} +{"id":"kamera-2o2","title":"Investigate subtree completion soundness regression in Crossplane example","description":"## Problem\nThe Crossplane example currently behaves differently based on optimization settings: it only produces useful exploration dumps when `subtreeCompletion` is disabled. This implies a soundness issue in subtree completion / diamond pruning behavior, because example correctness should not depend on this optimization being on/off.\n\n## Context From Current Investigation\n- Running the Crossplane example with default optimizations produced only user-action steps and no controller-progress dumps.\n- In this mode, phase dump writing saw no converged/aborted states (`res.ConvergedStates + res.AbortedStates == 0`).\n- Stats observed in this mode included `SubtreeDiamondSkips: 1`, `AbortedPaths: 0`, and visited depths only `0..3`.\n- When run with explore config:\n ```json\n {\n \"optimizations\": {\n \"subtreeCompletion\": false\n }\n }\n ```\n dumps appeared reliably and exploration progressed as expected.\n\n## Why This Is P0\nOptimization toggles should not change soundness. 
If subtree completion can suppress terminal-state recording or truncate exploration in valid runs, results are not trustworthy.\n\n## Scope\nUse the Crossplane example as the primary reproducer to deeply understand root cause in subtree completion logic, state dedupe/diamond detection, and terminal-state accounting.\n\n## Repro Starting Point\n- Example: `examples/crossplane`\n- Compare behavior with subtree completion enabled vs disabled via explore config.\n- Inspect `pkg/explore/parallel_runner.go` and tracecheck optimization paths tied to subtree completion/diamond skipping.\n\n## Acceptance Criteria\n- Root cause identified and documented (not just symptom).\n- Reproducer captured in issue notes with exact command/config.\n- Clear fix plan (or fix) that preserves soundness while retaining optimization benefits where valid.","status":"open","priority":0,"issue_type":"bug","owner":"timg.goodwin@gmail.com","created_at":"2026-02-25T17:43:53.172152-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-25T17:43:53.172152-08:00","labels":["crossplane","subtree-completion","tracecheck"]} {"id":"kamera-2wc","title":"Translate hotspot instances into scenario seeds","description":"Define rules to map hotspot instances (controllers/resources) into initial StateNode objects and ExploreOptions (permutation/stale scopes), including GVK-\u003eobject templates.","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-27T17:16:24.806448-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-27T17:16:24.806448-08:00"} {"id":"kamera-369","title":"DAG divergence analysis should consider pending reconcilers, not just ContentsHash","description":"Current LCA-based divergence detection uses ContentsHash which only considers objects. But paths can 'fan in' to same ContentsHash with different pending reconcilers, masking the true divergence point. 
Trial-1 shows: at 20nh30fc, State 0 has no EndpointsController pending while State 1 does - this is the real cause of divergence, not the ordering at 20nh30fc.","status":"open","priority":1,"issue_type":"bug","owner":"timg.goodwin@gmail.com","created_at":"2026-01-13T22:33:13.902039-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-13T22:33:13.902039-08:00","dependencies":[{"issue_id":"kamera-369","depends_on_id":"kamera-xw8","type":"blocks","created_at":"2026-01-13T22:43:57.7137-08:00","created_by":"Tim Goodwin"}],"comments":[{"id":1,"issue_id":"kamera-369","author":"Tim Goodwin","text":"Analysis of trial-1 shows the issue clearly:\n\n**Paths reach same ContentsHash (20nh30fc) with different pending lists:**\n- State 0: EndpointsController ran at step 33, not re-enqueued by step 39\n- State 1: EndpointsController ran at 30-31, but ServerlessServiceReconciler at step 33 re-enqueued it\n\n**Root cause pattern:**\n1. Different execution order causes different 'trigger' events\n2. Same objects reached, but different pending reconcilers\n3. DAG collapses these as same node (same ContentsHash)\n4. Actual divergence is masked - appears as ordering choice but is actually about what's pending\n\n**Implication:**\nThe divergence point in ContentsHash-based DAG is misleading. True divergence happens earlier when execution order affected which reconcilers got enqueued. Need full StateHash (objects + pending) for accurate divergence detection.","created_at":"2026-01-14T06:34:10Z"},{"id":2,"issue_id":"kamera-369","author":"Tim Goodwin","text":"This issue (pending-aware divergence) is related to but distinct from kamera-xw8 (field-level tracing). 
\n\nkamera-xw8 may be the better approach - rather than trying to build a perfect DAG that captures 'true' divergence, trace backwards from the actual differences in terminal states to find when they became inevitable.","created_at":"2026-01-14T06:43:57Z"}]} +{"id":"kamera-3d7","title":"Investigate Crossplane non-idempotency causing unbounded path growth","description":"## Problem\nCrossplane exploration appears non-idempotent and does not converge under current input/setup, with paths growing until depth cap abort.\n\n## Context From Current Investigation\n- At depth 20, observed path length 21 (aborted at cap).\n- At depth 100, observed path length 101.\n- At depth 500, observed path length 501.\n- This strongly suggests repeated non-idempotent reconcile behavior (or equivalent no-op writes triggering further work).\n\n## Requested Investigation\nPerform focused analysis at depth 50 to keep artifacts manageable while preserving signal.\n\n## Required Work\n1. Inspect Crossplane example inputs (externalized workflow JSON) to identify resource fields/status that may drive repeated updates.\n2. Inspect Crossplane source/controller behavior to determine which reconcile path is repeatedly mutating state or enqueueing work.\n3. 
Correlate dump steps with controller actions and object deltas to identify the specific non-idempotent write pattern.\n\n## Why This Is P0\nIf controller activity is effectively non-idempotent in exploration, coverage/path metrics and convergence-based reasoning are unreliable.\n\n## Acceptance Criteria\n- Concrete root-cause hypothesis tied to specific controller/object field(s).\n- Evidence from depth-50 run (step/delta references).\n- Recommended mitigation path (example/input adjustment, model correction, or controller-behavior handling in harness).","status":"open","priority":0,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-25T17:44:01.07775-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-25T17:44:01.07775-08:00","labels":["crossplane","nonidempotency","tracecheck"]} {"id":"kamera-3hr","title":"Restructure CompositeKey/dump serialization to preserve Kind/Group without post-processing","description":"## Problem\n\nCompositeKey embeds both IdentityKey and ResourceKey, which both have Kind and Group fields. Go's encoding/json silently drops duplicate field names when serializing embedded structs, so only ObjectID, Namespace, and Name were being serialized.\n\nWe added workarounds:\n1. Custom MarshalJSON/UnmarshalJSON with explicit field names\n2. Backwards-compatible legacy format parsing \n3. Post-load enrichment from object data (extracting Kind from apiVersion/kind fields)\n\nThis is fragile and complex.\n\n## Proposed Solutions\n\nConsider restructuring upstream of dump serialization:\n\n1. **Don't embed structs in CompositeKey** - use named fields instead:\n ```go\n type CompositeKey struct {\n Identity IdentityKey\n Resource ResourceKey\n }\n ```\n This would require updating all call sites that access `key.Kind` to `key.Resource.Kind`.\n\n2. 
**Add JSON tags to embedded structs** - prefix field names to avoid collision:\n ```go\n type IdentityKey struct {\n Group string `json:\"identityGroup\"`\n Kind string `json:\"identityKind\"`\n ObjectID string `json:\"objectId\"`\n }\n ```\n But this changes the JSON format everywhere IdentityKey is used.\n\n3. **Consolidate Kind/Group into one location** - maybe CompositeKey shouldn't have Kind in both embedded structs. Review why both IdentityKey and ResourceKey need Kind.\n\n## Acceptance Criteria\n\n- [ ] CompositeKey serializes/deserializes correctly without custom Marshal/Unmarshal methods\n- [ ] No post-load enrichment needed\n- [ ] Backwards compatibility with existing dump files (migration path)","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T16:58:25.962755-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T16:58:25.962755-08:00"} {"id":"kamera-4sr","title":"Find RevisionReconciler source in gomodcache","description":"Locate Knative serving RevisionReconciler code in ~/tmp/gomodcache to understand Active condition logic","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-13T21:30:33.710132-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-13T21:36:51.310185-08:00","closed_at":"2026-01-13T21:36:51.310185-08:00","close_reason":"Found RevisionReconciler source in knative.dev/serving@v0.46.5/pkg/reconciler/revision/. Key files: revision.go (main reconciler), reconcile_resources.go (reconcileDeployment, reconcilePA), revision_lifecycle.go (condition management). Active condition is set by PropagateAutoscalerStatus in reconcilePA."} {"id":"kamera-53o","title":"Add Watch for EndpointsController to trigger on Pod changes","description":"Root cause of Endpoints divergence: EndpointsController is not triggered when Pods change (e.g., become Ready). In real K8s, EndpointsController uses Watches(\u0026Pod{}, mapPodToServices). 
In kamera, WithResourceDepGK(Pod, EndpointsController) only declares deps for stuck detection, NOT triggering. Fix: Add .Watches('Pod', mapPodToServices) to EndpointsController registration in explorebuilder.go where mapPodToServices enqueues all Services in the Pod's namespace.","status":"open","priority":0,"issue_type":"bug","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T12:34:36.185422-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T12:34:36.185422-08:00"} @@ -37,7 +39,7 @@ {"id":"kamera-eja","title":"Last write analysis for diverging objects","description":"**Module 1 of backward-trace divergence analysis framework (kamera-7tl)**\n\n## Purpose\nAnswer: 'What did the reconciler see when it wrote this object's final value?'\n\n## Input\n- dump.jsonl file path\n- List of ObjectDiff from Module 0 (or manually specified object keys)\n\n## Algorithm (per differing object, per path)\n1. Get the final hash for this object in this path's converged state\n2. Walk backwards through the path's steps\n3. Find the step where this object's hash in stateAfter first equals the final hash\n4. That step's controllerId is the 'last writer'\n5. 
Capture stateBefore from that step as 'what the reconciler saw'\n\n## Output Struct\n```go\ntype LastWriteAnalysis struct {\n Object ObjectKey\n ByPath []PathLastWrite\n}\n\ntype PathLastWrite struct {\n PathIndex int\n StateID string\n FinalHash VersionHash\n LastWriteStep LastWriteStep\n}\n\ntype LastWriteStep struct {\n StepIndex int\n ControllerId string\n StateBefore []ObjectVersion // what reconciler saw\n}\n```\n\n## Human-readable Output Example\n```\nEndpoints/kamera-test/kamera-test:\n\n Path 0 (→ state pdu8xy8e):\n Last write: step 25 by EndpointsController\n Reconciler saw: Pod.status.phase=Running, Pod.Ready=false\n\n Path 1 (→ state 15wjkorq):\n Last write: step 40 by EndpointsController \n Reconciler saw: Pod.status.phase=Running, Pod.Ready=true\n```\n\n## Key Feature\nHighlight the DIFF between what reconciler saw across paths, focusing attention on why outputs differed.\n\n## Location\npkg/analysis/lastwrite.go","status":"closed","priority":2,"issue_type":"feature","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T14:06:48.315761-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T15:58:30.287386-08:00","closed_at":"2026-01-14T15:58:30.287386-08:00","close_reason":"Implemented in commits 6fae4a5 through a5a4a0a","dependencies":[{"issue_id":"kamera-eja","depends_on_id":"kamera-189","type":"blocks","created_at":"2026-01-14T14:08:15.391857-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-eo8","title":"Improve dump.jsonl effects with type and key fields","description":"Improve dump.jsonl effects field structure for easier parsing:\n- Effects have OpType, Key, Version but analysis scripts expected lowercase 'type', 'key'\n- Document the actual field names in dump format\n- Consider adding convenience fields like 'kind' directly in Key\n- Add object kind to effect entries for quick filtering without hash lookup\n\nRelated: kamera-pwm (converged state diff), kamera-bl9 (divergence 
finder)","status":"open","priority":3,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T11:36:38.78133-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T12:28:01.28713-08:00"} {"id":"kamera-f70","title":"Infer PerturbationProfiles directly from graph patterns","description":"Revise static analysis output so graph pattern detection produces []PerturbationProfile directly (not []HotspotInstance).\n\nRequirements:\n1) consume the explicit graph contract (edge semantics + canonical IDs),\n2) avoid assuming legacy \"reconciles\" edges are always present; use the contract's trigger semantics,\n3) preserve existing pattern classes (multi-writer, missing trigger, diamond, reducer, feedback cycle),\n4) emit profiles that are directly usable by baseline-scan-branch orchestration,\n5) update cmd/generate output accordingly.\n\nThe HotspotInstance -\u003e TranslateHotspots -\u003e coverage.Input translation path is superseded.\n","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-11T22:06:59.140368-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-12T14:20:09.826674-08:00","dependencies":[{"issue_id":"kamera-f70","depends_on_id":"kamera-kse","type":"blocks","created_at":"2026-02-11T22:07:30.187388-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-f70","depends_on_id":"kamera-yqn","type":"parent-child","created_at":"2026-02-11T22:08:37.571304-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-f70","depends_on_id":"kamera-yqn.1","type":"blocks","created_at":"2026-02-12T14:21:30.450738-08:00","created_by":"Tim Goodwin"}]} -{"id":"kamera-flx","title":"examples/crossplane go test fails due to controller-runtime signature mismatch","description":"Running 'go test ./...' in examples/crossplane fails to build crossplane/internal/engine: engine.go:304:31 cannot use kcontroller.NewUnmanaged as NewControllerFn. This blocks verifying test failure reason for reconciler kind mapping. 
Repro: cd examples/crossplane \u0026\u0026 go test ./...","status":"open","priority":2,"issue_type":"bug","owner":"timg.goodwin@gmail.com","created_at":"2026-02-01T16:31:57.094194-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-01T16:31:57.094194-08:00"} +{"id":"kamera-flx","title":"examples/crossplane go test fails due to controller-runtime signature mismatch","description":"Running 'go test ./...' in examples/crossplane fails to build crossplane/internal/engine: engine.go:304:31 cannot use kcontroller.NewUnmanaged as NewControllerFn. This blocks verifying test failure reason for reconciler kind mapping. Repro: cd examples/crossplane \u0026\u0026 go test ./...","status":"closed","priority":2,"issue_type":"bug","owner":"timg.goodwin@gmail.com","created_at":"2026-02-01T16:31:57.094194-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-25T17:43:38.168496-08:00","closed_at":"2026-02-25T17:43:38.168496-08:00","close_reason":"Closed"} {"id":"kamera-fv4","title":"Baseline trace contract audit and targeted gaps","description":"Audit and formalize the baseline trace contract required by checkpoint scanning.\n\nCurrent code already captures rich per-step data (state before/after, effects, pending set, mutation history). 
This task should:\n1) document exactly which existing fields are available and sufficient for scanner predicates,\n2) define the minimal trace contract the scanner depends on,\n3) add trace fields only if a concrete scanner requirement is truly missing,\n4) add focused tests to prevent regressions in trace contents relied upon by scanner logic.\n\nGoal: avoid redoing tracing infrastructure; target only real gaps.\n","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-11T22:06:59.152113-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-12T14:19:38.975464-08:00","dependencies":[{"issue_id":"kamera-fv4","depends_on_id":"kamera-yqn","type":"parent-child","created_at":"2026-02-11T22:08:37.480705-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-g4j","title":"Refactor stale view branching to use subtree completion optimization","description":"## Context\n\nIn `pkg/tracecheck/explore.go`, there are two types of state branching:\n\n1. **Order branching** (lines 956-1030): Uses subtree completion optimization\n2. **Stale view branching** (lines 763-1031): Currently inline, no subtree optimization\n\n## Current Behavior\n\n**Order branching** works well:\n- `expandStateByReconcileOrder` generates ordering variants\n- All variants pushed onto stack with a marker via `enqueueStates`\n- Marker pops when entire subtree is done → marks logical state as completed\n- Future encounters of same logical state skip entirely\n\n**Stale view branching** is inline:\n```go\nfor _, stateView := range possibleViews {\n // ... reconcile step for each view\n // ... enqueue resulting states\n}\n```\nThis processes views sequentially in the same stack frame, missing the optimization.\n\n## Proposed Change\n\nRefactor stale view expansion to follow the order branching pattern:\n\n1. Instead of inline loop, push all `possibleViews` onto stack with a marker\n2. Each view becomes a separate stack entry processed in DFS order\n3. 
When marker pops, mark the staleness branch as completed\n4. Future encounters of same staleness expansion can skip\n\n## Key Insight\n\nThe stack's LIFO behavior with markers lets us detect subtree completion:\n```\nPush: [Marker] [View3] [View2] [View1]\nPop order: View1 → View2 → View3 → Marker (signals completion)\n```\n\n## Considerations\n\n- Need a `LogicalStateKey` equivalent for stale view branches\n- May need to combine with order branching (stale views × orderings)\n- Ensure no regression in exploration completeness","status":"closed","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-15T14:33:08.377019-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-12T09:21:58.606606-08:00","closed_at":"2026-01-20T15:13:22.297077-08:00"} {"id":"kamera-gwc","title":"Create object lifecycle tracker for dump analysis","description":"**Module 2 of backward-trace divergence analysis framework (kamera-7tl)**\n\n## Purpose\nAnswer: 'Does this object take on this value at some point in this path?'\n\n## Input\n- dump.jsonl file path\n- Path index to analyze\n- Object key + target hash\n\n## Algorithm\n1. Walk through all steps in the specified path\n2. 
Record all steps where the object has the target hash in stateAfter\n\n## Output Struct\n```go\ntype ObjectLifecycleResult struct {\n Object ObjectKey\n TargetHash VersionHash\n PathIndex int\n Appearances []StepInfo // all steps where object had this hash (empty if never)\n}\n\ntype StepInfo struct {\n StepIndex int\n ControllerId string // who produced this state\n}\n```\n\n## Human-readable Output Example\n```\nObject: Pod/kamera-test/kamera-test-xyz\nHash: abc123 (Ready=true)\nPath: 0\n\nAppearances:\n - step 38: PodLifecycleController\n - step 45: PodLifecycleController (still has this value)\n```\n\n## Diagnostic Value\nCombined with Module 1: if the 'missing' state appears later in the path but the reconciler didn't write again, it points to a watch/trigger issue.\n\n## Location\npkg/analysis/lifecycle.go","status":"closed","priority":3,"issue_type":"feature","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T12:27:34.707133-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T15:58:30.288649-08:00","closed_at":"2026-01-14T15:58:30.288649-08:00","close_reason":"Implemented in commits 6fae4a5 through a5a4a0a","dependencies":[{"issue_id":"kamera-gwc","depends_on_id":"kamera-189","type":"blocks","created_at":"2026-01-14T14:08:15.465183-08:00","created_by":"Tim Goodwin"}]} @@ -74,5 +76,5 @@ {"id":"kamera-yqn.2","title":"Author LLM workflow for dependency-graph/schema-map generation","description":"Produce extremely explicit instructions/prompts for LLM-assisted analysis of controller codebases (Knative, Crossplane, Kratix, Karpenter) to generate graph artifacts that satisfy the v2 contract. Include: required evidence format from source code, edge extraction rules, role annotation rules, ambiguity handling, and mandatory self-checks. 
Also add a verification checklist against local code (including spot checks against controller setup/watches/reads/writes and schema-map coverage) so generated artifacts are auditable and reproducible.","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-12T14:18:07.01988-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-12T14:18:07.01988-08:00","dependencies":[{"issue_id":"kamera-yqn.2","depends_on_id":"kamera-yqn","type":"parent-child","created_at":"2026-02-12T14:18:07.021287-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-yqn.2","depends_on_id":"kamera-yqn.1","type":"blocks","created_at":"2026-02-12T14:21:10.047952-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-yqn.3","title":"Unify restart semantics across interactive and parallel runners","description":"Generalize restart execution so both single-run and parallel-run code paths share the same restart contract and behavior. Specifically ensure RestartRequest semantics (seed + config + optional execution-history prefix/preserve-history intent) are supported consistently in both interactive subtree restarts and non-interactive/parallel branching. 
Deliverables: shared restart application helper/API, parity tests, and explicit documentation of what history/prefix is preserved in dumps/results.","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-12T14:18:24.53152-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-12T14:18:24.53152-08:00","dependencies":[{"issue_id":"kamera-yqn.3","depends_on_id":"kamera-yqn","type":"parent-child","created_at":"2026-02-12T14:18:24.532845-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-yqn.4","title":"Refresh coverage-strategy-v2 design doc against current simulator capabilities","description":"Revise docs/design/coverage-strategy-v2.md to explicitly account for the current state of Kamera: existing trace richness (stateBefore/stateAfter/pending/effects), restart primitives, and determinization/normalization already implemented in simulator tooling. Clarify what is truly missing vs already available, and update open questions/assumptions accordingly so implementation tasks target real gaps instead of redoing completed infrastructure.","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-12T14:18:42.931604-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-12T14:18:42.931604-08:00","dependencies":[{"issue_id":"kamera-yqn.4","depends_on_id":"kamera-yqn","type":"parent-child","created_at":"2026-02-12T14:18:42.932846-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-yqn.4","depends_on_id":"kamera-fv4","type":"blocks","created_at":"2026-02-12T14:22:01.107846-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-yqn.4","depends_on_id":"kamera-yqn.1","type":"blocks","created_at":"2026-02-12T14:22:11.268119-08:00","created_by":"Tim Goodwin"}]} -{"id":"kamera-z38","title":"Investigate informer cache timing in RevisionReconciler","description":"The RevisionReconciler's reconcilePA phase uses deploymentLister to fetch the deployment created by reconcileDeployment. 
Sometimes the lister doesn't see the newly created deployment, causing reconcilePA to return early without creating the PA. This results in nondeterministic exploration where some paths create PA and some don't.\n","status":"open","priority":1,"issue_type":"bug","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T11:42:19.395997-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-20T14:17:20.301217-08:00"} +{"id":"kamera-z38","title":"Investigate informer cache timing in RevisionReconciler","description":"The RevisionReconciler's reconcilePA phase uses deploymentLister to fetch the deployment created by reconcileDeployment. Sometimes the lister doesn't see the newly created deployment, causing reconcilePA to return early without creating the PA. This results in nondeterministic exploration where some paths create PA and some don't.\n","status":"closed","priority":1,"issue_type":"bug","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T11:42:19.395997-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-25T17:43:47.720764-08:00","closed_at":"2026-02-25T17:43:47.720764-08:00","close_reason":"Closed"} {"id":"kamera-znd","title":"Create causal chain tracer for divergence analysis","description":"**Superseded by kamera-7tl (Backward-trace divergence analysis framework)**\n\nThe original vision for this task has been refined into a composable module architecture. The 'causal chain tracing' is now achieved by composing:\n\n1. Module 0 (kamera-pwm): Identify what differs in final states\n2. Module 1 (kamera-eja): Find last writes and input diffs\n3. 
Module 2 (kamera-gwc): Check if 'missing' state appears later\n\nSee kamera-7tl for the full design.\n\n## Original Description (preserved)\nBuild a tool that traces how an initial divergence cascades to final state differences:\n- Given a divergence point, track how the difference propagates\n- Show which reconcilers are affected by the divergence\n- Identify when/if paths reconverge on specific objects\n- Produce a summary showing: initial cause -\u003e intermediate effects -\u003e final outcome","status":"open","priority":2,"issue_type":"feature","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T12:27:34.573893-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T14:07:23.346227-08:00"} diff --git a/examples/crossplane/go.mod b/examples/crossplane/go.mod index e91ef86..4493b04 100644 --- a/examples/crossplane/go.mod +++ b/examples/crossplane/go.mod @@ -3,8 +3,8 @@ module github.com/crossplane/crossplane/v2/examples/crossplane go 1.24.0 require ( - github.com/crossplane/crossplane-runtime/v2 v2.1.0-rc.0 - github.com/crossplane/crossplane/v2 v2.0.0 + github.com/crossplane/crossplane-runtime/v2 v2.1.0 + github.com/crossplane/crossplane/v2 v2.1.0 github.com/tgoodwin/kamera v0.0.0 google.golang.org/protobuf v1.36.11 k8s.io/api v0.34.1 @@ -28,7 +28,7 @@ require ( github.com/gdamore/encoding v1.0.1 // indirect github.com/gdamore/tcell/v2 v2.8.1 // indirect github.com/go-ini/ini v1.67.0 // indirect - github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/go-logr/zerologr v1.2.3 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect @@ -59,7 +59,7 @@ require ( github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pkg/errors v0.9.1 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/pmezard/go-difflib 
v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_golang v1.22.0 // indirect github.com/prometheus/client_model v0.6.1 // indirect github.com/prometheus/common v0.62.0 // indirect @@ -78,23 +78,23 @@ require ( github.com/tidwall/sjson v1.2.5 // indirect github.com/wI2L/jsondiff v0.6.1 // indirect github.com/x448/float16 v0.8.4 // indirect - go.opentelemetry.io/otel v1.35.0 // indirect - go.opentelemetry.io/otel/trace v1.35.0 // indirect + go.opentelemetry.io/otel v1.36.0 // indirect + go.opentelemetry.io/otel/trace v1.36.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect go.yaml.in/yaml/v2 v2.4.2 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/crypto v0.37.0 // indirect + golang.org/x/crypto v0.39.0 // indirect golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa // indirect - golang.org/x/mod v0.24.0 // indirect - golang.org/x/net v0.39.0 // indirect - golang.org/x/oauth2 v0.29.0 // indirect - golang.org/x/sync v0.13.0 // indirect - golang.org/x/sys v0.32.0 // indirect - golang.org/x/term v0.31.0 // indirect - golang.org/x/text v0.24.0 // indirect + golang.org/x/mod v0.25.0 // indirect + golang.org/x/net v0.41.0 // indirect + golang.org/x/oauth2 v0.30.0 // indirect + golang.org/x/sync v0.15.0 // indirect + golang.org/x/sys v0.33.0 // indirect + golang.org/x/term v0.32.0 // indirect + golang.org/x/text v0.26.0 // indirect golang.org/x/time v0.11.0 // indirect - golang.org/x/tools v0.32.0 // indirect + golang.org/x/tools v0.34.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb // indirect google.golang.org/grpc v1.72.1 // indirect @@ -119,5 +119,3 @@ require ( ) replace github.com/tgoodwin/kamera => ../.. 
- -replace github.com/crossplane/crossplane/v2 => /Users/tgoodwin/projects/crossplane diff --git a/examples/crossplane/go.sum b/examples/crossplane/go.sum index 88f26fc..56ea164 100644 --- a/examples/crossplane/go.sum +++ b/examples/crossplane/go.sum @@ -12,8 +12,10 @@ github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UF github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= -github.com/crossplane/crossplane-runtime/v2 v2.1.0-rc.0 h1:T9KV7XKWCNVT7KAvUWPBGZVgO22YKvjDR9vWq1uyaFg= -github.com/crossplane/crossplane-runtime/v2 v2.1.0-rc.0/go.mod h1:pkd5UzmE8esaZAApevMutR832GjJ1Qgc5Ngr78ByxrI= +github.com/crossplane/crossplane-runtime/v2 v2.1.0 h1:JBMhL9T+/PfyjLAQEdZWlKLvA3jJVtza8zLLwd9Gs4k= +github.com/crossplane/crossplane-runtime/v2 v2.1.0/go.mod h1:j78pmk0qlI//Ur7zHhqTr8iePHFcwJKrZnzZB+Fg4t0= +github.com/crossplane/crossplane/v2 v2.1.0 h1:7RFel1CeIWQytwtHz1st7lZiycYXUIcS0LkaetCJa4Q= +github.com/crossplane/crossplane/v2 v2.1.0/go.mod h1:6ZIDSUjM22hmHC3szqh/dM17LHI+vgVFMUdP+UmEoXA= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= @@ -38,8 +40,8 @@ github.com/gdamore/tcell/v2 v2.8.1 h1:KPNxyqclpWpWQlPLx6Xui1pMk8S+7+R37h3g07997N github.com/gdamore/tcell/v2 v2.8.1/go.mod h1:bj8ori1BG3OYMjmb3IklZVWfZUJ1UBQt9JXrOCOhGWw= github.com/go-ini/ini v1.67.0 h1:z6ZrTEZqSWOTyH2FlglNbNgARyHG8oLW9gMELqKr06A= github.com/go-ini/ini v1.67.0/go.mod h1:ByCAeIL28uOIIG0E3PJtZPDL8WnHpFKFOtgjp+3Ies8= -github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= -github.com/go-logr/logr 
v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= +github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= @@ -134,8 +136,9 @@ github.com/onsi/gomega v1.37.0 h1:CdEG8g0S133B4OswTDC/5XPSzE1OeP29QOioj2PID2Y= github.com/onsi/gomega v1.37.0/go.mod h1:8D9+Txp43QWKhM24yyOBEdpkzN8FvJyAwecBgsU4KU0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= +github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v1.22.0 h1:rb93p9lokFEsctTys46VnV1kLCDpVZ0a/Y92Vm0Zc6Q= github.com/prometheus/client_golang v1.22.0/go.mod h1:R7ljNsLXhuQXYZYtw6GAE9AZg8Y7vEW5scdCXrWRXC0= github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= @@ -196,16 +199,16 @@ github.com/yuin/goldmark v1.3.5/go.mod h1:mwnBkeHKe2W/ZEtQ+71ViKU8L12m81fl3OWwC1 github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= go.opentelemetry.io/auto/sdk v1.1.0 h1:cH53jehLUN6UFLY71z+NDOiNJqDdPRaXzTel0sJySYA= go.opentelemetry.io/auto/sdk v1.1.0/go.mod h1:3wSPjt5PWp2RhlCcmmOial7AvC4DQqZb7a7wCow3W8A= -go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= -go.opentelemetry.io/otel 
v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= -go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= -go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= +go.opentelemetry.io/otel v1.36.0 h1:UumtzIklRBY6cI/lllNZlALOF5nNIzJVb16APdvgTXg= +go.opentelemetry.io/otel v1.36.0/go.mod h1:/TcFMXYjyRNh8khOAO9ybYkqaDBb/70aVwkNML4pP8E= +go.opentelemetry.io/otel/metric v1.36.0 h1:MoWPKVhQvJ+eeXWHFBOPoBOi20jh6Iq2CcCREuTYufE= +go.opentelemetry.io/otel/metric v1.36.0/go.mod h1:zC7Ks+yeyJt4xig9DEw9kuUFe5C3zLbVjV2PzT6qzbs= go.opentelemetry.io/otel/sdk v1.34.0 h1:95zS4k/2GOy069d321O8jWgYsW3MzVV+KuSPKp7Wr1A= go.opentelemetry.io/otel/sdk v1.34.0/go.mod h1:0e/pNiaMAqaykJGKbi+tSjWfNNHMTxoC9qANsCzbyxU= go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= -go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= -go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= +go.opentelemetry.io/otel/trace v1.36.0 h1:ahxWNuqZjpdiFAyrIoQ4GIiAIhxAunQR6MUoKrsNd4w= +go.opentelemetry.io/otel/trace v1.36.0/go.mod h1:gQ+OnDZzrybY4k4seLzPAWNwVBBVlF2szhehOBB/tGA= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= @@ -223,8 +226,8 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= -golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE= 
-golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc= +golang.org/x/crypto v0.39.0 h1:SHs+kF4LP+f+p14esP5jAoDpHU8Gu/v9lFRK6IT5imM= +golang.org/x/crypto v0.39.0/go.mod h1:L+Xg3Wf6HoL4Bn4238Z6ft6KfEpN0tJGo53AAPC632U= golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa h1:ELnwvuAXPNtPk1TJRuGkI9fDTwym6AYBu0qzT8AcHdI= golang.org/x/exp v0.0.0-20240808152545-0cdaa3abc0fa/go.mod h1:akd2r19cwCdwSwWeIdzYQGa/EZZyqcOdwWiwj5L5eKQ= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= @@ -235,8 +238,8 @@ golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= -golang.org/x/mod v0.24.0 h1:ZfthKaKaT4NrhGVZHO1/WDTwGES4De8KtWO0SIbNJMU= -golang.org/x/mod v0.24.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww= +golang.org/x/mod v0.25.0 h1:n7a+ZbQKQA/Ysbyb0/6IbB1H/X41mKgbhfv7AfG/44w= +golang.org/x/mod v0.25.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -249,10 +252,10 @@ golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= -golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY= -golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E= 
-golang.org/x/oauth2 v0.29.0 h1:WdYw2tdTK1S8olAzWHdgeqfy+Mtm9XNhv/xJsY65d98= -golang.org/x/oauth2 v0.29.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= +golang.org/x/net v0.41.0 h1:vBTly1HeNPEn3wtREYfy4GZ/NECgw2Cnl+nK6Nz3uvw= +golang.org/x/net v0.41.0/go.mod h1:B/K4NNqkfmg07DQYrbwvSluqCJOOXwUjeb/5lOisjbA= +golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= +golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -263,8 +266,8 @@ golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= -golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610= -golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= +golang.org/x/sync v0.15.0 h1:KWH3jNZsfyT6xfAfKiz6MRNmd46ByHDYaZ7KSkCtdW8= +golang.org/x/sync v0.15.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -282,8 +285,8 @@ golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.20.0/go.mod 
h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.29.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= -golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20= -golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/sys v0.33.0 h1:q3i8TbbEz+JRD9ywIRlyRAQbM0qF7hu24q3teo2hbuw= +golang.org/x/sys v0.33.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= @@ -293,8 +296,8 @@ golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= golang.org/x/term v0.28.0/go.mod h1:Sw/lC2IAUZ92udQNf3WodGtn4k/XoLyZoh8v/8uiwek= -golang.org/x/term v0.31.0 h1:erwDkOK1Msy6offm1mOgvspSkslFnIGsFnxOKoufg3o= -golang.org/x/term v0.31.0/go.mod h1:R4BeIy7D95HzImkxGkTW1UQTtP54tio2RyHz7PwK0aw= +golang.org/x/term v0.32.0 h1:DR4lr0TjUs3epypdhTOkMmuF5CDFJ/8pOnbzMZPQ7bg= +golang.org/x/term v0.32.0/go.mod h1:uZG1FhGx848Sqfsq4/DlJr3xGGsYMu/L5GW4abiaEPQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= @@ -304,8 +307,8 @@ golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.21.0/go.mod 
h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= -golang.org/x/text v0.24.0 h1:dd5Bzh4yt5KYA8f9CJHCP4FB4D51c2c6JvN37xJJkJ0= -golang.org/x/text v0.24.0/go.mod h1:L8rBsPeo2pSS+xqN0d5u2ikmjtmoJbDBT1b7nHvFCdU= +golang.org/x/text v0.26.0 h1:P42AVeLghgTYr4+xUnTRKDMqpar+PtX7KWuNQL21L8M= +golang.org/x/text v0.26.0/go.mod h1:QK15LZJUUQVJxhz7wXgxSy/CJaTFjd0G+YLonydOVQA= golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -317,8 +320,8 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= -golang.org/x/tools v0.32.0 h1:Q7N1vhpkQv7ybVzLFtTjvQya2ewbwNDZzUgfXGqtMWU= -golang.org/x/tools v0.32.0/go.mod h1:ZxrU41P/wAbZD8EDa6dDCa6XfpkhJ7HFMjHJXfBDu8s= +golang.org/x/tools v0.34.0 h1:qIpSLOxeCYGg9TrcJokLBG4KFA6d795g0xkBkiESGlo= +golang.org/x/tools v0.34.0/go.mod h1:pAP9OwEaY1CAW3HOmg3hLZC5Z0CCmzjAF2UQMSqNARg= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= diff --git a/examples/crossplane/scenario.go b/examples/crossplane/scenario.go index 6a02ebf..4a4e0f2 100644 --- a/examples/crossplane/scenario.go +++ b/examples/crossplane/scenario.go @@ -2,8 +2,9 @@ package main import ( "fmt" + "strings" - "github.com/crossplane/crossplane-runtime/v2/pkg/event" + xpevent 
"github.com/crossplane/crossplane-runtime/v2/pkg/event" "github.com/crossplane/crossplane-runtime/v2/pkg/logging" "github.com/crossplane/crossplane-runtime/v2/pkg/resource/fake" ucomposite "github.com/crossplane/crossplane-runtime/v2/pkg/resource/unstructured/composite" @@ -15,6 +16,7 @@ import ( "github.com/crossplane/crossplane/v2/internal/controller/apiextensions/revision" "github.com/tgoodwin/kamera/pkg/coverage" + "github.com/tgoodwin/kamera/pkg/event" "github.com/tgoodwin/kamera/pkg/explore" "github.com/tgoodwin/kamera/pkg/tag" "github.com/tgoodwin/kamera/pkg/tracecheck" @@ -67,7 +69,7 @@ func newCrossplaneExplorerBuilder() *tracecheck.ExplorerBuilder { composite.WithLogger(log), composite.WithRecorder(recorder), ) - }).For(xrAPIVersion + "/" + xrKind) + }).ForGK(schema.GroupKind{Group: "example.org", Kind: xrKind}) return builder } @@ -153,11 +155,11 @@ type logRecorder struct { annotations map[string]string } -func newLogRecorder(log logging.Logger) event.Recorder { +func newLogRecorder(log logging.Logger) xpevent.Recorder { return logRecorder{log: log, annotations: map[string]string{}} } -func (r logRecorder) Event(obj runtime.Object, e event.Event) { +func (r logRecorder) Event(obj runtime.Object, e xpevent.Event) { fields := []any{ "type", string(e.Type), "reason", string(e.Reason), @@ -186,7 +188,7 @@ func (r logRecorder) Event(obj runtime.Object, e event.Event) { r.log.Info("Crossplane event", fields...) 
} -func (r logRecorder) WithAnnotations(keysAndValues ...string) event.Recorder { +func (r logRecorder) WithAnnotations(keysAndValues ...string) xpevent.Recorder { next := logRecorder{ log: r.log, annotations: copyAnnotations(r.annotations), @@ -221,6 +223,156 @@ func copyAnnotations(in map[string]string) map[string]string { return out } -func scenariosFromInputs(_ *tracecheck.ExplorerBuilder, _ []coverage.Input) ([]explore.Scenario, error) { - return nil, fmt.Errorf("input to scenario conversion not implemented") +func scenariosFromInputs(builder *tracecheck.ExplorerBuilder, inputs []coverage.Input) ([]explore.Scenario, error) { + if builder == nil { + return nil, fmt.Errorf("builder is nil") + } + if len(inputs) == 0 { + return nil, fmt.Errorf("no inputs supplied") + } + + baseCfg := builder.Config() + scenarios := make([]explore.Scenario, 0, len(inputs)) + for idx, input := range inputs { + state, seededObjects, err := buildStateFromCoverageInput(builder, input) + if err != nil { + return nil, fmt.Errorf("build start state for input %d (%s): %w", idx, input.Name, err) + } + userInputs, err := buildUserActionsFromCoverageInput(input, seededObjects) + if err != nil { + return nil, fmt.Errorf("build user actions for input %d (%s): %w", idx, input.Name, err) + } + + scenarios = append(scenarios, explore.Scenario{ + Name: input.Name, + EnvironmentState: state, + UserInputs: userInputs, + Config: applyInputTuning(baseCfg, input.Tuning), + }) + } + + return scenarios, nil +} + +func buildStateFromCoverageInput(builder *tracecheck.ExplorerBuilder, input coverage.Input) (tracecheck.StateNode, []client.Object, error) { + if builder == nil { + return tracecheck.StateNode{}, nil, fmt.Errorf("builder is nil") + } + + objects := make([]client.Object, 0, len(input.EnvironmentState.Objects)) + for idx, obj := range input.EnvironmentState.Objects { + if obj == nil { + return tracecheck.StateNode{}, nil, fmt.Errorf("input environment object %d is nil", idx) + } + objects = 
append(objects, obj.DeepCopy()) + } + + if len(objects) == 0 { + for _, action := range input.UserInputs { + if action.Type != event.CREATE || action.Object == nil { + continue + } + objects = append(objects, action.Object.DeepCopy()) + } + } + + if len(objects) == 0 { + return tracecheck.StateNode{}, nil, fmt.Errorf("input has no seedable objects") + } + + state, err := builder.BuildStartStateFromObjects(objects, nil) + if err != nil { + return tracecheck.StateNode{}, nil, err + } + return state, objects, nil +} + +func buildUserActionsFromCoverageInput(input coverage.Input, seededObjects []client.Object) ([]tracecheck.UserAction, error) { + actions := make([]tracecheck.UserAction, 0, len(input.UserInputs)) + for idx, action := range input.UserInputs { + if action.Object == nil { + return nil, fmt.Errorf("input user input %d has nil object", idx) + } + + id := strings.TrimSpace(action.ID) + if id == "" { + id = fmt.Sprintf("user-input-%d", idx) + } + + opType := action.Type + if opType == event.CREATE && isInputObjectSeeded(action.Object, seededObjects) { + opType = event.UPDATE + } + + actions = append(actions, tracecheck.UserAction{ + ID: id, + OpType: opType, + Payload: action.Object.DeepCopy(), + }) + } + return actions, nil +} + +func isInputObjectSeeded(object client.Object, seededObjects []client.Object) bool { + if object == nil { + return false + } + for _, seeded := range seededObjects { + if sameObjectIdentity(seeded, object) { + return true + } + } + return false +} + +func sameObjectIdentity(a, b client.Object) bool { + if a == nil || b == nil { + return false + } + aGVK := a.GetObjectKind().GroupVersionKind() + bGVK := b.GetObjectKind().GroupVersionKind() + if aGVK.Group != bGVK.Group || aGVK.Kind != bGVK.Kind { + return false + } + return a.GetNamespace() == b.GetNamespace() && a.GetName() == b.GetName() +} + +func applyInputTuning(base tracecheck.ExploreConfig, tuning coverage.InputTuning) tracecheck.ExploreConfig { + cfg := base.Clone() + if 
tuning.MaxDepth > 0 { + cfg.MaxDepth = tuning.MaxDepth + } + if len(tuning.PermuteControllers) > 0 { + if cfg.Perturbations.PermuteOrder == nil { + cfg.Perturbations.PermuteOrder = make(map[tracecheck.ReconcilerID]bool) + } + for _, controllerID := range tuning.PermuteControllers { + cfg.Perturbations.PermuteOrder[tracecheck.ReconcilerID(controllerID)] = true + } + } + if len(tuning.StaleReads) > 0 { + if cfg.Perturbations.Staleness == nil { + cfg.Perturbations.Staleness = make(map[tracecheck.ReconcilerID]tracecheck.StalenessConfig) + } + for controllerID, kinds := range tuning.StaleReads { + id := tracecheck.ReconcilerID(controllerID) + st := cfg.Perturbations.Staleness[id] + if st.StaleReadBounds == nil { + st.StaleReadBounds = make(tracecheck.LookbackLimits) + } + for _, kind := range kinds { + trimmed := strings.TrimSpace(kind) + if trimmed == "" { + continue + } + lookback := tuning.StaleLookback[trimmed] + if lookback <= 0 { + lookback = 1 + } + st.StaleReadBounds[trimmed] = tracecheck.LookbackLimit(lookback) + } + cfg.Perturbations.Staleness[id] = st + } + } + return cfg } diff --git a/examples/crossplane/scenario_test.go b/examples/crossplane/scenario_test.go index 366de13..9b71165 100644 --- a/examples/crossplane/scenario_test.go +++ b/examples/crossplane/scenario_test.go @@ -1,19 +1,153 @@ package main import ( - "strings" "testing" + + "github.com/tgoodwin/kamera/pkg/coverage" + "github.com/tgoodwin/kamera/pkg/event" + "github.com/tgoodwin/kamera/pkg/tracecheck" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" ) -func TestScenariosFromInputsUnimplemented(t *testing.T) { - scenarios, err := scenariosFromInputs(nil, nil) +func TestScenariosFromInputsRequiresBuilder(t *testing.T) { + _, err := scenariosFromInputs(nil, []coverage.Input{{Name: "x"}}) + if err == nil { + t.Fatal("expected error, got nil") + } +} + +func TestScenariosFromInputsRequiresInputs(t *testing.T) { + builder := newCrossplaneExplorerBuilder() + 
_, err := scenariosFromInputs(builder, nil) if err == nil { t.Fatal("expected error, got nil") } - if !strings.Contains(err.Error(), "input to scenario conversion not implemented") { - t.Fatalf("unexpected error: %v", err) +} + +func TestScenariosFromInputsTranslatesStateActionsAndTuning(t *testing.T) { + builder := newCrossplaneExplorerBuilder() + + functionObj := mustObjectAsUnstructured(t, buildFunctionRevision()) + xr := buildCompositeResource() + + input := coverage.Input{ + Name: "crossplane-translate", + EnvironmentState: coverage.EnvironmentState{ + Objects: []*unstructured.Unstructured{functionObj}, + }, + UserInputs: []coverage.UserInput{ + { + Type: event.CREATE, + Object: xr, + }, + }, + Tuning: coverage.InputTuning{ + MaxDepth: 7, + PermuteControllers: []string{"CompositeReconciler"}, + StaleReads: map[string][]string{ + "CompositionReconciler": {"apiextensions.crossplane.io/CompositionRevision"}, + }, + StaleLookback: map[string]int{ + "apiextensions.crossplane.io/CompositionRevision": 2, + }, + }, + } + + scenarios, err := scenariosFromInputs(builder, []coverage.Input{input}) + if err != nil { + t.Fatalf("scenariosFromInputs error = %v", err) + } + if len(scenarios) != 1 { + t.Fatalf("expected 1 scenario, got %d", len(scenarios)) + } + + sc := scenarios[0] + if sc.Name != input.Name { + t.Fatalf("expected scenario name %q, got %q", input.Name, sc.Name) + } + if len(sc.EnvironmentState.Objects()) == 0 { + t.Fatalf("expected seeded state objects, got empty state") + } + if len(sc.UserInputs) != 1 { + t.Fatalf("expected 1 user action, got %d", len(sc.UserInputs)) + } + if sc.UserInputs[0].ID != "user-input-0" { + t.Fatalf("expected default action id user-input-0, got %q", sc.UserInputs[0].ID) + } + if sc.UserInputs[0].OpType != event.CREATE { + t.Fatalf("expected CREATE user action, got %q", sc.UserInputs[0].OpType) + } + payload, ok := sc.UserInputs[0].Payload.(*unstructured.Unstructured) + if !ok { + t.Fatalf("expected unstructured payload, got %T", 
sc.UserInputs[0].Payload) + } + if payload == xr { + t.Fatalf("expected payload deep copy, got original pointer") + } + if payload.GetKind() != "XWidget" || payload.GetName() != "example" { + t.Fatalf("unexpected payload identity: kind=%q name=%q", payload.GetKind(), payload.GetName()) + } + if sc.Config.MaxDepth != 7 { + t.Fatalf("expected MaxDepth=7, got %d", sc.Config.MaxDepth) + } + if !sc.Config.Perturbations.PermuteOrder[tracecheck.ReconcilerID("CompositeReconciler")] { + t.Fatalf("expected CompositeReconciler permutation to be enabled") + } + staleness, ok := sc.Config.Perturbations.Staleness[tracecheck.ReconcilerID("CompositionReconciler")] + if !ok { + t.Fatalf("expected staleness tuning for CompositionReconciler") + } + if staleness.StaleReadBounds["apiextensions.crossplane.io/CompositionRevision"] != 2 { + t.Fatalf("expected lookback=2, got %d", staleness.StaleReadBounds["apiextensions.crossplane.io/CompositionRevision"]) + } +} + +func TestScenariosFromInputsSeedsCreateOnlyInputs(t *testing.T) { + builder := newCrossplaneExplorerBuilder() + xr := buildCompositeResource() + + input := coverage.Input{ + Name: "crossplane-create-only", + UserInputs: []coverage.UserInput{ + { + ID: "create-xr", + Type: event.CREATE, + Object: xr, + }, + }, + } + + scenarios, err := scenariosFromInputs(builder, []coverage.Input{input}) + if err != nil { + t.Fatalf("scenariosFromInputs error = %v", err) + } + if len(scenarios) != 1 { + t.Fatalf("expected 1 scenario, got %d", len(scenarios)) + } + + sc := scenarios[0] + if len(sc.EnvironmentState.Objects()) == 0 { + t.Fatalf("expected state to be seeded from create inputs") + } + if len(sc.UserInputs) != 1 { + t.Fatalf("expected 1 user action, got %d", len(sc.UserInputs)) + } + if sc.UserInputs[0].OpType != event.UPDATE { + t.Fatalf("expected seeded CREATE to translate to UPDATE, got %q", sc.UserInputs[0].OpType) + } +} + +func mustObjectAsUnstructured(t *testing.T, obj runtime.Object) *unstructured.Unstructured { + t.Helper() 
+ raw, err := runtime.DefaultUnstructuredConverter.ToUnstructured(obj) + if err != nil { + t.Fatalf("convert object %T to unstructured: %v", obj, err) } - if scenarios != nil { - t.Fatalf("expected nil scenarios, got %v", scenarios) + u := &unstructured.Unstructured{Object: raw} + gvk := obj.GetObjectKind().GroupVersionKind() + if gvk.Kind != "" { + u.SetGroupVersionKind(gvk) } + return u } diff --git a/examples/crossplane/two-step-workflow.json b/examples/crossplane/two-step-workflow.json new file mode 100644 index 0000000..0e7ca72 --- /dev/null +++ b/examples/crossplane/two-step-workflow.json @@ -0,0 +1,306 @@ +[ + { + "name": "crossplane-default/xr-create-then-switch-composition", + "environmentState": { + "objects": [ + { + "apiVersion": "pkg.crossplane.io/v1", + "kind": "FunctionRevision", + "metadata": { + "name": "kamera-stub-rev", + "labels": { + "pkg.crossplane.io/package": "kamera-stub" + } + }, + "spec": { + "desiredState": "Active" + }, + "status": { + "capabilities": [ + "composition" + ] + } + }, + { + "apiVersion": "apiextensions.crossplane.io/v1", + "kind": "Composition", + "metadata": { + "name": "widget-composition-v1", + "annotations": { + "workflow.kamera.dev/version": "v1" + } + }, + "spec": { + "compositeTypeRef": { + "apiVersion": "example.org/v1", + "kind": "XWidget" + }, + "mode": "Pipeline", + "pipeline": [ + { + "step": "pipeline", + "functionRef": { + "name": "kamera-stub" + } + } + ], + "writeConnectionSecretsToNamespace": "default" + } + }, + { + "apiVersion": "apiextensions.crossplane.io/v1", + "kind": "Composition", + "metadata": { + "name": "widget-composition-v2", + "annotations": { + "workflow.kamera.dev/version": "v2" + } + }, + "spec": { + "compositeTypeRef": { + "apiVersion": "example.org/v1", + "kind": "XWidget" + }, + "mode": "Pipeline", + "pipeline": [ + { + "step": "pipeline", + "functionRef": { + "name": "kamera-stub" + } + } + ], + "writeConnectionSecretsToNamespace": "crossplane-system" + } + }, + { + "apiVersion": 
"apiextensions.crossplane.io/v1", + "kind": "CompositionRevision", + "metadata": { + "name": "widget-composition-v1-rev-1", + "labels": { + "crossplane.io/composition-name": "widget-composition-v1" + }, + "ownerReferences": [ + { + "apiVersion": "apiextensions.crossplane.io/v1", + "kind": "Composition", + "name": "widget-composition-v1", + "uid": "", + "controller": true + } + ] + }, + "spec": { + "compositeTypeRef": { + "apiVersion": "example.org/v1", + "kind": "XWidget" + }, + "mode": "Pipeline", + "pipeline": [ + { + "step": "pipeline", + "functionRef": { + "name": "kamera-stub" + } + } + ], + "writeConnectionSecretsToNamespace": "default", + "revision": 1 + }, + "status": { + "conditions": [ + { + "type": "ValidPipeline", + "status": "True", + "reason": "ValidPipeline" + } + ] + } + }, + { + "apiVersion": "apiextensions.crossplane.io/v1", + "kind": "CompositionRevision", + "metadata": { + "name": "widget-composition-v2-rev-1", + "labels": { + "crossplane.io/composition-name": "widget-composition-v2" + }, + "ownerReferences": [ + { + "apiVersion": "apiextensions.crossplane.io/v1", + "kind": "Composition", + "name": "widget-composition-v2", + "uid": "", + "controller": true + } + ] + }, + "spec": { + "compositeTypeRef": { + "apiVersion": "example.org/v1", + "kind": "XWidget" + }, + "mode": "Pipeline", + "pipeline": [ + { + "step": "pipeline", + "functionRef": { + "name": "kamera-stub" + } + } + ], + "writeConnectionSecretsToNamespace": "crossplane-system", + "revision": 1 + }, + "status": { + "conditions": [ + { + "type": "ValidPipeline", + "status": "True", + "reason": "ValidPipeline" + } + ] + } + } + ] + }, + "tuning": { + "maxDepth": 0, + "permuteControllers": null, + "staleReads": null, + "staleLookback": null + }, + "userInputs": [ + { + "id": "create xr bound to v1 composition", + "type": "CREATE", + "object": { + "apiVersion": "example.org/v1", + "kind": "XWidget", + "metadata": { + "name": "example", + "namespace": "default" + }, + "spec": { + 
"compositionRef": { + "name": "widget-composition-v1" + }, + "compositionUpdatePolicy": "Automatic" + } + } + }, + { + "id": "update xr to v2 composition", + "type": "UPDATE", + "object": { + "apiVersion": "example.org/v1", + "kind": "XWidget", + "metadata": { + "name": "example", + "namespace": "default" + }, + "spec": { + "compositionRef": { + "name": "widget-composition-v2" + }, + "compositionUpdatePolicy": "Automatic" + } + } + } + ] + }, + { + "name": "crossplane-default/composition-create-then-update", + "environmentState": { + "objects": [ + { + "apiVersion": "pkg.crossplane.io/v1", + "kind": "FunctionRevision", + "metadata": { + "name": "kamera-stub-rev", + "labels": { + "pkg.crossplane.io/package": "kamera-stub" + } + }, + "spec": { + "desiredState": "Active" + }, + "status": { + "capabilities": [ + "composition" + ] + } + } + ] + }, + "tuning": { + "maxDepth": 0, + "permuteControllers": null, + "staleReads": null, + "staleLookback": null + }, + "userInputs": [ + { + "id": "create composition", + "type": "CREATE", + "object": { + "apiVersion": "apiextensions.crossplane.io/v1", + "kind": "Composition", + "metadata": { + "name": "widget-composition-rollout", + "annotations": { + "workflow.kamera.dev/version": "v1" + } + }, + "spec": { + "compositeTypeRef": { + "apiVersion": "example.org/v1", + "kind": "XWidget" + }, + "mode": "Pipeline", + "pipeline": [ + { + "step": "pipeline", + "functionRef": { + "name": "kamera-stub" + } + } + ], + "writeConnectionSecretsToNamespace": "default" + } + } + }, + { + "id": "update composition", + "type": "UPDATE", + "object": { + "apiVersion": "apiextensions.crossplane.io/v1", + "kind": "Composition", + "metadata": { + "name": "widget-composition-rollout", + "annotations": { + "workflow.kamera.dev/version": "v2" + } + }, + "spec": { + "compositeTypeRef": { + "apiVersion": "example.org/v1", + "kind": "XWidget" + }, + "mode": "Pipeline", + "pipeline": [ + { + "step": "pipeline", + "functionRef": { + "name": "kamera-stub" + } + 
} + ], + "writeConnectionSecretsToNamespace": "crossplane-system" + } + } + } + ] + } +] From fdbc836bdb52b070e395f9bce024b3834df01a2c Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Wed, 25 Feb 2026 18:04:21 -0800 Subject: [PATCH 24/29] add a two step workflow for kratix --- examples/kratix/flows.go | 441 ++++++++++++++++++++++--- examples/kratix/flows_test.go | 125 +++++++ examples/kratix/main.go | 62 ++-- examples/kratix/two-step-workflow.json | 119 +++++++ 4 files changed, 675 insertions(+), 72 deletions(-) create mode 100644 examples/kratix/flows_test.go create mode 100644 examples/kratix/two-step-workflow.json diff --git a/examples/kratix/flows.go b/examples/kratix/flows.go index f1791f4..8266fae 100644 --- a/examples/kratix/flows.go +++ b/examples/kratix/flows.go @@ -3,27 +3,61 @@ package main import ( "context" "fmt" + "strings" "time" "github.com/go-logr/logr" kratix "github.com/syntasso/kratix/api/v1alpha1" "github.com/syntasso/kratix/internal/controller" + "github.com/syntasso/kratix/internal/controller/controllerfakes" "github.com/syntasso/kratix/lib/compression" "github.com/syntasso/kratix/lib/writers" "github.com/tgoodwin/kamera/pkg/coverage" + "github.com/tgoodwin/kamera/pkg/event" "github.com/tgoodwin/kamera/pkg/explore" "github.com/tgoodwin/kamera/pkg/tag" "github.com/tgoodwin/kamera/pkg/tracecheck" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + fakeclientset "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset/fake" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" + controllerconfig "sigs.k8s.io/controller-runtime/pkg/config" 
"sigs.k8s.io/controller-runtime/pkg/log/zap" ) +const ( + workControllerID tracecheck.ReconcilerID = "WorkController" + workPlacementControllerID tracecheck.ReconcilerID = "WorkPlacementController" + promiseControllerID tracecheck.ReconcilerID = "PromiseController" + promiseRevisionControllerID tracecheck.ReconcilerID = "PromiseRevisionController" +) + +const ( + workKind = "platform.kratix.io/Work" + workPlacementKind = "platform.kratix.io/WorkPlacement" + destinationKind = "platform.kratix.io/Destination" + stateStoreKind = "platform.kratix.io/BucketStateStore" + resourceBindingKind = "platform.kratix.io/ResourceBinding" + promiseKind = "platform.kratix.io/Promise" + promiseRevisionKind = "platform.kratix.io/PromiseRevision" +) + +type dynamicControllerSpec struct { + key string + controllerID tracecheck.ReconcilerID + promiseName string + gvk *schema.GroupVersionKind + crd *apiextensionsv1.CustomResourceDefinition + placeholder *controller.DynamicResourceRequestController +} + type noopScheduler struct{} func (noopScheduler) ReconcileWork(context.Context, *kratix.Work) ([]string, error) { @@ -58,22 +92,19 @@ func init() { ) } -func buildWorksFlow() (*tracecheck.ExplorerBuilder, tracecheck.StateNode, error) { +func newKratixExplorerBuilder() *tracecheck.ExplorerBuilder { scheme := runtime.NewScheme() utilruntime.Must(kratix.AddToScheme(scheme)) - + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(kratix.AddToScheme(clientgoscheme.Scheme)) ctrl.SetLogger(zap.New(zap.UseDevMode(true))) tracecheck.SetLogger(ctrl.Log.WithName("tracecheck")) - - const workControllerID = "WorkController" - const workPlacementControllerID = "WorkPlacementController" - workKind := "platform.kratix.io/Work" - workPlacementKind := "platform.kratix.io/WorkPlacement" - destinationKind := "platform.kratix.io/Destination" - stateStoreKind := "platform.kratix.io/BucketStateStore" - eb := tracecheck.NewExplorerBuilder(scheme) eb.WithMaxDepth(100) + return eb +} + +func 
configureWorksReconcilers(eb *tracecheck.ExplorerBuilder) { eb.WithReconciler(workControllerID, func(c ctrlclient.Client) tracecheck.Reconciler { return &controller.WorkReconciler{ Client: c, @@ -94,6 +125,57 @@ func buildWorksFlow() (*tracecheck.ExplorerBuilder, tracecheck.StateNode, error) eb.WithResourceDep(workPlacementKind, workPlacementControllerID) eb.WithResourceDep(destinationKind, workPlacementControllerID) eb.WithResourceDep(stateStoreKind, workPlacementControllerID) +} + +func configurePromisesReconcilers( + eb *tracecheck.ExplorerBuilder, + preStarted map[string]*controller.DynamicResourceRequestController, +) { + eb.WithReconciler(promiseControllerID, func(c ctrlclient.Client) tracecheck.Reconciler { + nsClient := &defaultNamespaceClient{Client: c, namespace: "default"} + manager := &controllerfakes.FakeManager{} + skipNameValidation := true + manager.GetControllerOptionsReturns(controllerconfig.Controller{SkipNameValidation: &skipNameValidation}) + manager.GetEventRecorderForReturns(record.NewFakeRecorder(32)) + return &controller.PromiseReconciler{ + Client: nsClient, + ApiextensionsClient: fakeclientset.NewSimpleClientset().ApiextensionsV1(), + Log: ctrl.Log.WithName("promise"), + Manager: manager, + EventRecorder: record.NewFakeRecorder(32), + PromiseUpgrade: true, + NumberOfJobsToKeep: 1, + ReconciliationInterval: time.Hour, + RestartManager: func() {}, + StartedDynamicControllers: copyDynamicControllers(preStarted), + } + }).For(promiseKind) + eb.WithReconciler(promiseRevisionControllerID, func(c ctrlclient.Client) tracecheck.Reconciler { + nsClient := &defaultNamespaceClient{Client: c, namespace: "default"} + return &controller.PromiseRevisionReconciler{ + Client: nsClient, + Log: ctrl.Log.WithName("promise-revision"), + EventRecorder: record.NewFakeRecorder(32), + } + }).For(promiseRevisionKind) + eb.WithResourceDep(promiseKind, promiseControllerID) + eb.WithResourceDep(promiseRevisionKind, promiseRevisionControllerID) +} + +func 
buildInputDrivenBuilder(inputs []coverage.Input) (*tracecheck.ExplorerBuilder, error) { + specs := dynamicControllerSpecsFromInputs(inputs) + prestarted := preStartedDynamicControllersFromSpecs(specs) + + eb := newKratixExplorerBuilder() + configureWorksReconcilers(eb) + configureDynamicRequestReconcilers(eb, specs) + configurePromisesReconcilers(eb, prestarted) + return eb, nil +} + +func buildWorksFlow() (*tracecheck.ExplorerBuilder, tracecheck.StateNode, error) { + eb := newKratixExplorerBuilder() + configureWorksReconcilers(eb) work := &kratix.Work{ ObjectMeta: metav1.ObjectMeta{ @@ -200,42 +282,8 @@ func buildWorksFlow() (*tracecheck.ExplorerBuilder, tracecheck.StateNode, error) } func buildPromisesFlow() (*tracecheck.ExplorerBuilder, tracecheck.StateNode, error) { - scheme := runtime.NewScheme() - utilruntime.Must(kratix.AddToScheme(scheme)) - utilruntime.Must(clientgoscheme.AddToScheme(scheme)) - utilruntime.Must(kratix.AddToScheme(clientgoscheme.Scheme)) - - ctrl.SetLogger(zap.New(zap.UseDevMode(true))) - tracecheck.SetLogger(ctrl.Log.WithName("tracecheck")) - - const promiseControllerID = "PromiseController" - const promiseRevisionControllerID = "PromiseRevisionController" - promiseKind := "platform.kratix.io/Promise" - promiseRevisionKind := "platform.kratix.io/PromiseRevision" - - eb := tracecheck.NewExplorerBuilder(scheme) - eb.WithMaxDepth(100) - eb.WithReconciler(promiseControllerID, func(c ctrlclient.Client) tracecheck.Reconciler { - nsClient := &defaultNamespaceClient{Client: c, namespace: "default"} - return &controller.PromiseReconciler{ - Client: nsClient, - Log: ctrl.Log.WithName("promise"), - EventRecorder: record.NewFakeRecorder(32), - PromiseUpgrade: true, - NumberOfJobsToKeep: 1, - ReconciliationInterval: time.Hour, - } - }).For(promiseKind) - eb.WithReconciler(promiseRevisionControllerID, func(c ctrlclient.Client) tracecheck.Reconciler { - nsClient := &defaultNamespaceClient{Client: c, namespace: "default"} - return 
&controller.PromiseRevisionReconciler{ - Client: nsClient, - Log: ctrl.Log.WithName("promise-revision"), - EventRecorder: record.NewFakeRecorder(32), - } - }).For(promiseRevisionKind) - eb.WithResourceDep(promiseKind, promiseControllerID) - eb.WithResourceDep(promiseRevisionKind, promiseRevisionControllerID) + eb := newKratixExplorerBuilder() + configurePromisesReconcilers(eb, nil) promise := &kratix.Promise{ ObjectMeta: metav1.ObjectMeta{ @@ -268,6 +316,303 @@ func buildPromisesFlow() (*tracecheck.ExplorerBuilder, tracecheck.StateNode, err return eb, initialState, nil } -func scenariosFromInputs(_ *tracecheck.ExplorerBuilder, _ []coverage.Input) ([]explore.Scenario, error) { - return nil, fmt.Errorf("input to scenario conversion not implemented") +func scenariosFromInputs(builder *tracecheck.ExplorerBuilder, inputs []coverage.Input) ([]explore.Scenario, error) { + if builder == nil { + return nil, fmt.Errorf("builder is nil") + } + if len(inputs) == 0 { + return nil, fmt.Errorf("no inputs supplied") + } + + scenarios := make([]explore.Scenario, 0, len(inputs)) + for idx, input := range inputs { + state, seededObjects, err := buildStateFromCoverageInput(builder, input) + if err != nil { + return nil, fmt.Errorf("build start state for input %d (%s): %w", idx, input.Name, err) + } + userInputs, err := buildUserActionsFromCoverageInput(input, seededObjects) + if err != nil { + return nil, fmt.Errorf("build user actions for input %d (%s): %w", idx, input.Name, err) + } + + scenarios = append(scenarios, explore.Scenario{ + Name: input.Name, + EnvironmentState: state, + UserInputs: userInputs, + Config: applyInputTuning(builder.Config(), input.Tuning), + }) + } + + if len(scenarios) == 0 { + return nil, fmt.Errorf("no scenarios produced") + } + return scenarios, nil +} + +func buildStateFromCoverageInput( + builder *tracecheck.ExplorerBuilder, + input coverage.Input, +) (tracecheck.StateNode, []ctrlclient.Object, error) { + objects := make([]ctrlclient.Object, 0, 
len(input.EnvironmentState.Objects)) + for _, obj := range input.EnvironmentState.Objects { + if obj == nil { + continue + } + objects = append(objects, obj.DeepCopy()) + } + if len(objects) == 0 { + return tracecheck.StateNode{}, nil, fmt.Errorf("input has no environment objects") + } + + pending := initialPendingReconciles(objects) + state, err := builder.BuildStartStateFromObjects(objects, pending) + if err != nil { + return tracecheck.StateNode{}, nil, err + } + return state, objects, nil +} + +func initialPendingReconciles(objects []ctrlclient.Object) []tracecheck.PendingReconcile { + pending := make([]tracecheck.PendingReconcile, 0) + for _, obj := range objects { + if obj == nil { + continue + } + controllerID, ok := controllerForObject(obj) + if !ok { + continue + } + pending = append(pending, tracecheck.PendingReconcile{ + ReconcilerID: controllerID, + Request: ctrl.Request{ + NamespacedName: ctrlclient.ObjectKeyFromObject(obj), + }, + Source: tracecheck.SourceStateChange, + }) + } + return pending +} + +func controllerForObject(obj ctrlclient.Object) (tracecheck.ReconcilerID, bool) { + gvk := obj.GetObjectKind().GroupVersionKind() + switch gvk.Group + "/" + gvk.Kind { + case "platform.kratix.io/Promise": + return promiseControllerID, true + case "platform.kratix.io/PromiseRevision": + return promiseRevisionControllerID, true + case "platform.kratix.io/Work": + return workControllerID, true + case "platform.kratix.io/WorkPlacement", "platform.kratix.io/Destination", "platform.kratix.io/BucketStateStore": + return workPlacementControllerID, true + default: + return "", false + } +} + +func dynamicControllerSpecsFromInputs(inputs []coverage.Input) []dynamicControllerSpec { + out := make([]dynamicControllerSpec, 0) + seen := make(map[string]struct{}) + add := func(obj *unstructured.Unstructured) { + spec, ok := dynamicControllerSpecForPromise(obj) + if !ok { + return + } + if _, exists := seen[spec.key]; exists { + return + } + seen[spec.key] = struct{}{} + 
out = append(out, spec) + } + + for _, input := range inputs { + for _, obj := range input.EnvironmentState.Objects { + add(obj) + } + for _, userInput := range input.UserInputs { + add(userInput.Object) + } + } + return out +} + +func dynamicControllerSpecForPromise(obj *unstructured.Unstructured) (dynamicControllerSpec, bool) { + if obj == nil { + return dynamicControllerSpec{}, false + } + if obj.GetKind() != "Promise" || !strings.HasPrefix(obj.GetAPIVersion(), "platform.kratix.io/") { + return dynamicControllerSpec{}, false + } + + promise := &kratix.Promise{} + if err := runtime.DefaultUnstructuredConverter.FromUnstructured(obj.Object, promise); err != nil { + return dynamicControllerSpec{}, false + } + if !promise.ContainsAPI() { + return dynamicControllerSpec{}, false + } + + gvk, crd, err := promise.GetAPI() + if err != nil || gvk == nil || crd == nil { + return dynamicControllerSpec{}, false + } + + controllerID := tracecheck.ReconcilerID(fmt.Sprintf("DynamicResourceRequestController/%s", promise.GetName())) + return dynamicControllerSpec{ + key: promise.GetDynamicControllerName(ctrl.Log.WithName("promise-prestart")), + controllerID: controllerID, + promiseName: promise.GetName(), + gvk: gvk, + crd: crd, + placeholder: &controller.DynamicResourceRequestController{}, + }, true +} + +func preStartedDynamicControllersFromSpecs(specs []dynamicControllerSpec) map[string]*controller.DynamicResourceRequestController { + if len(specs) == 0 { + return nil + } + out := make(map[string]*controller.DynamicResourceRequestController, len(specs)) + for _, spec := range specs { + out[spec.key] = spec.placeholder + } + return out +} + +func configureDynamicRequestReconcilers(eb *tracecheck.ExplorerBuilder, specs []dynamicControllerSpec) { + for _, spec := range specs { + spec := spec + rrKind := spec.gvk.Group + "/" + spec.gvk.Kind + + eb.WithReconciler(spec.controllerID, func(c ctrlclient.Client) tracecheck.Reconciler { + nsClient := &defaultNamespaceClient{Client: c, 
namespace: "default"} + enabled := true + canCreateResources := true + + controllerValue := controller.DynamicResourceRequestController{ + Client: nsClient, + GVK: spec.gvk, + Scheme: c.Scheme(), + PromiseIdentifier: spec.promiseName, + Log: ctrl.Log.WithName("dynamic-resource-request").WithName(spec.promiseName), + UID: "00000", + Enabled: &enabled, + CRD: spec.crd, + PromiseDestinationSelectors: nil, + CanCreateResources: &canCreateResources, + NumberOfJobsToKeep: 1, + ReconciliationInterval: time.Hour, + EventRecorder: record.NewFakeRecorder(32), + PromiseUpgrade: true, + } + *spec.placeholder = controllerValue + return spec.placeholder + }).For(rrKind) + + eb.WithResourceDep(rrKind, spec.controllerID) + eb.WithResourceDep(workKind, spec.controllerID) + eb.WithResourceDep(resourceBindingKind, spec.controllerID) + } +} + +func copyDynamicControllers( + in map[string]*controller.DynamicResourceRequestController, +) map[string]*controller.DynamicResourceRequestController { + if len(in) == 0 { + return nil + } + out := make(map[string]*controller.DynamicResourceRequestController, len(in)) + for k, v := range in { + out[k] = v + } + return out +} + +func applyInputTuning(base tracecheck.ExploreConfig, tuning coverage.InputTuning) tracecheck.ExploreConfig { + cfg := base.Clone() + if tuning.MaxDepth > 0 { + cfg.MaxDepth = tuning.MaxDepth + } + if len(tuning.PermuteControllers) > 0 { + if cfg.Perturbations.PermuteOrder == nil { + cfg.Perturbations.PermuteOrder = make(map[tracecheck.ReconcilerID]bool) + } + for _, controllerID := range tuning.PermuteControllers { + cfg.Perturbations.PermuteOrder[tracecheck.ReconcilerID(controllerID)] = true + } + } + if len(tuning.StaleReads) > 0 { + if cfg.Perturbations.Staleness == nil { + cfg.Perturbations.Staleness = make(map[tracecheck.ReconcilerID]tracecheck.StalenessConfig) + } + for controllerID, kinds := range tuning.StaleReads { + id := tracecheck.ReconcilerID(controllerID) + staleness := cfg.Perturbations.Staleness[id] + if 
staleness.StaleReadBounds == nil { + staleness.StaleReadBounds = make(tracecheck.LookbackLimits) + } + for _, kind := range kinds { + trimmed := strings.TrimSpace(kind) + if trimmed == "" { + continue + } + lookback := tuning.StaleLookback[trimmed] + if lookback <= 0 { + lookback = 1 + } + staleness.StaleReadBounds[trimmed] = tracecheck.LookbackLimit(lookback) + } + cfg.Perturbations.Staleness[id] = staleness + } + } + return cfg +} + +func buildUserActionsFromCoverageInput( + input coverage.Input, + seededObjects []ctrlclient.Object, +) ([]tracecheck.UserAction, error) { + actions := make([]tracecheck.UserAction, 0, len(input.UserInputs)) + for idx, action := range input.UserInputs { + if action.Object == nil { + return nil, fmt.Errorf("input user input %d has nil object", idx) + } + id := strings.TrimSpace(action.ID) + if id == "" { + id = fmt.Sprintf("user-input-%d", idx) + } + opType := action.Type + if opType == event.CREATE && isInputObjectSeeded(action.Object, seededObjects) { + opType = event.UPDATE + } + actions = append(actions, tracecheck.UserAction{ + ID: id, + OpType: opType, + Payload: action.Object.DeepCopy(), + }) + } + return actions, nil +} + +func isInputObjectSeeded(object ctrlclient.Object, seededObjects []ctrlclient.Object) bool { + if object == nil { + return false + } + for _, seeded := range seededObjects { + if sameObjectIdentity(seeded, object) { + return true + } + } + return false +} + +func sameObjectIdentity(a, b ctrlclient.Object) bool { + if a == nil || b == nil { + return false + } + aGVK := a.GetObjectKind().GroupVersionKind() + bGVK := b.GetObjectKind().GroupVersionKind() + if aGVK.Group != bGVK.Group || aGVK.Kind != bGVK.Kind { + return false + } + return a.GetNamespace() == b.GetNamespace() && a.GetName() == b.GetName() } diff --git a/examples/kratix/flows_test.go b/examples/kratix/flows_test.go new file mode 100644 index 0000000..5728bdc --- /dev/null +++ b/examples/kratix/flows_test.go @@ -0,0 +1,125 @@ +package main + 
+import ( + "testing" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" +) + +func TestControllerForObject_PromiseMapsToPromiseController(t *testing.T) { + obj := &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "platform.kratix.io/v1alpha1", + "kind": "Promise", + "metadata": map[string]any{ + "name": "p", + "namespace": "default", + }, + }, + } + got, ok := controllerForObject(obj) + if !ok { + t.Fatalf("controllerForObject() ok = false, want true") + } + if got != promiseControllerID { + t.Fatalf("controllerForObject() = %q, want %q", got, promiseControllerID) + } +} + +func TestInitialPendingReconciles_SeedsKnownKinds(t *testing.T) { + objects := []ctrlclient.Object{ + &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "platform.kratix.io/v1alpha1", + "kind": "Promise", + "metadata": map[string]any{ + "name": "p", + "namespace": "default", + }, + }, + }, + &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "platform.kratix.io/v1alpha1", + "kind": "Work", + "metadata": map[string]any{ + "name": "w", + "namespace": "default", + }, + }, + }, + &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": "example.promise.syntasso.io/v1", + "kind": "EasyApp", + "metadata": map[string]any{ + "name": "example", + "namespace": "default", + }, + }, + }, + } + + pending := initialPendingReconciles(objects) + if len(pending) != 2 { + t.Fatalf("len(initialPendingReconciles()) = %d, want 2", len(pending)) + } + if pending[0].ReconcilerID != promiseControllerID { + t.Fatalf("pending[0].ReconcilerID = %q, want PromiseController", pending[0].ReconcilerID) + } + if pending[1].ReconcilerID != workControllerID { + t.Fatalf("pending[1].ReconcilerID = %q, want WorkController", pending[1].ReconcilerID) + } +} + +func TestDynamicControllerSpecForPromise_WithAPI(t *testing.T) { + obj := &unstructured.Unstructured{ + Object: map[string]any{ + "apiVersion": 
"platform.kratix.io/v1alpha1", + "kind": "Promise", + "metadata": map[string]any{ + "name": "easyapp", + }, + "spec": map[string]any{ + "api": map[string]any{ + "apiVersion": "apiextensions.k8s.io/v1", + "kind": "CustomResourceDefinition", + "metadata": map[string]any{ + "name": "easyapps.example.promise.syntasso.io", + }, + "spec": map[string]any{ + "group": "example.promise.syntasso.io", + "names": map[string]any{ + "kind": "EasyApp", + "plural": "easyapps", + "singular": "easyapp", + }, + "scope": "Namespaced", + "versions": []any{ + map[string]any{ + "name": "v1", + "served": true, + "storage": true, + }, + }, + }, + }, + }, + }, + } + + spec, ok := dynamicControllerSpecForPromise(obj) + if !ok { + t.Fatalf("dynamicControllerSpecForPromise() ok = false, want true") + } + if spec.key == "" { + t.Fatalf("dynamicControllerSpecForPromise() returned empty key") + } + if spec.controllerID == "" { + t.Fatalf("dynamicControllerSpecForPromise() returned empty controller ID") + } + if spec.gvk == nil || spec.crd == nil || spec.placeholder == nil { + t.Fatalf("dynamicControllerSpecForPromise() missing required fields") + } +} diff --git a/examples/kratix/main.go b/examples/kratix/main.go index 9d358ca..972bb52 100644 --- a/examples/kratix/main.go +++ b/examples/kratix/main.go @@ -19,40 +19,35 @@ var flows = map[string]flowBuilder{ } func main() { - flowName := flag.String("flow", "works", "flow to run (works|promises)") + flowName := flag.String("flow", "works", "flow to run for non-input mode (works|promises)") flag.Parse() - builderFn, ok := flows[*flowName] - if !ok { - fmt.Fprintf(os.Stderr, "unknown flow %q (valid: works, promises)\n", *flowName) - os.Exit(2) - } - - builder, initialState, err := builderFn() - if err != nil { - fmt.Fprintf(os.Stderr, "build flow %s: %v\n", *flowName, err) - os.Exit(1) - } - ctx := context.Background() - - if cfgPath := explore.ConfigPath(); cfgPath != "" { - loadedCfg, err := explore.LoadExploreConfigFromFile(cfgPath, 
builder.Config()) - if err != nil { - fmt.Fprintf(os.Stderr, "load explore config: %v\n", err) - os.Exit(1) - } - builder.SetConfig(loadedCfg) - } if inputsPath := explore.InputsPath(); inputsPath != "" { - if explore.InteractiveEnabled() { - fmt.Fprintln(os.Stderr, "interactive ignored in batch mode") + if *flowName != "works" { + fmt.Fprintln(os.Stderr, "flow flag ignored in input mode; behavior is derived from inputs") } inputs, err := coverage.LoadInputs(inputsPath) if err != nil { fmt.Fprintf(os.Stderr, "load inputs: %v\n", err) os.Exit(1) } + builder, err := buildInputDrivenBuilder(inputs) + if err != nil { + fmt.Fprintf(os.Stderr, "build input-driven builder: %v\n", err) + os.Exit(1) + } + if cfgPath := explore.ConfigPath(); cfgPath != "" { + loadedCfg, err := explore.LoadExploreConfigFromFile(cfgPath, builder.Config()) + if err != nil { + fmt.Fprintf(os.Stderr, "load explore config: %v\n", err) + os.Exit(1) + } + builder.SetConfig(loadedCfg) + } + if explore.InteractiveEnabled() { + fmt.Fprintln(os.Stderr, "interactive ignored in batch mode") + } scenarios, err := scenariosFromInputs(builder, inputs) if err != nil { fmt.Fprintf(os.Stderr, "convert inputs: %v\n", err) @@ -71,6 +66,25 @@ func main() { return } + builderFn, ok := flows[*flowName] + if !ok { + fmt.Fprintf(os.Stderr, "unknown flow %q (valid: works, promises)\n", *flowName) + os.Exit(2) + } + builder, initialState, err := builderFn() + if err != nil { + fmt.Fprintf(os.Stderr, "build flow %s: %v\n", *flowName, err) + os.Exit(1) + } + if cfgPath := explore.ConfigPath(); cfgPath != "" { + loadedCfg, err := explore.LoadExploreConfigFromFile(cfgPath, builder.Config()) + if err != nil { + fmt.Fprintf(os.Stderr, "load explore config: %v\n", err) + os.Exit(1) + } + builder.SetConfig(loadedCfg) + } + runner, err := explore.NewRunner(builder) if err != nil { fmt.Fprintf(os.Stderr, "runner setup error: %v\n", err) diff --git a/examples/kratix/two-step-workflow.json b/examples/kratix/two-step-workflow.json 
new file mode 100644 index 0000000..c7d1df2 --- /dev/null +++ b/examples/kratix/two-step-workflow.json @@ -0,0 +1,119 @@ +[ + { + "name": "kratix-default/easyapp-create-then-update-image", + "environmentState": { + "objects": [ + { + "apiVersion": "platform.kratix.io/v1alpha1", + "kind": "Promise", + "metadata": { + "name": "easyapp", + "namespace": "default", + "labels": { + "kratix.io/promise-version": "v1" + } + }, + "spec": { + "api": { + "apiVersion": "apiextensions.k8s.io/v1", + "kind": "CustomResourceDefinition", + "metadata": { + "name": "easyapps.example.promise.syntasso.io" + }, + "spec": { + "group": "example.promise.syntasso.io", + "names": { + "kind": "EasyApp", + "plural": "easyapps", + "singular": "easyapp" + }, + "scope": "Namespaced", + "versions": [ + { + "name": "v1", + "served": true, + "storage": true, + "schema": { + "openAPIV3Schema": { + "type": "object", + "properties": { + "spec": { + "type": "object", + "required": [ + "name", + "image" + ], + "properties": { + "name": { + "type": "string" + }, + "image": { + "type": "string" + }, + "dbDriver": { + "type": "string", + "enum": [ + "postgres", + "none" + ] + }, + "containsCreditCardData": { + "type": "boolean" + } + } + } + } + } + } + } + ] + } + } + } + } + ] + }, + "tuning": { + "maxDepth": 0, + "permuteControllers": null, + "staleReads": null, + "staleLookback": null + }, + "userInputs": [ + { + "id": "create easyapp request", + "type": "CREATE", + "object": { + "apiVersion": "example.promise.syntasso.io/v1", + "kind": "EasyApp", + "metadata": { + "name": "example", + "namespace": "default" + }, + "spec": { + "name": "todo", + "image": "syntasso/sample-todo-app:v0.1.2", + "dbDriver": "postgres" + } + } + }, + { + "id": "update easyapp image", + "type": "UPDATE", + "object": { + "apiVersion": "example.promise.syntasso.io/v1", + "kind": "EasyApp", + "metadata": { + "name": "example", + "namespace": "default" + }, + "spec": { + "name": "todo", + "image": "syntasso/sample-todo-app:v0.1.3", 
+ "dbDriver": "postgres" + } + } + } + ] + } +] From f3484a7981509c78a25eda29c27c53c56c3e1644 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Wed, 25 Feb 2026 18:17:56 -0800 Subject: [PATCH 25/29] deprecate the --dump-stats flag --- README.md | 4 +- .../2026-01-27-parallel-scenarios-design.md | 3 +- .../2026-02-02-inputs-file-harness-design.md | 8 +- ...inputs-file-harness-implementation-plan.md | 4 +- examples/AGENTS.md | 2 +- examples/crossplane/README.md | 4 +- examples/crossplane/main.go | 2 +- examples/karpenter/README.md | 2 +- examples/karpenter/main.go | 2 +- examples/knative-serving/AGENTS.md | 2 +- examples/knative-serving/README.md | 4 +- examples/knative-serving/main.go | 2 +- examples/kratix/README.md | 5 +- examples/kratix/main.go | 2 +- pkg/analysis/types.go | 7 +- pkg/explore/flags.go | 8 +- pkg/explore/flags_test.go | 11 ++- pkg/explore/parallel_runner.go | 41 +-------- pkg/explore/parallel_runner_test.go | 91 ++++++++++++++----- pkg/explore/runner.go | 57 ++++-------- pkg/explore/runner_test.go | 66 ++++++++++++++ pkg/interactive/inspector_dump.go | 27 +++++- 22 files changed, 219 insertions(+), 135 deletions(-) diff --git a/README.md b/README.md index f6299b8..a1704ba 100644 --- a/README.md +++ b/README.md @@ -117,7 +117,7 @@ go run . } ``` - `Runner` honors the standard `-interactive`, `-perturb`, and `-dump-output` flags (see `pkg/explore/flags.go`) so you can disable the inspector, skip closed-loop analysis reruns (`--perturb=false`), or persist results when scripting. + `Runner` honors the standard `-interactive`, `-perturb`, `-dump-output`, and `-emit-stats` flags (see `pkg/explore/flags.go`) so you can disable the inspector, skip closed-loop analysis reruns (`--perturb=false`), or persist results when scripting. When `--emit-stats` is enabled, the dump written via `--dump-output` includes a top-level `stats` section. That’s enough to start evaluating how your controllers interact across different interleavings. 
@@ -157,7 +157,7 @@ if err := interactive.SaveInspectorDump(states, resolver, "inspector_dump.json") } ``` -Dump files can be reopened at any time via `go run ./cmd/kamera inspect exploration inspector_dump.json`, which restores the same UI. The inspector provides keyboard shortcuts (shown in the status bar) to switch between states, examine individual reconcile steps, and export dumps from within the UI. +Dump files can be reopened at any time via `go run ./cmd/kamera inspect exploration inspector_dump.json`, which restores the same UI. The inspector provides keyboard shortcuts (shown in the status bar) to switch between states, examine individual reconcile steps, and export dumps from within the UI. If the run used `--emit-stats`, that same dump file also carries top-level exploration stats. ### Unified CLI entrypoint diff --git a/docs/plans/2026-01-27-parallel-scenarios-design.md b/docs/plans/2026-01-27-parallel-scenarios-design.md index f5f635a..8661611 100644 --- a/docs/plans/2026-01-27-parallel-scenarios-design.md +++ b/docs/plans/2026-01-27-parallel-scenarios-design.md @@ -50,7 +50,6 @@ type ScenarioResult struct { type ParallelOptions struct { MaxParallel int DumpDir string - StatsDir string } type ParallelRunner struct { @@ -68,7 +67,7 @@ func (r *ParallelRunner) RunAll(ctx context.Context, scenarios []Scenario, opts 3. Apply the scenario config to the fork (clone first to avoid map sharing). 4. Build an explorer and run `Explore` on the scenario’s `InitialState`. 5. Evaluate invariant (if non-nil) against converged states; record first error. -6. Optionally dump results and stats under `DumpDir`/`StatsDir` with safe filenames. +6. Optionally dump results under `DumpDir` with safe filenames (stats are embedded in dump files when perf stats are enabled). 
## Builder Refactor: Explicit Isolation diff --git a/docs/plans/2026-02-02-inputs-file-harness-design.md b/docs/plans/2026-02-02-inputs-file-harness-design.md index 70329c6..a1ceb9c 100644 --- a/docs/plans/2026-02-02-inputs-file-harness-design.md +++ b/docs/plans/2026-02-02-inputs-file-harness-design.md @@ -16,13 +16,13 @@ Harnesses keep their current single-scenario behavior, but add a batch mode: - `--inputs `: JSON file containing an array of `coverage.Input`. - `--dump-output `: directory for per-scenario dumps when `--inputs` is set. -- `--dump-stats `: directory for per-scenario stats when `--inputs` is set. +- `--emit-stats`: includes top-level stats in each dump file. - `--interactive`: ignored/disabled in batch mode. Example: ``` -go run ./examples/knative-serving --inputs ./inputs.json --dump-output ./dumps --dump-stats ./stats +go run ./examples/knative-serving --inputs ./inputs.json --dump-output ./dumps --emit-stats ``` ## Data Flow @@ -40,14 +40,14 @@ of work and will be plugged into step 4 when available. ## Parallel Runner Behavior - Use `ParallelRunner` for all inputs, preserving input order in results. -- Treat `--dump-output` and `--dump-stats` as **directories** in batch mode. +- Treat `--dump-output` as a **directory** in batch mode. - Force `interactive=false` (or error if explicitly set) since parallel runs do not surface the inspector UI. ## Error Handling - If `--inputs` is set but the file cannot be read/decoded: return an error. - If the decoded inputs list is empty: return an error. -- If `--dump-output` or `--dump-stats` points to a file: return an error that +- If `--dump-output` points to a file: return an error that requests a directory path. 
## Testing diff --git a/docs/plans/2026-02-02-inputs-file-harness-implementation-plan.md b/docs/plans/2026-02-02-inputs-file-harness-implementation-plan.md index 5c161ed..537f36e 100644 --- a/docs/plans/2026-02-02-inputs-file-harness-implementation-plan.md +++ b/docs/plans/2026-02-02-inputs-file-harness-implementation-plan.md @@ -167,7 +167,7 @@ if inputsPath := explore.InputsPath(); inputsPath != "" { fmt.Fprintf(os.Stderr, "runner setup error: %v\n", err) os.Exit(1) } - opts := explore.ParallelOptions{DumpDir: explore.DumpPath(), StatsDir: explore.DumpStatsPath()} + opts := explore.ParallelOptions{DumpDir: explore.DumpPath()} if _, err := runner.RunAll(ctx, scenarios, opts); err != nil { fmt.Fprintf(os.Stderr, "batch run error: %v\n", err) os.Exit(1) @@ -281,7 +281,7 @@ Not applicable; doc-only change. **Step 2: Make the change** Add a short "Batch inputs" snippet showing `--inputs`, `--dump-output`, and -`--dump-stats` usage. +`--emit-stats` usage. **Step 3: Commit** diff --git a/examples/AGENTS.md b/examples/AGENTS.md index 7b93a27..629b795 100644 --- a/examples/AGENTS.md +++ b/examples/AGENTS.md @@ -8,7 +8,7 @@ This guide documents the common headless workflow for running Kamera examples an - `-log-level `: `debug`, `info`, `warn`, or `error` (default: `info`). - `-interactive `: launch the inspector TUI (`true` by default). Set `-interactive=false` for headless runs. - `-dump-output `: write converged + aborted states to a dump file (works with `-interactive=false`). -- `-emit-stats`: record and print reconcile performance stats at the end. +- `-emit-stats`: record and print reconcile performance stats at the end, and embed them under `stats` in the `-dump-output` file. ## Determinize + cache notes (when needed) Some examples depend on upstream code that introduces nondeterministic values (timestamps, IDs). 
To reduce noise: diff --git a/examples/crossplane/README.md b/examples/crossplane/README.md index 1e73482..5a9457e 100644 --- a/examples/crossplane/README.md +++ b/examples/crossplane/README.md @@ -28,11 +28,11 @@ go run . \ ## Batch inputs -To run a generated inputs file, pass `--inputs` and set dump directories for per-scenario output: +To run a generated inputs file, pass `--inputs` and set a dump directory for per-scenario output: ```bash go run . \ --inputs /path/to/inputs.json \ --dump-output /tmp/crossplane-dumps \ - --dump-stats /tmp/crossplane-stats + --emit-stats ``` diff --git a/examples/crossplane/main.go b/examples/crossplane/main.go index 298fc77..87a647d 100644 --- a/examples/crossplane/main.go +++ b/examples/crossplane/main.go @@ -43,7 +43,7 @@ func main() { fmt.Fprintf(os.Stderr, "runner setup error: %v\n", err) os.Exit(1) } - opts := explore.ParallelOptions{DumpDir: explore.DumpPath(), StatsDir: explore.DumpStatsPath()} + opts := explore.ParallelOptions{DumpDir: explore.DumpPath()} if _, err := runner.RunAll(ctx, scenarios, opts); err != nil { fmt.Fprintf(os.Stderr, "batch run error: %v\n", err) os.Exit(1) diff --git a/examples/karpenter/README.md b/examples/karpenter/README.md index 17eb5e1..f2a60f2 100644 --- a/examples/karpenter/README.md +++ b/examples/karpenter/README.md @@ -27,7 +27,7 @@ go run . \ --fuzz-cases 12 \ --fuzz-seed 1337 \ --dump-output /tmp/karpenter-dumps \ - --dump-stats /tmp/karpenter-stats + --emit-stats ``` - `--fuzz-cases` controls how many sampled parameterized variants are generated per input. 
diff --git a/examples/karpenter/main.go b/examples/karpenter/main.go index 43581cd..0b9445d 100644 --- a/examples/karpenter/main.go +++ b/examples/karpenter/main.go @@ -58,7 +58,7 @@ func main() { fmt.Fprintf(os.Stderr, "runner setup error: %v\n", err) os.Exit(1) } - opts := explore.ParallelOptions{DumpDir: explore.DumpPath(), StatsDir: explore.DumpStatsPath()} + opts := explore.ParallelOptions{DumpDir: explore.DumpPath()} if _, err := runner.RunAll(ctx, scenarios, opts); err != nil { fmt.Fprintf(os.Stderr, "batch run error: %v\n", err) os.Exit(1) diff --git a/examples/knative-serving/AGENTS.md b/examples/knative-serving/AGENTS.md index 62f7acf..0b7071f 100644 --- a/examples/knative-serving/AGENTS.md +++ b/examples/knative-serving/AGENTS.md @@ -9,7 +9,7 @@ This example drives Kamera’s `Explorer` against a minimal Knative Serving setu - `-interactive `: launch the TUI inspector (`true` by default). Set `-interactive=false` for headless runs. - `-perturb `: controls batch input pipeline shape. `true` (default) runs reference+rerun phases for scenarios that define closed-loop plans; `false` forces a single pass. - `-dump-output `: write converged + aborted states to a file (works even when `-interactive=false`). -- `-emit-stats`: record and print reconcile performance stats at the end. +- `-emit-stats`: record and print reconcile performance stats at the end, and embed them under `stats` in the `-dump-output` file. ## preparing knative code for deterministic simulation testing Kamera's `Explorer` simulation searches the reconciliation execution space and identifies possible converged states. Because Knative code modifies Knative resources with nondeterministic values such as timestamps, these timestamp values can cause two semantically equivalent states to appear distinct, which is no good for our deterministic simulation strategy. So, we fix this by preprocessing Knative code + dependencies to be deterministic. 
diff --git a/examples/knative-serving/README.md b/examples/knative-serving/README.md index ac7ac5d..81c50bd 100644 --- a/examples/knative-serving/README.md +++ b/examples/knative-serving/README.md @@ -28,7 +28,7 @@ Behavior: go run . \ --parallel \ --dump-output /tmp/knative-dumps \ - --dump-stats /tmp/knative-stats + --emit-stats ``` To run a generated inputs file, pass `--inputs` and set dump directories for per-scenario output: @@ -40,5 +40,5 @@ go run . \ --depth 100 \ --timeout 60s \ --perturb=false \ - --dump-stats /tmp/knative-stats + --emit-stats ``` diff --git a/examples/knative-serving/main.go b/examples/knative-serving/main.go index e678082..3ea7991 100644 --- a/examples/knative-serving/main.go +++ b/examples/knative-serving/main.go @@ -173,7 +173,7 @@ func main() { fmt.Fprintf(os.Stderr, "runner setup error: %v\n", err) os.Exit(1) } - opts := explore.ParallelOptions{DumpDir: explore.DumpPath(), StatsDir: explore.DumpStatsPath()} + opts := explore.ParallelOptions{DumpDir: explore.DumpPath()} if explore.PerturbEnabled() { fmt.Fprintln(os.Stderr, "closed-loop scaffold: running per-input reference->rerun pipelines") diff --git a/examples/kratix/README.md b/examples/kratix/README.md index d21c3fa..27771fa 100644 --- a/examples/kratix/README.md +++ b/examples/kratix/README.md @@ -50,14 +50,13 @@ go run . -flow=works -interactive=false ## Batch inputs -To run a generated inputs file, pass `--inputs` and set dump directories for per-scenario output: +To run a generated inputs file, pass `--inputs` and set a dump directory for per-scenario output: ```bash go run . 
\ - -flow=works \ --inputs /path/to/inputs.json \ --dump-output /tmp/kratix-dumps \ - --dump-stats /tmp/kratix-stats + --emit-stats ``` ## Flows diff --git a/examples/kratix/main.go b/examples/kratix/main.go index 972bb52..b4485c6 100644 --- a/examples/kratix/main.go +++ b/examples/kratix/main.go @@ -58,7 +58,7 @@ func main() { fmt.Fprintf(os.Stderr, "runner setup error: %v\n", err) os.Exit(1) } - opts := explore.ParallelOptions{DumpDir: explore.DumpPath(), StatsDir: explore.DumpStatsPath()} + opts := explore.ParallelOptions{DumpDir: explore.DumpPath()} if _, err := runner.RunAll(ctx, scenarios, opts); err != nil { fmt.Fprintf(os.Stderr, "batch run error: %v\n", err) os.Exit(1) diff --git a/pkg/analysis/types.go b/pkg/analysis/types.go index 4adde73..383fa51 100644 --- a/pkg/analysis/types.go +++ b/pkg/analysis/types.go @@ -9,9 +9,10 @@ import ( // Dump represents the top-level structure of a kamera dump file. // It contains all objects referenced by hash and the result states from exploration. type Dump struct { - Context *DumpContext `json:"context,omitempty"` - Objects []DumpObject `json:"objects"` - States []DumpResultState `json:"states"` + Context *DumpContext `json:"context,omitempty"` + Stats *tracecheck.ExploreStats `json:"stats,omitempty"` + Objects []DumpObject `json:"objects"` + States []DumpResultState `json:"states"` } // DumpContext carries optional metadata describing how the dump was produced. 
diff --git a/pkg/explore/flags.go b/pkg/explore/flags.go index e0b6573..5041559 100644 --- a/pkg/explore/flags.go +++ b/pkg/explore/flags.go @@ -4,9 +4,8 @@ import "flag" var ( interactiveFlag = flag.Bool("interactive", true, "launch interactive trace inspector") - dumpPathFlag = flag.String("dump-output", "", "optional path to write exploration results (converged + aborted) to disk") + dumpPathFlag = flag.String("dump-output", "", "optional path to write exploration dump to disk (states, plus stats when --emit-stats is enabled)") configPathFlag = flag.String("explore-config", "", "optional JSON file to configure exploration") - dumpStatsPath = flag.String("dump-stats", "", "optional path to write exploration stats (JSON)") inputsPathFlag = flag.String("inputs", "", `path to input JSON file`) perturbFlag = flag.Bool("perturb", true, "enable closed-loop rerun pipeline for batch inputs when supported by scenario generation") parallelProcessesFlag = flag.Bool("parallel-processes", false, "run batch mode using process-isolated child executions") @@ -32,11 +31,6 @@ func ConfigPath() string { return *configPathFlag } -// DumpStatsPath returns the parsed path for dumping exploration stats. -func DumpStatsPath() string { - return *dumpStatsPath -} - // InputsPath returns the parsed path to an optional inputs file. 
func InputsPath() string { return *inputsPathFlag diff --git a/pkg/explore/flags_test.go b/pkg/explore/flags_test.go index 2dc6acf..92c849c 100644 --- a/pkg/explore/flags_test.go +++ b/pkg/explore/flags_test.go @@ -1,6 +1,9 @@ package explore -import "testing" +import ( + "flag" + "testing" +) func TestInputsPathDefault(t *testing.T) { if InputsPath() != "" { @@ -25,3 +28,9 @@ func TestParallelChildIndexDefault(t *testing.T) { t.Fatalf("expected parallel-child-index default -1, got %d", ParallelChildIndex()) } } + +func TestDumpStatsFlagRemoved(t *testing.T) { + if got := flag.CommandLine.Lookup("dump-stats"); got != nil { + t.Fatalf("expected dump-stats flag to be removed") + } +} diff --git a/pkg/explore/parallel_runner.go b/pkg/explore/parallel_runner.go index a1bc97b..2d9875b 100644 --- a/pkg/explore/parallel_runner.go +++ b/pkg/explore/parallel_runner.go @@ -3,7 +3,6 @@ package explore import ( "bytes" "context" - "encoding/json" "fmt" "os" "os/exec" @@ -23,7 +22,6 @@ import ( type ParallelOptions struct { MaxParallel int DumpDir string - StatsDir string } type childProcessRequest struct { @@ -463,13 +461,6 @@ func (r *ParallelRunner) runScenarioPhase( artifactName = fmt.Sprintf("%s/%s", scenario.Name, phaseLabel) } - if opts.StatsDir != "" { - if err := writeScenarioStats(phase.Stats, opts.StatsDir, artifactName, idx); err != nil { - phase.Err = err - return phase - } - } - if opts.DumpDir != "" && res != nil { states := append([]tracecheck.ResultState{}, res.ConvergedStates...) states = append(states, res.AbortedStates...) 
@@ -483,6 +474,10 @@ func (r *ParallelRunner) runScenarioPhase( if len(attrs) == 0 { attrs = nil } + var dumpStats *tracecheck.ExploreStats + if cfg.RecordPerfStats { + dumpStats = phase.Stats + } dumpContext := &interactive.InspectorDumpContext{ ScenarioName: scenario.Name, ScenarioRunIndex: &runIdx, @@ -490,7 +485,7 @@ func (r *ParallelRunner) runScenarioPhase( InputRef: phaseCtx.InputRef, Attributes: attrs, } - if err := interactive.SaveInspectorDumpWithContext(states, phase.VersionManager, path, dumpContext); err != nil { + if err := interactive.SaveInspectorDumpWithContextAndStats(states, phase.VersionManager, path, dumpContext, dumpStats); err != nil { phase.Err = fmt.Errorf("dump scenario %s (%s): %w", scenario.Name, phaseLabel, err) return phase } @@ -507,11 +502,6 @@ func ensureParallelOutputDirs(opts ParallelOptions) error { return fmt.Errorf("create dump dir: %w", err) } } - if opts.StatsDir != "" { - if err := os.MkdirAll(opts.StatsDir, 0o755); err != nil { - return fmt.Errorf("create stats dir: %w", err) - } - } return nil } @@ -789,32 +779,11 @@ func joinIntList(values []int) string { return strings.Join(parts, ",") } -func writeScenarioStats(stats *tracecheck.ExploreStats, dir string, name string, idx int) error { - if stats == nil { - return nil - } - stats.Finish() - data, err := json.MarshalIndent(stats, "", " ") - if err != nil { - return fmt.Errorf("marshal explore stats: %w", err) - } - path := scenarioStatsPath(dir, name, idx) - if err := os.WriteFile(path, data, 0o644); err != nil { - return fmt.Errorf("write stats to %s: %w", path, err) - } - return nil -} - func scenarioDumpPath(dir, name string, idx int) string { base := scenarioFileBase(name, idx) return filepath.Join(dir, base+".jsonl") } -func scenarioStatsPath(dir, name string, idx int) string { - base := scenarioFileBase(name, idx) - return filepath.Join(dir, base+".json") -} - func scenarioFileBase(name string, idx int) string { base := sanitizeScenarioName(name) if idx >= 0 { diff 
--git a/pkg/explore/parallel_runner_test.go b/pkg/explore/parallel_runner_test.go index cdba091..5b352ac 100644 --- a/pkg/explore/parallel_runner_test.go +++ b/pkg/explore/parallel_runner_test.go @@ -59,14 +59,14 @@ func TestParallelRunnerDoesNotLeakConfig(t *testing.T) { scenarios := []Scenario{ { - Name: "max-depth-low", + Name: "max-depth-low", EnvironmentState: state.Clone(), - Config: tracecheck.ExploreConfig{MaxDepth: 1}, + Config: tracecheck.ExploreConfig{MaxDepth: 1}, }, { - Name: "max-depth-normal", + Name: "max-depth-normal", EnvironmentState: state.Clone(), - Config: tracecheck.ExploreConfig{MaxDepth: 5}, + Config: tracecheck.ExploreConfig{MaxDepth: 5}, }, } @@ -107,9 +107,9 @@ func TestParallelRunnerWritesDump(t *testing.T) { scenarios := []Scenario{ { - Name: "Foo Scenario", + Name: "Foo Scenario", EnvironmentState: state.Clone(), - Config: tracecheck.ExploreConfig{MaxDepth: 5}, + Config: tracecheck.ExploreConfig{MaxDepth: 5}, Context: ScenarioContext{ Workflow: "smoke-workflow", InputRef: "inputs.json#foo-scenario", @@ -171,6 +171,51 @@ func TestParallelRunnerWritesDump(t *testing.T) { } } +func TestParallelRunnerEmbedsStatsInDumpWhenPerfStatsEnabled(t *testing.T) { + ctx := context.Background() + builder, state := newTestBuilder(t) + + runner, err := NewParallelRunner(builder) + if err != nil { + t.Fatalf("new runner: %v", err) + } + + dumpDir := t.TempDir() + + scenarios := []Scenario{ + { + Name: "with-perf-stats", + EnvironmentState: state.Clone(), + Config: tracecheck.ExploreConfig{ + MaxDepth: 5, + RecordPerfStats: true, + }, + }, + } + + results, err := runner.RunAll(ctx, scenarios, ParallelOptions{DumpDir: dumpDir}) + if err != nil { + t.Fatalf("run all: %v", err) + } + if len(results) != 1 { + t.Fatalf("expected 1 result, got %d", len(results)) + } + if results[0].DumpPath == "" { + t.Fatalf("expected dump path") + } + + dump, err := analysis.LoadDump(results[0].DumpPath) + if err != nil { + t.Fatalf("load dump: %v", err) + } + if dump.Stats 
== nil { + t.Fatalf("expected stats embedded in dump") + } + if dump.Stats.TotalNodeVisits == 0 { + t.Fatalf("expected embedded stats to include node visits") + } +} + func TestParallelRunnerCapturesInvariantError(t *testing.T) { ctx := context.Background() builder, state := newTestBuilder(t) @@ -182,9 +227,9 @@ func TestParallelRunnerCapturesInvariantError(t *testing.T) { scenarios := []Scenario{ { - Name: "invariant-fails", + Name: "invariant-fails", EnvironmentState: state.Clone(), - Config: tracecheck.ExploreConfig{MaxDepth: 5}, + Config: tracecheck.ExploreConfig{MaxDepth: 5}, Invariant: func(tracecheck.StateNode) error { return errors.New("invariant failed") }, @@ -215,9 +260,9 @@ func TestParallelRunnerClosedLoopRunsReferenceThenRerunPerScenario(t *testing.T) plannerCalls := 0 scenarios := []Scenario{ { - Name: "closed-loop", + Name: "closed-loop", EnvironmentState: state.Clone(), - Config: tracecheck.ExploreConfig{MaxDepth: 1}, + Config: tracecheck.ExploreConfig{MaxDepth: 1}, ClosedLoop: &ClosedLoopSpec{ Plan: func(reference ScenarioPhaseResult) ([]ScenarioPhasePlan, error) { plannerCalls++ @@ -273,7 +318,7 @@ func TestParallelRunnerClosedLoopDisablesRerunWhenPerturbDisabled(t *testing.T) scenarios := []Scenario{ { Name: "closed-loop-disabled", - EnvironmentState: state.Clone(), + EnvironmentState: state.Clone(), Config: tracecheck.ExploreConfig{MaxDepth: 1}, ClosedLoop: &ClosedLoopSpec{ Plan: func(reference ScenarioPhaseResult) ([]ScenarioPhasePlan, error) { @@ -326,9 +371,9 @@ func TestParallelRunnerClosedLoopWritesPhaseDumps(t *testing.T) { dumpDir := t.TempDir() scenarios := []Scenario{ { - Name: "closed-loop-dump", + Name: "closed-loop-dump", EnvironmentState: state.Clone(), - Config: tracecheck.ExploreConfig{MaxDepth: 1}, + Config: tracecheck.ExploreConfig{MaxDepth: 1}, ClosedLoop: &ClosedLoopSpec{ Plan: func(reference ScenarioPhaseResult) ([]ScenarioPhasePlan, error) { return []ScenarioPhasePlan{ @@ -371,9 +416,9 @@ func 
TestParallelRunnerClosedLoopPrefixHistoryHashesAreDumpable(t *testing.T) { dumpDir := t.TempDir() scenarios := []Scenario{ { - Name: "closed-loop-prefix-hashes", + Name: "closed-loop-prefix-hashes", EnvironmentState: state.Clone(), - Config: tracecheck.ExploreConfig{MaxDepth: 5}, + Config: tracecheck.ExploreConfig{MaxDepth: 5}, ClosedLoop: &ClosedLoopSpec{ Plan: func(reference ScenarioPhaseResult) ([]ScenarioPhasePlan, error) { if reference.Result == nil || len(reference.Result.ConvergedStates) == 0 { @@ -471,9 +516,9 @@ func TestParallelRunnerProcessModeRequiresInputsFile(t *testing.T) { scenarios := []Scenario{ { - Name: "x", + Name: "x", EnvironmentState: state.Clone(), - Config: tracecheck.ExploreConfig{MaxDepth: 3}, + Config: tracecheck.ExploreConfig{MaxDepth: 3}, }, } @@ -496,19 +541,19 @@ func TestParallelRunnerChildModeFailsWhenSelectedInputMapsToMultipleScenarios(t scenarios := []Scenario{ { - Name: "alpha/base", + Name: "alpha/base", EnvironmentState: state.Clone(), - Config: tracecheck.ExploreConfig{MaxDepth: 3}, + Config: tracecheck.ExploreConfig{MaxDepth: 3}, }, { - Name: "alpha/single/foo", + Name: "alpha/single/foo", EnvironmentState: state.Clone(), - Config: tracecheck.ExploreConfig{MaxDepth: 3}, + Config: tracecheck.ExploreConfig{MaxDepth: 3}, }, { - Name: "beta", + Name: "beta", EnvironmentState: state.Clone(), - Config: tracecheck.ExploreConfig{MaxDepth: 3}, + Config: tracecheck.ExploreConfig{MaxDepth: 3}, }, } diff --git a/pkg/explore/runner.go b/pkg/explore/runner.go index c3f6ac7..00cf502 100644 --- a/pkg/explore/runner.go +++ b/pkg/explore/runner.go @@ -2,36 +2,14 @@ package explore import ( "context" - "encoding/json" "fmt" "os" - "path/filepath" - "strings" "github.com/tgoodwin/kamera/pkg/interactive" "github.com/tgoodwin/kamera/pkg/tracecheck" "golang.org/x/exp/slices" ) -func dumpStatsIfRequested(stats *tracecheck.ExploreStats, runIdx int) error { - if DumpStatsPath() == "" || stats == nil { - return nil - } - stats.Finish() - - data, err 
:= json.MarshalIndent(stats, "", " ") - if err != nil { - return fmt.Errorf("marshal explore stats: %w", err) - } - - target := withRunSuffix(DumpStatsPath(), runIdx) - if err := os.WriteFile(target, data, 0o644); err != nil { - return fmt.Errorf("write stats to %s: %w", target, err) - } - fmt.Printf("wrote stats to %s\n", target) - return nil -} - func standaloneDumpContext(runIdx int) *interactive.InspectorDumpContext { index := runIdx attributes := map[string]string{} @@ -51,13 +29,11 @@ func standaloneDumpContext(runIdx int) *interactive.InspectorDumpContext { } } -func withRunSuffix(base string, runIdx int) string { - if runIdx == 0 { - return base +func statsForDump(stats *tracecheck.ExploreStats, cfg tracecheck.ExploreConfig) *tracecheck.ExploreStats { + if stats == nil || !cfg.RecordPerfStats { + return nil } - ext := filepath.Ext(base) - prefix := strings.TrimSuffix(base, ext) - return fmt.Sprintf("%s.run%d%s", prefix, runIdx, ext) + return stats } // Runner coordinates exploration runs and the inspector UI, including restart requests. @@ -72,7 +48,7 @@ type Runner struct { // UserActions represent declarative workload actions applied at boundaries during exploration. type RunInput struct { EnvironmentState tracecheck.StateNode - UserActions []tracecheck.UserAction + UserActions []tracecheck.UserAction } // NewRunner constructs a Runner from a configured ExplorerBuilder. @@ -142,10 +118,6 @@ func (r *Runner) Run(ctx context.Context, input RunInput) error { } runIdx := 0 - if err := dumpStatsIfRequested(stats, 0); err != nil { - return err - } - states := append([]tracecheck.ResultState{}, res.ConvergedStates...) states = append(states, res.AbortedStates...) 
if len(states) == 0 { @@ -154,7 +126,13 @@ func (r *Runner) Run(ctx context.Context, input RunInput) error { } if DumpPath() != "" { - if err := interactive.SaveInspectorDumpWithContext(states, resolver, DumpPath(), standaloneDumpContext(runIdx)); err != nil { + if err := interactive.SaveInspectorDumpWithContextAndStats( + states, + resolver, + DumpPath(), + standaloneDumpContext(runIdx), + statsForDump(stats, currentConfig), + ); err != nil { return fmt.Errorf("failed to dump results to %s: %w", DumpPath(), err) } fmt.Printf("wrote results to %s\n", DumpPath()) @@ -200,9 +178,6 @@ func (r *Runner) Run(ctx context.Context, input RunInput) error { if nextResolver != nil { resolver = nextResolver } - if err := dumpStatsIfRequested(nextStats, runIdx); err != nil { - return err - } newStates := append([]tracecheck.ResultState{}, nextRes.ConvergedStates...) newStates = append(newStates, nextRes.AbortedStates...) if restart.PreserveHistory { @@ -211,7 +186,13 @@ func (r *Runner) Run(ctx context.Context, input RunInput) error { states = newStates } if DumpPath() != "" { - if err := interactive.SaveInspectorDumpWithContext(states, resolver, DumpPath(), standaloneDumpContext(runIdx)); err != nil { + if err := interactive.SaveInspectorDumpWithContextAndStats( + states, + resolver, + DumpPath(), + standaloneDumpContext(runIdx), + statsForDump(nextStats, currentConfig), + ); err != nil { return fmt.Errorf("failed to dump results to %s: %w", DumpPath(), err) } fmt.Printf("wrote results to %s\n", DumpPath()) diff --git a/pkg/explore/runner_test.go b/pkg/explore/runner_test.go index 21112c0..9187f20 100644 --- a/pkg/explore/runner_test.go +++ b/pkg/explore/runner_test.go @@ -59,3 +59,69 @@ func TestRunnerWritesDumpContext(t *testing.T) { t.Fatalf("expected explore config attribute in dump context") } } + +func TestRunnerEmbedsStatsInDumpWhenPerfStatsEnabled(t *testing.T) { + oldInteractive := *interactiveFlag + oldDumpPath := *dumpPathFlag + t.Cleanup(func() { + *interactiveFlag 
= oldInteractive + *dumpPathFlag = oldDumpPath + }) + + *interactiveFlag = false + dumpPath := filepath.Join(t.TempDir(), "runner-dump.jsonl") + *dumpPathFlag = dumpPath + + builder, state := newTestBuilder(t) + builder.WithPerfStats() + runner, err := NewRunner(builder) + if err != nil { + t.Fatalf("new runner: %v", err) + } + + if err := runner.Run(context.Background(), RunInput{EnvironmentState: state}); err != nil { + t.Fatalf("run runner: %v", err) + } + + dump, err := analysis.LoadDump(dumpPath) + if err != nil { + t.Fatalf("load dump: %v", err) + } + if dump.Stats == nil { + t.Fatalf("expected stats embedded in dump") + } + if dump.Stats.TotalNodeVisits == 0 { + t.Fatalf("expected embedded stats to include node visits") + } +} + +func TestRunnerOmitsStatsInDumpWhenPerfStatsDisabled(t *testing.T) { + oldInteractive := *interactiveFlag + oldDumpPath := *dumpPathFlag + t.Cleanup(func() { + *interactiveFlag = oldInteractive + *dumpPathFlag = oldDumpPath + }) + + *interactiveFlag = false + dumpPath := filepath.Join(t.TempDir(), "runner-dump.jsonl") + *dumpPathFlag = dumpPath + + builder, state := newTestBuilder(t) + runner, err := NewRunner(builder) + if err != nil { + t.Fatalf("new runner: %v", err) + } + + if err := runner.Run(context.Background(), RunInput{EnvironmentState: state}); err != nil { + t.Fatalf("run runner: %v", err) + } + + dump, err := analysis.LoadDump(dumpPath) + if err != nil { + t.Fatalf("load dump: %v", err) + } + if dump.Stats != nil { + t.Fatalf("expected stats to be omitted when perf stats are disabled") + } +} diff --git a/pkg/interactive/inspector_dump.go b/pkg/interactive/inspector_dump.go index 4d7ffce..08fdb6a 100644 --- a/pkg/interactive/inspector_dump.go +++ b/pkg/interactive/inspector_dump.go @@ -34,7 +34,19 @@ func SaveInspectorDump(states []tracecheck.ResultState, resolver tracecheck.Vers // SaveInspectorDumpWithContext serializes inspector states and includes optional scenario metadata. 
func SaveInspectorDumpWithContext(states []tracecheck.ResultState, resolver tracecheck.VersionManager, path string, ctx *InspectorDumpContext) error { - dump, err := buildInspectorDump(states, resolver, ctx) + return SaveInspectorDumpWithContextAndStats(states, resolver, path, ctx, nil) +} + +// SaveInspectorDumpWithContextAndStats serializes inspector states, optional scenario metadata, +// and optional exploration stats. +func SaveInspectorDumpWithContextAndStats( + states []tracecheck.ResultState, + resolver tracecheck.VersionManager, + path string, + ctx *InspectorDumpContext, + stats *tracecheck.ExploreStats, +) error { + dump, err := buildInspectorDump(states, resolver, ctx, stats) if err != nil { return err } @@ -60,9 +72,17 @@ func LoadInspectorDump(path string) ([]tracecheck.ResultState, tracecheck.Versio return dumpToResultStates(dump) } -func buildInspectorDump(states []tracecheck.ResultState, resolver tracecheck.VersionManager, context *InspectorDumpContext) (*analysis.Dump, error) { +func buildInspectorDump( + states []tracecheck.ResultState, + resolver tracecheck.VersionManager, + context *InspectorDumpContext, + stats *tracecheck.ExploreStats, +) (*analysis.Dump, error) { if len(states) == 0 { - return &analysis.Dump{Context: buildAnalysisDumpContext(context)}, nil + return &analysis.Dump{ + Context: buildAnalysisDumpContext(context), + Stats: stats, + }, nil } if resolver == nil { @@ -152,6 +172,7 @@ func buildInspectorDump(states []tracecheck.ResultState, resolver tracecheck.Ver return &analysis.Dump{ Context: buildAnalysisDumpContext(context), + Stats: stats, Objects: objects, States: resultStates, }, nil From cde3af1a8b3841ef2e1aa881569f6d6a01261dee Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Wed, 25 Feb 2026 18:27:23 -0800 Subject: [PATCH 26/29] Check in beads state --- .beads/issues.jsonl | 1 + 1 file changed, 1 insertion(+) diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index e7b856a..fbedc91 100644 --- a/.beads/issues.jsonl +++ 
b/.beads/issues.jsonl @@ -14,6 +14,7 @@ {"id":"kamera-4sr","title":"Find RevisionReconciler source in gomodcache","description":"Locate Knative serving RevisionReconciler code in ~/tmp/gomodcache to understand Active condition logic","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-13T21:30:33.710132-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-13T21:36:51.310185-08:00","closed_at":"2026-01-13T21:36:51.310185-08:00","close_reason":"Found RevisionReconciler source in knative.dev/serving@v0.46.5/pkg/reconciler/revision/. Key files: revision.go (main reconciler), reconcile_resources.go (reconcileDeployment, reconcilePA), revision_lifecycle.go (condition management). Active condition is set by PropagateAutoscalerStatus in reconcilePA."} {"id":"kamera-53o","title":"Add Watch for EndpointsController to trigger on Pod changes","description":"Root cause of Endpoints divergence: EndpointsController is not triggered when Pods change (e.g., become Ready). In real K8s, EndpointsController uses Watches(\u0026Pod{}, mapPodToServices). In kamera, WithResourceDepGK(Pod, EndpointsController) only declares deps for stuck detection, NOT triggering. 
Fix: Add .Watches('Pod', mapPodToServices) to EndpointsController registration in explorebuilder.go where mapPodToServices enqueues all Services in the Pod's namespace.","status":"open","priority":0,"issue_type":"bug","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T12:34:36.185422-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T12:34:36.185422-08:00"} {"id":"kamera-5h9","title":"remove --dump flag from inspect command, as its always required","status":"closed","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T16:36:39.809524-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-17T19:34:42.015994-08:00","closed_at":"2026-02-17T19:34:42.015994-08:00","close_reason":"Closed"} +{"id":"kamera-5j8","title":"karpenter in-memory state findings and simulation next steps","description":"## Context\nWe observed a regression where a Pod + NodePool example used to converge in ~31 steps, but now converges in ~10 when running:\n\n```bash\ngo run . --inputs inputs.json --depth 50\n```\n\nThis note summarizes what Karpenter source code reveals about why this can happen, and why it is not just an \"input pending reconciles\" problem.\n\n## Key Finding\nKarpenter controllers are not purely stateless functions over Kubernetes object snapshots.\nThey rely on controller-local in-memory state that must be hydrated by reconcile activity.\n\nIn particular, `state.Cluster` stores a large amount of derived runtime state and is consulted by provisioning and other controllers.\n\n## Evidence in Karpenter Source\n### 1. `state.Cluster` is a mutable, process-local cache\n`pkg/controllers/state/cluster.go` defines `Cluster` with in-memory maps such as:\n- `nodes`\n- `bindings`\n- `nodeNameToProviderID`\n- `nodeClaimNameToProviderID`\n- `nodePoolResources`\n- `NodePoolState`\n\nThis is created in-process by `NewCluster(...)` and not reconstructed from a single object read at use time.\n\n### 2. 
Provisioning is gated on internal state sync\nIn `pkg/controllers/provisioning/provisioner.go`, `Provisioner.Reconcile()` checks:\n- `if !p.cluster.Synced(ctx) { requeue }`\n\n`Cluster.Synced(ctx)` (in `cluster.go`) validates internal state against API-server lists before allowing scheduling to proceed.\n\nSo even if input objects are present, the provisioner can intentionally delay until internal state is hydrated.\n\n### 3. \"state.*\" informer controllers are state-shaping controllers\nKarpenter registers dedicated state informers:\n- `state.pod`\n- `state.node`\n- `state.nodeclaim`\n- `state.nodepool`\n\n(see `pkg/controllers/controllers.go`)\n\nThese controllers update in-memory state via methods like:\n- `cluster.UpdatePod(...)`\n- `cluster.UpdateNode(...)`\n- `cluster.UpdateNodeClaim(...)`\n- `cluster.MarkUnconsolidated()`\n\nNot all of these produce Kubernetes object writes, but they still change behavior of subsequent reconciles.\n\n### 4. `state.nodepool` has behavioral effect even without object writes\n`pkg/controllers/state/informer/nodepool.go` mainly calls:\n- `cluster.MarkUnconsolidated()`\n\nThis does not patch objects, but it mutates internal consolidation state used by disruption/provisioning logic.\n\n### 5. 
Additional in-memory state exists beyond `Cluster`\nKarpenter also keeps `nodepoolhealth.State` as an in-process rolling health tracker (`pkg/state/nodepoolhealth/tracker.go`).\n`nodepool.registrationhealth` and `nodeclaim.lifecycle` use this state to evaluate/update `NodeRegistrationHealthy` behavior.\n\nThis is another example of behavior depending on process-local memory, not just object snapshots.\n\n## Why 31 Steps Can Collapse to ~10\nA shorter run can occur when startup/materialization semantics are under-modeled:\n- Pod creation may trigger provisioner-related controllers.\n- But if startup-enqueued state informers (or equivalent dependency-triggered reconciles) do not run sufficiently, internal caches/history are not hydrated.\n- Provisioning and related controllers then take early-exit/requeue paths, producing a much shorter explored trace.\n\nSo the regression is consistent with missing semantic unfolding of controller-local state materialization, not merely a wrong user input.\n\n## Implication for Kamera Model\nThe important modeling distinction is:\n- Not: users should specify pending reconciles in initial input state.\n- Instead: simulation semantics should compute reconcile queues from actions/subscriptions and include startup/watch hydration behavior so controller-local state is materialized as in real controller-runtime systems.\n\nPractical harness interpretation:\n1. Treat CREATE/UPDATE/DELETE actions as queue seeds via watch subscriptions.\n2. Include initial informer/bootstrap enqueue semantics for already-existing objects.\n3. Allow non-object-writing reconciles to matter when they mutate modeled controller-local state.\n4. 
Keep initial input focused on object state; derive pending reconciles through simulation semantics.\n\n## CREATE-Event Modeling Workaround\n### Idea\nAs a workaround, model existing startup objects as synthetic CREATE events so watchers enqueue reconciles without users specifying pending reconciles.\n\n### Where it can help\n- It can bootstrap `state.*` informer controllers from `environmentState`.\n- It can often recover the longer \"materialization first, then provisioning\" trace shape.\n- It keeps user-facing inputs simple (objects only, no explicit pending queue).\n\n### Suggested variant\nUse two phases:\n1. Startup bootstrap phase: for each object in initial state, synthesize a CREATE event; route through normal subscription/event-filter logic; execute resulting queue to fixed point (or bounded depth).\n2. User action phase: apply the user action (for example, Pod CREATE) and continue normal event-driven simulation.\n\n### Constraints and caveats\nCREATE-only modeling is useful but not equivalent to full controller-runtime semantics:\n- Some controllers depend on UPDATE semantics (generation/status transitions, predicate behavior).\n- DELETE-triggered maintenance is not represented by CREATE-only bootstrap.\n- Time/requeue-driven behavior is separate from watch-event behavior.\n- Synthetic CREATE for pre-existing objects may not mirror real startup ordering/interleavings.\n- It may over-approximate or under-approximate paths depending on predicates.\n\n### Practical recommendation\nTreat startup synthetic CREATE as a configurable bootstrap policy, not the entire simulation model. 
It is a bridge away from `initialDependentControllers`, but long-term correctness still needs UPDATE/DELETE/requeue semantics and controller-local state transitions.\n\n## Bottom Line\n`initialDependentControllers` worked as a tactical patch because it approximated missing startup/materialization behavior.\n\nA stronger direction is to:\n- derive pending reconciles via simulation semantics,\n- include startup bootstrap behavior, and\n- model relevant controller-local state transitions.\n\nA synthetic CREATE bootstrap mode can be an effective near-term workaround while moving toward that full model.\n\n## Action Items / Next Steps\n1. Add a configurable startup bootstrap policy that can synthesize CREATE events for all pre-existing objects in `environmentState`.\n2. Route bootstrap-generated events through the same watcher/subscription + predicate pipeline used for runtime events.\n3. Add deterministic tests for Karpenter-like flows that verify expected path length/shape (materialization-first vs early requeue).\n4. Extend semantics beyond CREATE: explicitly model UPDATE/DELETE-triggered enqueues and verify controller predicate handling.\n5. Model non-object-writing reconciles as state-affecting transitions when they mutate controller-local state.\n6. Audit and reduce reliance on manual `initialDependentControllers` in favor of derived queue semantics.\n7. Document known caveats of synthetic CREATE bootstrap and define acceptance criteria for “full semantics parity” milestones.\n","status":"open","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-25T18:21:20.855725-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-25T18:21:20.855725-08:00","labels":["findings","karpenter","simulation"]} {"id":"kamera-636","title":"Explore field-level property queries across state space","description":"## Problem\n\nCurrent analysis operates at object-version granularity (hashes). 
Sometimes you want to query field-level properties across the explored state space:\n\n- \"Find all states where Revision R1's Ready condition is True\"\n- \"When did this annotation first appear?\"\n- \"Which paths have this label set to X vs Y?\"\n\nThis is hard because:\n1. Objects are stored by hash, not by field index\n2. Field-level diffing is expensive\n3. Queries need to traverse the entire state DAG\n\n## Exploration Areas\n\n### Indexing Strategies\n- Pre-compute field indexes during dump generation?\n- Lazy indexing on first query?\n- Which fields are worth indexing? (status conditions, labels, annotations)\n\n### Query Language\n- JSONPath-style queries? `$.status.conditions[?(@.type=='Ready')].status`\n- Predicate functions in Go?\n- SQL-like DSL?\n\n### Efficient Representation\n- Store normalized/flattened field paths alongside hashes?\n- Delta-encode field changes between versions?\n- Bloom filters for \"does this state contain field X with value Y\"?\n\n## Use Cases\n\n1. Condition propagation analysis: \"trace how Ready=True propagates through the object graph\"\n2. Annotation/label tracking: \"when did this finalizer get added?\"\n3. Spec drift detection: \"did the spec change after initial creation?\"\n\n## Notes\n\nThis is exploratory - don't over-engineer. 
Start with concrete use cases and see what's actually needed.","status":"open","priority":4,"issue_type":"feature","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T17:02:21.48698-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T17:02:21.48698-08:00"} {"id":"kamera-6xp","title":"Create pending queue analyzer for dump files","description":"Build a tool that analyzes the pending reconcile queue dynamics:\n- Show when each reconciler enters/exits the pending queue\n- Compare queue state across paths at equivalent points\n- Identify reconcilers that are triggered differently between paths\n- Highlight timing differences (e.g., 'EC in queue when Pod Ready' vs not)\n\nThis helps diagnose ordering-dependent divergences.","status":"open","priority":3,"issue_type":"feature","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T12:27:34.838358-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T12:27:34.838358-08:00"} {"id":"kamera-7tl","title":"Backward-trace divergence analysis framework","description":"A composable framework for analyzing divergence in kamera dump files, using backward-trace methodology.\n\n## Design Philosophy\n\nStart from known divergence (final state differences) and trace backwards to root cause. 
This avoids noise from 'diamond' patterns where paths diverge then reconverge.\n\n## Architecture\n\nThree composable modules in pkg/analysis/:\n\n**Module 0: Converged State Diff** (kamera-pwm)\n- Input: dump.jsonl\n- Output: Objects that differ between converged states\n- Answers: 'In what ways do the final states differ?'\n\n**Module 1: Last Write Analysis** (new task)\n- Input: dump.jsonl + differing object keys\n- Output: For each object+path, the step that produced its final value + reconciler's input state\n- Answers: 'What did the reconciler see when it wrote this value?'\n\n**Module 2: Object Lifecycle Analysis** (kamera-gwc)\n- Input: dump.jsonl + path + object + target hash\n- Output: All steps where object had that hash\n- Answers: 'Does this object take on this value at some point in this path?'\n\n## Composition Flow\n\n1. Run Module 0 to identify differing objects\n2. Run Module 1 on each differing object to find last writes and input diffs\n3. If inputs differ, run Module 2 to check 'does the missing state appear later?'\n4. 
If yes, points to watch/trigger issue; if no, points to state dependency\n\n## Package Structure\n\npkg/analysis/\n├── types.go # Shared types (moved from pkg/interactive)\n├── dump.go # Dump loading utilities \n├── diff.go # Module 0\n├── lastwrite.go # Module 1\n├── lifecycle.go # Module 2\n\n## Validation\n\nUse trial-1 dump as test fixture - should identify EndpointsController/Pod.Ready issue.","status":"open","priority":1,"issue_type":"epic","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T14:06:33.625361-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T14:06:33.625361-08:00"} From 2c1e15473d5ba50611277e75d41936a7afa79122 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Thu, 26 Feb 2026 06:33:18 -0800 Subject: [PATCH 27/29] fix kamera-2o2 --- .beads/issues.jsonl | 5 +++-- pkg/tracecheck/explore.go | 20 +++++++++++++++++++ pkg/tracecheck/explore_stack_test.go | 29 ++++++++++++++++++++++++++++ 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index fbedc91..1889646 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -6,10 +6,10 @@ {"id":"kamera-189","title":"Move dump types from pkg/interactive to pkg/analysis","description":"**Prerequisite for backward-trace framework (kamera-7tl)**\n\nMove dump-related types from pkg/interactive to pkg/analysis:\n\n## Types to Move\n- DumpOutput\n- ConvergedStateDump \n- PathStep\n- ObjectVersion\n- Related helper types\n\n## Rationale\npkg/interactive should be purely about powering the TUI. The dump types are analysis data structures that belong in pkg/analysis.\n\n## Changes Required\n1. Create pkg/analysis/types.go with moved types\n2. Create pkg/analysis/dump.go with dump loading utilities\n3. Update pkg/interactive to import from pkg/analysis\n4. 
Update any other code that imports these types","status":"closed","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T14:08:11.201209-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T15:58:30.284307-08:00","closed_at":"2026-01-14T15:58:30.284307-08:00","close_reason":"Implemented in commits 6fae4a5 through a5a4a0a"} {"id":"kamera-1r1","title":"Implement checkpoint scanner","description":"Implement a checkpoint scanner that takes a baseline trace plus []PerturbationProfile and emits checkpoint candidates (checkpoint state + profile + context).\n\nScanner behavior:\n- Ordering profiles: identify states where profiled controllers are simultaneously pending (or otherwise form a meaningful reorder fork).\n- Staleness profiles: identify steps where profiled reads can be served from concrete prior versions observed in the same baseline trace.\n- A single profile may produce multiple checkpoints per trace.\n\nRequirements:\n- Consume the trace contract formalized in kamera-fv4.\n- Make predicate logic explicit and testable for edge cases (no-op steps, repeated mutations, irrelevant profiles).\n","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-11T22:06:59.145429-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-12T14:19:49.260661-08:00","dependencies":[{"issue_id":"kamera-1r1","depends_on_id":"kamera-kse","type":"blocks","created_at":"2026-02-11T22:07:30.15809-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-1r1","depends_on_id":"kamera-fv4","type":"blocks","created_at":"2026-02-11T22:07:30.170662-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-1r1","depends_on_id":"kamera-yqn","type":"parent-child","created_at":"2026-02-11T22:08:33.726299-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-25g","title":"closed-loop: fork reference version store into restart/rerun phases","description":"## Context\nClosed-loop restart/rerun execution hit a state-store 
mismatch when history prefixing was enabled.\n\nObserved behavior before the fix:\n- Reference run completed and wrote a dump.\n- Rerun often failed to write a dump (especially in `--parallel-processes` mode) with:\n `unable to resolve object for hash ...`\n\nRepro command used:\n`go run . --inputs /tmp/knative-one.json --dump-output /tmp/knative-dumps --depth 100 --timeout 60s --dump-stats /tmp/knative-stats`\n\nWhat was happening:\n- Rerun phases used `HistoryPrefix` from reference execution.\n- Prefix steps referenced historical object hashes that existed in the reference run's version store.\n- The rerun phase store initially contained only seeded checkpoint/current state, not full prefix history hashes.\n- Dump writing resolves hashes for full path history and failed on missing prefix hashes.\n\nCurrent stopgap behavior:\n- We added priming logic that materializes referenced prefix hashes into rerun phase stores so dumps succeed.\n- This fixes the immediate failure but preserves an indirect coupling path.\n\n## Proposed Fix (Option 1)\nAdopt an explicit \"store fork\" model for restart/rerun phases:\n- Build rerun phases from a cloned/forked snapshot of the reference run version store.\n- Treat restart runs as a child continuation of the reference store lineage, instead of reconstructing needed hashes ad hoc.\n- Keep `RestartSeed`/`HistoryPrefix` semantics unchanged, but make backing object resolution guaranteed by construction.\n\nWhy this is cleaner:\n- Removes implicit dependency on post-hoc hash priming.\n- Makes state lineage explicit and easier to reason about.\n- Reduces risk of future missing-hash failures when prefix behavior evolves.\n\n## Scope\n- Introduce a first-class store fork/clone handoff at reference -\u003e rerun boundary.\n- Use forked store for all prefixed restart phases.\n- Retain existing closed-loop v0 behavior and APIs.\n- Keep output artifacts unchanged (reference/rerun dumps + stats).\n\n## Acceptance Criteria\n- Repro command 
above consistently writes both `reference` and `rerun` dumps.\n- `--parallel-processes` mode also writes both dumps without hash resolution failures.\n- `RestartSeed` + `HistoryPrefix` path exercises continue to work with no priming-specific code path required.\n- Regression test(s) cover prefixed rerun dumpability under forked-store design.\n- No external CLI/API changes required for users.\n\n## Notes\nThis task is architectural cleanup/simplification after validating closed-loop v0 behavior.\n","status":"open","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-20T09:40:35.605268-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-20T09:40:35.605268-08:00","labels":["analysis","closed-loop","restart","trace-store"]} -{"id":"kamera-2o2","title":"Investigate subtree completion soundness regression in Crossplane example","description":"## Problem\nThe Crossplane example currently behaves differently based on optimization settings: it only produces useful exploration dumps when `subtreeCompletion` is disabled. This implies a soundness issue in subtree completion / diamond pruning behavior, because example correctness should not depend on this optimization being on/off.\n\n## Context From Current Investigation\n- Running the Crossplane example with default optimizations produced only user-action steps and no controller-progress dumps.\n- In this mode, phase dump writing saw no converged/aborted states (`res.ConvergedStates + res.AbortedStates == 0`).\n- Stats observed in this mode included `SubtreeDiamondSkips: 1`, `AbortedPaths: 0`, and visited depths only `0..3`.\n- When run with explore config:\n ```json\n {\n \"optimizations\": {\n \"subtreeCompletion\": false\n }\n }\n ```\n dumps appeared reliably and exploration progressed as expected.\n\n## Why This Is P0\nOptimization toggles should not change soundness. 
If subtree completion can suppress terminal-state recording or truncate exploration in valid runs, results are not trustworthy.\n\n## Scope\nUse the Crossplane example as the primary reproducer to deeply understand root cause in subtree completion logic, state dedupe/diamond detection, and terminal-state accounting.\n\n## Repro Starting Point\n- Example: `examples/crossplane`\n- Compare behavior with subtree completion enabled vs disabled via explore config.\n- Inspect `pkg/explore/parallel_runner.go` and tracecheck optimization paths tied to subtree completion/diamond skipping.\n\n## Acceptance Criteria\n- Root cause identified and documented (not just symptom).\n- Reproducer captured in issue notes with exact command/config.\n- Clear fix plan (or fix) that preserves soundness while retaining optimization benefits where valid.","status":"open","priority":0,"issue_type":"bug","owner":"timg.goodwin@gmail.com","created_at":"2026-02-25T17:43:53.172152-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-25T17:43:53.172152-08:00","labels":["crossplane","subtree-completion","tracecheck"]} +{"id":"kamera-2o2","title":"Investigate subtree completion soundness regression in Crossplane example","description":"## Problem\nThe Crossplane example currently behaves differently based on optimization settings: it only produces useful exploration dumps when `subtreeCompletion` is disabled. 
This implies a soundness issue in subtree completion / diamond pruning behavior, because example correctness should not depend on this optimization being on/off.\n\n## Context From Current Investigation\n- Running the Crossplane example with default optimizations produced only user-action steps and no controller-progress dumps.\n- In this mode, phase dump writing saw no converged/aborted states (`res.ConvergedStates + res.AbortedStates == 0`).\n- Stats observed in this mode included `SubtreeDiamondSkips: 1`, `AbortedPaths: 0`, and visited depths only `0..3`.\n- When run with explore config:\n ```json\n {\n \"optimizations\": {\n \"subtreeCompletion\": false\n }\n }\n ```\n dumps appeared reliably and exploration progressed as expected.\n\n## Why This Is P0\nOptimization toggles should not change soundness. If subtree completion can suppress terminal-state recording or truncate exploration in valid runs, results are not trustworthy.\n\n## Scope\nUse the Crossplane example as the primary reproducer to deeply understand root cause in subtree completion logic, state dedupe/diamond detection, and terminal-state accounting.\n\n## Repro Starting Point\n- Example: `examples/crossplane`\n- Compare behavior with subtree completion enabled vs disabled via explore config.\n- Inspect `pkg/explore/parallel_runner.go` and tracecheck optimization paths tied to subtree completion/diamond skipping.\n\n## Acceptance Criteria\n- Root cause identified and documented (not just symptom).\n- Reproducer captured in issue notes with exact command/config.\n- Clear fix plan (or fix) that preserves soundness while retaining optimization benefits where valid.","status":"open","priority":0,"issue_type":"bug","owner":"timg.goodwin@gmail.com","created_at":"2026-02-25T17:43:53.172152-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-25T17:43:53.172152-08:00","labels":["crossplane","subtree-completion","tracecheck"],"comments":[{"id":16,"issue_id":"kamera-2o2","author":"Tim 
Goodwin","text":"Root cause identified and validated.\n\nReproducer (before fix):\n1) Default optimizations (subtreeCompletion=true by default)\n go run . --inputs ./two-step-workflow.json -interactive=false -emit-stats --dump-output /tmp/crossplane-dumps-default\n Result: no files written under /tmp/crossplane-dumps-default\n\n2) With subtree completion disabled\n cat \u003e /tmp/crossplane-subtree-off.json \u003c\u003cJSON\n {\"optimizations\":{\"subtreeCompletion\":false}}\n JSON\n go run . --inputs ./two-step-workflow.json --explore-config /tmp/crossplane-subtree-off.json -interactive=false -emit-stats --dump-output /tmp/crossplane-dumps-subtree-off\n Result: dumps written for both scenarios.\n\nObserved behavior in default mode:\n- SubtreeDiamondSkips increments.\n- For single-input XR scenario, logical states repeat with identical pending reconcile while depth increases.\n- Branch is dropped with no converged/aborted terminal state emitted.\n\nRoot cause:\nsubtree completion classified all in-progress logical-key revisits as \"diamond convergence\" and skipped them. For cycle back-edges (state revisits same logical key through ancestry), this is unsound: the branch is still active and should continue until it terminates (e.g. max-depth abort). Skipping it silently removes terminal accounting and suppresses dump output in phase runner (which writes only when converged+aborted \u003e 0).\n\nFix:\nIn enqueueWithMarker, distinguish true diamonds from back-edge cycles:\n- if logical key is in progress and appears in ancestry =\u003e enqueue state(s) without adding a new marker (continue exploration)\n- else =\u003e preserve existing diamond skip behavior.\n\nValidation after fix:\n- go test ./pkg/tracecheck -run 'TestEnqueueStates|TestEnqueueStaleViews' (pass)\n- go test ./pkg/explore (pass)\n- go test ./... (examples/crossplane) (pass)\n- go run . 
--inputs ./two-step-workflow.json -interactive=false -emit-stats --dump-output /tmp/crossplane-dumps-default-fixed\n Result: dumps now written for both scenarios under default config.\n\nFiles changed:\n- pkg/tracecheck/explore.go\n- pkg/tracecheck/explore_stack_test.go\n","created_at":"2026-02-26T14:07:53Z"}]} {"id":"kamera-2wc","title":"Translate hotspot instances into scenario seeds","description":"Define rules to map hotspot instances (controllers/resources) into initial StateNode objects and ExploreOptions (permutation/stale scopes), including GVK-\u003eobject templates.","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-27T17:16:24.806448-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-27T17:16:24.806448-08:00"} {"id":"kamera-369","title":"DAG divergence analysis should consider pending reconcilers, not just ContentsHash","description":"Current LCA-based divergence detection uses ContentsHash which only considers objects. But paths can 'fan in' to same ContentsHash with different pending reconcilers, masking the true divergence point. 
Trial-1 shows: at 20nh30fc, State 0 has no EndpointsController pending while State 1 does - this is the real cause of divergence, not the ordering at 20nh30fc.","status":"open","priority":1,"issue_type":"bug","owner":"timg.goodwin@gmail.com","created_at":"2026-01-13T22:33:13.902039-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-13T22:33:13.902039-08:00","dependencies":[{"issue_id":"kamera-369","depends_on_id":"kamera-xw8","type":"blocks","created_at":"2026-01-13T22:43:57.7137-08:00","created_by":"Tim Goodwin"}],"comments":[{"id":1,"issue_id":"kamera-369","author":"Tim Goodwin","text":"Analysis of trial-1 shows the issue clearly:\n\n**Paths reach same ContentsHash (20nh30fc) with different pending lists:**\n- State 0: EndpointsController ran at step 33, not re-enqueued by step 39\n- State 1: EndpointsController ran at 30-31, but ServerlessServiceReconciler at step 33 re-enqueued it\n\n**Root cause pattern:**\n1. Different execution order causes different 'trigger' events\n2. Same objects reached, but different pending reconcilers\n3. DAG collapses these as same node (same ContentsHash)\n4. Actual divergence is masked - appears as ordering choice but is actually about what's pending\n\n**Implication:**\nThe divergence point in ContentsHash-based DAG is misleading. True divergence happens earlier when execution order affected which reconcilers got enqueued. Need full StateHash (objects + pending) for accurate divergence detection.","created_at":"2026-01-14T06:34:10Z"},{"id":2,"issue_id":"kamera-369","author":"Tim Goodwin","text":"This issue (pending-aware divergence) is related to but distinct from kamera-xw8 (field-level tracing). 
\n\nkamera-xw8 may be the better approach - rather than trying to build a perfect DAG that captures 'true' divergence, trace backwards from the actual differences in terminal states to find when they became inevitable.","created_at":"2026-01-14T06:43:57Z"}]} -{"id":"kamera-3d7","title":"Investigate Crossplane non-idempotency causing unbounded path growth","description":"## Problem\nCrossplane exploration appears non-idempotent and does not converge under current input/setup, with paths growing until depth cap abort.\n\n## Context From Current Investigation\n- At depth 20, observed path length 21 (aborted at cap).\n- At depth 100, observed path length 101.\n- At depth 500, observed path length 501.\n- This strongly suggests repeated non-idempotent reconcile behavior (or equivalent no-op writes triggering further work).\n\n## Requested Investigation\nPerform focused analysis at depth 50 to keep artifacts manageable while preserving signal.\n\n## Required Work\n1. Inspect Crossplane example inputs (externalized workflow JSON) to identify resource fields/status that may drive repeated updates.\n2. Inspect Crossplane source/controller behavior to determine which reconcile path is repeatedly mutating state or enqueueing work.\n3. 
Correlate dump steps with controller actions and object deltas to identify the specific non-idempotent write pattern.\n\n## Why This Is P0\nIf controller activity is effectively non-idempotent in exploration, coverage/path metrics and convergence-based reasoning are unreliable.\n\n## Acceptance Criteria\n- Concrete root-cause hypothesis tied to specific controller/object field(s).\n- Evidence from depth-50 run (step/delta references).\n- Recommended mitigation path (example/input adjustment, model correction, or controller-behavior handling in harness).","status":"open","priority":0,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-25T17:44:01.07775-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-25T17:44:01.07775-08:00","labels":["crossplane","nonidempotency","tracecheck"]} +{"id":"kamera-3d7","title":"Investigate Crossplane non-idempotency causing unbounded path growth","description":"## Problem\nCrossplane exploration appears non-idempotent and does not converge under current input/setup, with paths growing until depth cap abort.\n\n## Context From Current Investigation\n- At depth 20, observed path length 21 (aborted at cap).\n- At depth 100, observed path length 101.\n- At depth 500, observed path length 501.\n- This strongly suggests repeated non-idempotent reconcile behavior (or equivalent no-op writes triggering further work).\n\n## Requested Investigation\nPerform focused analysis at depth 50 to keep artifacts manageable while preserving signal.\n\n## Required Work\n1. Inspect Crossplane example inputs (externalized workflow JSON) to identify resource fields/status that may drive repeated updates.\n2. Inspect Crossplane source/controller behavior to determine which reconcile path is repeatedly mutating state or enqueueing work.\n3. 
Correlate dump steps with controller actions and object deltas to identify the specific non-idempotent write pattern.\n\n## Why This Is P0\nIf controller activity is effectively non-idempotent in exploration, coverage/path metrics and convergence-based reasoning are unreliable.\n\n## Acceptance Criteria\n- Concrete root-cause hypothesis tied to specific controller/object field(s).\n- Evidence from depth-50 run (step/delta references).\n- Recommended mitigation path (example/input adjustment, model correction, or controller-behavior handling in harness).","status":"open","priority":0,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-25T17:44:01.07775-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-25T17:44:01.07775-08:00","labels":["crossplane","nonidempotency","tracecheck"],"comments":[{"id":15,"issue_id":"kamera-3d7","author":"Tim Goodwin","text":"Reproduced and analyzed with depth-50 artifact.\n\nRepro command (from /Users/tgoodwin/projects/kamera/examples/crossplane):\nGOCACHE=~/tmp/gocache GOMODCACHE=~/tmp/gomodcache go run . \\\n --inputs two-step-workflow.json \\\n --parallel-processes --parallel-child-index=1 \\\n --perturb=false \\\n --explore-config /tmp/crossplane-no-subtree.json \\\n -depth 50 -timeout 0 -interactive=false \\\n --dump-output /tmp/crossplane-child-dump \\\n --log-level info --emit-stats\n\nWhere /tmp/crossplane-no-subtree.json is:\n{\n \"optimizations\": {\n \"subtreeCompletion\": false\n }\n}\n\nObserved:\n- Path length = 51 at depth=50 (aborted at cap).\n- Dump: /tmp/crossplane-child-dump/crossplane_default_composition_create_then_update_1.jsonl\n- Stats in dump: TotalNodeVisits=51, UniqueNodeVisits=6, AbortedPaths=1, NoOpReconciles=25.\n\nStep-level evidence from dump path[0]:\n- Step 3 (CompositionRevisionReconciler): UPDATE CompositionRevision/widget-composition-rollout-f4c54c1 from hash 33ab5e6a... 
-\u003e 12572c04..., delta adds status.conditions (Synced + ValidPipeline).\n- Step 5 (same reconciler): UPDATE same object, but before hash == after hash == 12572c04..., delta is empty string.\n- Steps 7..49 repeat the same no-delta UPDATE to the same hash.\n- Pending queue alternates forever:\n - after rev step: [CompositionReconciler(widget-composition-rollout), CompositionRevisionReconciler(widget-composition-rollout-f4c54c1)]\n - after composition step: [CompositionRevisionReconciler(widget-composition-rollout-f4c54c1)]\n\nRoot-cause hypothesis:\n1) Crossplane revision reconciler always calls status update on success:\n - ~/tmp/gomodcache/github.com/crossplane/crossplane/v2@v2.1.0/internal/controller/apiextensions/revision/reconciler.go\n - success path marks ValidPipeline then unconditionally executes r.client.Status().Update(ctx, rev).\n2) Kamera replay client records Status().Update as UPDATE effect every time:\n - /Users/tgoodwin/projects/kamera/pkg/replay/client.go (subResourceClient.Update -\u003e handleEffect(..., event.UPDATE, ...)).\n3) TriggerManager enqueues from every UPDATE effect without checking content delta:\n - /Users/tgoodwin/projects/kamera/pkg/tracecheck/trigger.go (getTriggered iterates effects and enqueues primary/watch reconcilers).\n\nSo after the first real status write, repeated no-content status updates are still emitted as UPDATE effects and keep re-enqueueing both reconcilers, causing unbounded non-convergent growth when subtree-completion optimization is disabled.\n\nRecommended mitigation path:\n- Preferred (harness/model correction): suppress state-change effects when UPDATE/PATCH/APPLY publishes the same object hash as pre-state. 
This prevents re-enqueue on no-content writes.\n- Alternative: in trigger stage, ignore UPDATE effects whose key+version hash did not change from stateBefore.\n- Secondary guard: optionally treat status-only no-delta updates as observational (non-triggering) in simulation mode.\n\nNote: with subtreeCompletion enabled (default), this loop is masked by diamond/subtree skipping, but underlying no-delta UPDATE emission remains and still blocks progression to later user actions in this scenario.\n","created_at":"2026-02-26T13:55:52Z"}]} {"id":"kamera-3hr","title":"Restructure CompositeKey/dump serialization to preserve Kind/Group without post-processing","description":"## Problem\n\nCompositeKey embeds both IdentityKey and ResourceKey, which both have Kind and Group fields. Go's encoding/json silently drops duplicate field names when serializing embedded structs, so only ObjectID, Namespace, and Name were being serialized.\n\nWe added workarounds:\n1. Custom MarshalJSON/UnmarshalJSON with explicit field names\n2. Backwards-compatible legacy format parsing \n3. Post-load enrichment from object data (extracting Kind from apiVersion/kind fields)\n\nThis is fragile and complex.\n\n## Proposed Solutions\n\nConsider restructuring upstream of dump serialization:\n\n1. **Don't embed structs in CompositeKey** - use named fields instead:\n ```go\n type CompositeKey struct {\n Identity IdentityKey\n Resource ResourceKey\n }\n ```\n This would require updating all call sites that access `key.Kind` to `key.Resource.Kind`.\n\n2. **Add JSON tags to embedded structs** - prefix field names to avoid collision:\n ```go\n type IdentityKey struct {\n Group string `json:\"identityGroup\"`\n Kind string `json:\"identityKind\"`\n ObjectID string `json:\"objectId\"`\n }\n ```\n But this changes the JSON format everywhere IdentityKey is used.\n\n3. **Consolidate Kind/Group into one location** - maybe CompositeKey shouldn't have Kind in both embedded structs. 
Review why both IdentityKey and ResourceKey need Kind.\n\n## Acceptance Criteria\n\n- [ ] CompositeKey serializes/deserializes correctly without custom Marshal/Unmarshal methods\n- [ ] No post-load enrichment needed\n- [ ] Backwards compatibility with existing dump files (migration path)","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T16:58:25.962755-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T16:58:25.962755-08:00"} {"id":"kamera-4sr","title":"Find RevisionReconciler source in gomodcache","description":"Locate Knative serving RevisionReconciler code in ~/tmp/gomodcache to understand Active condition logic","status":"closed","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-01-13T21:30:33.710132-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-13T21:36:51.310185-08:00","closed_at":"2026-01-13T21:36:51.310185-08:00","close_reason":"Found RevisionReconciler source in knative.dev/serving@v0.46.5/pkg/reconciler/revision/. Key files: revision.go (main reconciler), reconcile_resources.go (reconcileDeployment, reconcilePA), revision_lifecycle.go (condition management). Active condition is set by PropagateAutoscalerStatus in reconcilePA."} {"id":"kamera-53o","title":"Add Watch for EndpointsController to trigger on Pod changes","description":"Root cause of Endpoints divergence: EndpointsController is not triggered when Pods change (e.g., become Ready). In real K8s, EndpointsController uses Watches(\u0026Pod{}, mapPodToServices). In kamera, WithResourceDepGK(Pod, EndpointsController) only declares deps for stuck detection, NOT triggering. 
Fix: Add .Watches('Pod', mapPodToServices) to EndpointsController registration in explorebuilder.go where mapPodToServices enqueues all Services in the Pod's namespace.","status":"open","priority":0,"issue_type":"bug","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T12:34:36.185422-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T12:34:36.185422-08:00"} @@ -78,4 +78,5 @@ {"id":"kamera-yqn.3","title":"Unify restart semantics across interactive and parallel runners","description":"Generalize restart execution so both single-run and parallel-run code paths share the same restart contract and behavior. Specifically ensure RestartRequest semantics (seed + config + optional execution-history prefix/preserve-history intent) are supported consistently in both interactive subtree restarts and non-interactive/parallel branching. Deliverables: shared restart application helper/API, parity tests, and explicit documentation of what history/prefix is preserved in dumps/results.","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-12T14:18:24.53152-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-12T14:18:24.53152-08:00","dependencies":[{"issue_id":"kamera-yqn.3","depends_on_id":"kamera-yqn","type":"parent-child","created_at":"2026-02-12T14:18:24.532845-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-yqn.4","title":"Refresh coverage-strategy-v2 design doc against current simulator capabilities","description":"Revise docs/design/coverage-strategy-v2.md to explicitly account for the current state of Kamera: existing trace richness (stateBefore/stateAfter/pending/effects), restart primitives, and determinization/normalization already implemented in simulator tooling. 
Clarify what is truly missing vs already available, and update open questions/assumptions accordingly so implementation tasks target real gaps instead of redoing completed infrastructure.","status":"open","priority":2,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-12T14:18:42.931604-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-12T14:18:42.931604-08:00","dependencies":[{"issue_id":"kamera-yqn.4","depends_on_id":"kamera-yqn","type":"parent-child","created_at":"2026-02-12T14:18:42.932846-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-yqn.4","depends_on_id":"kamera-fv4","type":"blocks","created_at":"2026-02-12T14:22:01.107846-08:00","created_by":"Tim Goodwin"},{"issue_id":"kamera-yqn.4","depends_on_id":"kamera-yqn.1","type":"blocks","created_at":"2026-02-12T14:22:11.268119-08:00","created_by":"Tim Goodwin"}]} {"id":"kamera-z38","title":"Investigate informer cache timing in RevisionReconciler","description":"The RevisionReconciler's reconcilePA phase uses deploymentLister to fetch the deployment created by reconcileDeployment. Sometimes the lister doesn't see the newly created deployment, causing reconcilePA to return early without creating the PA. 
This results in nondeterministic exploration where some paths create PA and some don't.\n","status":"closed","priority":1,"issue_type":"bug","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T11:42:19.395997-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-25T17:43:47.720764-08:00","closed_at":"2026-02-25T17:43:47.720764-08:00","close_reason":"Closed"} +{"id":"kamera-zgf","title":"audit harness structure across knative/karpenter/kratix/crossplane","description":"## Goal\nAudit harness structure across `examples/knative`, `examples/karpenter`, `examples/kratix`, and `examples/crossplane` to assess structural similarities and identify where behavior is primarily driven by Kamera semantics + external inputs versus custom per-harness code.\n\n## Why\nWe need a clear map of where divergence in behavior is coming from:\n- Common Kamera runtime semantics\n- Input/state configuration differences\n- Custom harness-specific adapter logic (\"harness hacking\")\n\n## Key Questions\n1. What are the shared harness components and extension points across all four examples?\n2. Which behaviors are controlled by Kamera framework semantics (eventing, queueing, simulation loops, etc.)?\n3. Which behaviors are controlled by external inputs (`inputs.json`, scenario setup, config toggles)?\n4. Which behaviors depend on custom harness code and project-specific glue?\n5. Where are customizations unavoidable versus accidental/legacy?\n6. What standardization opportunities exist to reduce harness-specific drift?\n\n## Scope\n- Compare structure and execution flow for:\n - `examples/knative`\n - `examples/karpenter`\n - `examples/kratix`\n - `examples/crossplane`\n- Identify and classify per-harness logic into:\n - framework-driven\n - input-driven\n - harness-custom\n- Produce recommendations to reduce custom logic where feasible.\n\n## Deliverables\n1. 
A comparison matrix of the four harnesses:\n - bootstrapping\n - controller wiring\n - watch/event mapping\n - input loading\n - scenario/action injection\n - custom adapters/shims\n2. A behavior-attribution breakdown:\n - what is driven by Kamera core\n - what is driven by input data\n - what is driven by harness-specific code\n3. A prioritized list of refactor candidates to increase shared structure.\n4. Proposed acceptance checks to ensure future harnesses stay within the common pattern.\n\n## Action Items / Next Steps\n1. Inventory harness entrypoints and wiring paths for all four examples.\n2. Build a side-by-side structural diff focused on lifecycle stages (init, bootstrap, event flow, reconcile scheduling, termination).\n3. Trace 1-2 representative behavior differences per harness back to their root driver (core semantics vs inputs vs custom code).\n4. Identify duplicated custom logic and propose extraction points into shared utilities or framework interfaces.\n5. Define a minimal \"standard harness contract\" and list current gaps per example.\n6. Recommend short-term cleanup tasks and longer-term architectural changes.\n7. 
Capture findings in a single report with explicit evidence links to files/functions.\n\n## Acceptance Criteria\n- All four harnesses are analyzed with consistent comparison dimensions.\n- Every major behavior class has explicit attribution (Kamera, input, or harness custom code).\n- Report includes at least 5 concrete refactor/standardization opportunities with impact/risk notes.\n- Proposed contract is specific enough to guide implementation of future harnesses.\n","status":"open","priority":1,"issue_type":"task","owner":"timg.goodwin@gmail.com","created_at":"2026-02-25T18:28:53.019985-08:00","created_by":"Tim Goodwin","updated_at":"2026-02-25T18:28:53.019985-08:00","labels":["architecture","audit","harness"]} {"id":"kamera-znd","title":"Create causal chain tracer for divergence analysis","description":"**Superseded by kamera-7tl (Backward-trace divergence analysis framework)**\n\nThe original vision for this task has been refined into a composable module architecture. The 'causal chain tracing' is now achieved by composing:\n\n1. Module 0 (kamera-pwm): Identify what differs in final states\n2. Module 1 (kamera-eja): Find last writes and input diffs\n3. 
Module 2 (kamera-gwc): Check if 'missing' state appears later\n\nSee kamera-7tl for the full design.\n\n## Original Description (preserved)\nBuild a tool that traces how an initial divergence cascades to final state differences:\n- Given a divergence point, track how the difference propagates\n- Show which reconcilers are affected by the divergence\n- Identify when/if paths reconverge on specific objects\n- Produce a summary showing: initial cause -\u003e intermediate effects -\u003e final outcome","status":"open","priority":2,"issue_type":"feature","owner":"timg.goodwin@gmail.com","created_at":"2026-01-14T12:27:34.573893-08:00","created_by":"Tim Goodwin","updated_at":"2026-01-14T14:07:23.346227-08:00"} diff --git a/pkg/tracecheck/explore.go b/pkg/tracecheck/explore.go index 0e26675..6cdb21d 100644 --- a/pkg/tracecheck/explore.go +++ b/pkg/tracecheck/explore.go @@ -309,6 +309,15 @@ func (e *Explorer) enqueueWithMarker( // Already being explored? (diamond convergence) if tracker.isInProgress(logicalKey) { + // If the logical key appears in the ancestry of this state, this is a cycle + // back-edge (not a true diamond). Keep exploring so the branch can still + // terminate (e.g., max-depth abort) instead of being dropped silently. 
+ if hasLogicalKeyInAncestry(states, logicalKey) { + for i := range states { + stack = append(stack, stackEntry{state: &states[i]}) + } + return stack, true + } e.stats.SubtreeDiamondSkips++ return stack, false } @@ -325,6 +334,17 @@ func (e *Explorer) enqueueWithMarker( return stack, true } +func hasLogicalKeyInAncestry(states []StateNode, key LogicalStateKey) bool { + for i := range states { + for parent := states[i].parent; parent != nil; parent = parent.parent { + if parent.LogicalKey() == key { + return true + } + } + } + return false +} + func (e *Explorer) subtreeCompletionEnabled() bool { if e.Config == nil { return true diff --git a/pkg/tracecheck/explore_stack_test.go b/pkg/tracecheck/explore_stack_test.go index 3b4f039..cc76798 100644 --- a/pkg/tracecheck/explore_stack_test.go +++ b/pkg/tracecheck/explore_stack_test.go @@ -55,6 +55,35 @@ func TestEnqueueStatesSkipsCompletedLogicalState(t *testing.T) { assert.Equal(t, 1, explorer.stats.SubtreeCompletionSkips) } +func TestEnqueueStatesSkipsInProgressDiamondState(t *testing.T) { + explorer := &Explorer{stats: NewExploreStats()} + tracker := newSubtreeTracker() + + state := testState(testPending("controller-a", "default", "obj-a")) + tracker.markInProgress(state.LogicalKey()) + + stack, enqueued := explorer.enqueueStates(nil, tracker, []StateNode{state}, true) + assert.False(t, enqueued) + assert.Empty(t, stack) + assert.Equal(t, 1, explorer.stats.SubtreeDiamondSkips) +} + +func TestEnqueueStatesAllowsInProgressCycleState(t *testing.T) { + explorer := &Explorer{stats: NewExploreStats()} + tracker := newSubtreeTracker() + + ancestor := testState(testPending("controller-a", "default", "obj-a")) + cycle := ancestor.Clone() + cycle.parent = &ancestor + tracker.markInProgress(ancestor.LogicalKey()) + + stack, enqueued := explorer.enqueueStates(nil, tracker, []StateNode{cycle}, true) + assert.True(t, enqueued) + assert.Len(t, stack, 1) + assert.False(t, stack[0].isMarker()) + assert.Equal(t, 0, 
explorer.stats.SubtreeDiamondSkips) +} + func TestEnqueueStaleViewsWithSubtreeCompletion(t *testing.T) { explorer := &Explorer{stats: NewExploreStats()} tracker := newStaleViewTracker() From dd5f531bf62f1d4238f9c6912a5652a7ba817dd0 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Thu, 26 Feb 2026 07:03:58 -0800 Subject: [PATCH 28/29] fix finalizer controller to go through with actual deletion --- pkg/tracecheck/explorebuilder.go | 13 +-- pkg/tracecheck/trigger.go | 22 ++++- pkg/tracecheck/trigger_test.go | 93 +++++++++++++++++++ .../controller/sleeve_finalizer_controller.go | 57 ++++++++---- .../sleeve_finalizer_controller_unit_test.go | 57 ++++++++++++ 5 files changed, 212 insertions(+), 30 deletions(-) create mode 100644 sleevectrl/pkg/controller/sleeve_finalizer_controller_unit_test.go diff --git a/pkg/tracecheck/explorebuilder.go b/pkg/tracecheck/explorebuilder.go index 48aa8bf..097e451 100644 --- a/pkg/tracecheck/explorebuilder.go +++ b/pkg/tracecheck/explorebuilder.go @@ -10,7 +10,6 @@ import ( "github.com/tgoodwin/kamera/pkg/event" "github.com/tgoodwin/kamera/pkg/replay" "github.com/tgoodwin/kamera/pkg/snapshot" - "github.com/tgoodwin/kamera/pkg/tracegen" "github.com/tgoodwin/kamera/pkg/util" "github.com/tgoodwin/kamera/sleevectrl/pkg/controller" appsv1 "k8s.io/api/apps/v1" @@ -631,18 +630,8 @@ func (b *ExplorerBuilder) instantiateCleanupReconciler(mgr *manager) *Reconciler fm, mgr, ) - wrappedClient := tracegen.New( - replayClient, - string(cleanupReconcilerID), - b.emitter, - tracegen.NewContextTracker( - string(cleanupReconcilerID), - b.emitter, - replay.FrameIDFromContext, - ), - ) r := &controller.FinalizerReconciler{ - Client: wrappedClient, + Client: replayClient, Recorder: mgr, } container := &ReconcilerContainer{ diff --git a/pkg/tracecheck/trigger.go b/pkg/tracecheck/trigger.go index 210cf09..0d4f03c 100644 --- a/pkg/tracecheck/trigger.go +++ b/pkg/tracecheck/trigger.go @@ -146,6 +146,21 @@ type TriggerManager struct { resolver Resolver } +// if an 
object has already been marked for deletion and undergoes a further update, +// the cleanupReconciler needs to hear about that update. +func shouldQueueCleanupReconcile(op event.OperationType, obj *unstructured.Unstructured) bool { + deletionTS := obj.GetDeletionTimestamp() + if deletionTS == nil || deletionTS.IsZero() { + return false + } + switch op { + case event.MARK_FOR_DELETION, event.UPDATE, event.PATCH, event.APPLY: + return true + default: + return false + } +} + func canonicalKindKeyFromGroupKind(gk schema.GroupKind) string { return util.CanonicalGroupKind(gk.Group, gk.Kind) } @@ -227,9 +242,14 @@ func (tm *TriggerManager) getTriggered(changes Changes) ([]PendingReconcile, err if effect.OpType == event.MARK_FOR_DELETION { deletionTS := objectVal.GetDeletionTimestamp() - if deletionTS.IsZero() { + if deletionTS == nil || deletionTS.IsZero() { panic("found object marked for deletion but with no deletion timestamp") } + } + + // Deletion-scope mutations need to wake the CleanupReconciler so it can + // react when finalizer state changes allow actual removal. 
+ if shouldQueueCleanupReconcile(effect.OpType, objectVal) { // queue up the CleanupReconciler to handle the actual removal reconcileKey := fmt.Sprintf("%s:%s:%s", cleanupReconcilerID, nsName.Namespace, nsName.Name) uniqueReconciles[reconcileKey] = PendingReconcile{ diff --git a/pkg/tracecheck/trigger_test.go b/pkg/tracecheck/trigger_test.go index 88f5f9c..78b1cf6 100644 --- a/pkg/tracecheck/trigger_test.go +++ b/pkg/tracecheck/trigger_test.go @@ -3,6 +3,7 @@ package tracecheck import ( "sort" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/tgoodwin/kamera/pkg/event" @@ -49,6 +50,21 @@ func createTestObject(kind, namespace, name string, ownerRefs []metav1.OwnerRefe return obj } +func hasPendingReconcile( + reconciles []PendingReconcile, + reconcilerID ReconcilerID, + namespace, name string, +) bool { + for _, pr := range reconciles { + if pr.ReconcilerID == reconcilerID && + pr.Request.Namespace == namespace && + pr.Request.Name == name { + return true + } + } + return false +} + // Helper to sort PendingReconciles for stable comparison func sortPendingReconciles(reconciles []PendingReconcile) []PendingReconcile { sort.Slice(reconciles, func(i, j int) bool { @@ -635,3 +651,80 @@ func TestGetTriggeredWithHashResolutionFailure(t *testing.T) { _, err := tm.getTriggered(changes) assert.NotNil(t, err) } + +func TestGetTriggeredDeleteScopeMutationsQueueCleanupReconciler(t *testing.T) { + ops := []event.OperationType{event.UPDATE, event.PATCH, event.APPLY} + for _, op := range ops { + t.Run(string(op), func(t *testing.T) { + podKind := canonical("", "Pod") + owners := PrimariesByKind{ + podKind: util.NewSet[ReconcilerID]("podController"), + } + + obj := createTestObject("Pod", "default", "pod-1", nil) + deletionTime := metav1.NewTime(time.Date(2026, time.January, 1, 0, 0, 0, 0, time.UTC)) + obj.SetDeletionTimestamp(&deletionTime) + obj.SetFinalizers([]string{"example.com/finalizer"}) + + hash := snapshot.NewDefaultHash("pod-delete-scope-hash-" + 
string(op)) + resolver := &mockHashResolver{ + objects: map[snapshot.VersionHash]*unstructured.Unstructured{ + hash: obj, + }, + } + + tm := &TriggerManager{ + owners: owners, + watchers: make(WatchRegistrations), + resolver: resolver, + } + + triggered, err := tm.getTriggered(Changes{ + Effects: []Effect{ + { + OpType: op, + Key: compositeKey("Pod", "default", "pod-1", "pod-1"), + Version: hash, + }, + }, + }) + assert.NoError(t, err) + assert.True(t, hasPendingReconcile(triggered, cleanupReconcilerID, "default", "pod-1")) + assert.True(t, hasPendingReconcile(triggered, "podController", "default", "pod-1")) + }) + } +} + +func TestGetTriggeredUpdateWithoutDeletionTimestampDoesNotQueueCleanup(t *testing.T) { + podKind := canonical("", "Pod") + owners := PrimariesByKind{ + podKind: util.NewSet[ReconcilerID]("podController"), + } + + obj := createTestObject("Pod", "default", "pod-1", nil) + hash := snapshot.NewDefaultHash("pod-update-no-delete-ts") + resolver := &mockHashResolver{ + objects: map[snapshot.VersionHash]*unstructured.Unstructured{ + hash: obj, + }, + } + + tm := &TriggerManager{ + owners: owners, + watchers: make(WatchRegistrations), + resolver: resolver, + } + + triggered, err := tm.getTriggered(Changes{ + Effects: []Effect{ + { + OpType: event.UPDATE, + Key: compositeKey("Pod", "default", "pod-1", "pod-1"), + Version: hash, + }, + }, + }) + assert.NoError(t, err) + assert.False(t, hasPendingReconcile(triggered, cleanupReconcilerID, "default", "pod-1")) + assert.True(t, hasPendingReconcile(triggered, "podController", "default", "pod-1")) +} diff --git a/sleevectrl/pkg/controller/sleeve_finalizer_controller.go b/sleevectrl/pkg/controller/sleeve_finalizer_controller.go index 89dd83e..1be8c3a 100644 --- a/sleevectrl/pkg/controller/sleeve_finalizer_controller.go +++ b/sleevectrl/pkg/controller/sleeve_finalizer_controller.go @@ -4,13 +4,11 @@ import ( "context" "errors" "fmt" - "strings" "github.com/samber/lo" - "github.com/tgoodwin/kamera/pkg/event" 
"github.com/tgoodwin/kamera/pkg/replay" "github.com/tgoodwin/kamera/pkg/tag" - "github.com/tgoodwin/kamera/pkg/tracegen" + "github.com/tgoodwin/kamera/pkg/util" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" @@ -18,10 +16,32 @@ import ( ) type FinalizerReconciler struct { - *tracegen.Client + Client client.Client Recorder replay.EffectRecorder } +func cleanupEligibleForRemoval(obj *unstructured.Unstructured) bool { + deletionTS := obj.GetDeletionTimestamp() + if deletionTS == nil || deletionTS.IsZero() { + return false + } + return len(obj.GetFinalizers()) == 0 +} + +func setObjectTypeMetaFromCanonicalKind(obj *unstructured.Unstructured, canonicalKind string) error { + gk := util.ParseGroupKind(canonicalKind) + if gk.Kind == "" { + return fmt.Errorf("canonical kind %q has empty Kind", canonicalKind) + } + obj.SetKind(gk.Kind) + if gk.Group == "" { + obj.SetAPIVersion("v1") + return nil + } + obj.SetAPIVersion(gk.Group + "/v1") + return nil +} + // Reconcile handler for tracked objects func (r *FinalizerReconciler) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { logger := log.FromContext(ctx) @@ -31,14 +51,11 @@ func (r *FinalizerReconciler) Reconcile(ctx context.Context, req reconcile.Reque if !ok { return reconcile.Result{}, errors.New("no kind in context") } - // The context stores canonical kind like "core/Pod" or "apps/Deployment". - // We need to extract just the Kind part (after the /) for SetKind. 
- kind := canonicalKind - if idx := strings.LastIndex(canonicalKind, "/"); idx >= 0 { - kind = canonicalKind[idx+1:] + if err := setObjectTypeMetaFromCanonicalKind(obj, canonicalKind); err != nil { + return reconcile.Result{}, err } - obj.SetKind(kind) - if err := r.Get(ctx, req.NamespacedName, obj); err != nil { + kind := obj.GetKind() + if err := r.Client.Get(ctx, req.NamespacedName, obj); err != nil { // If the object is not found, it's already been deleted - nothing to clean up if client.IgnoreNotFound(err) == nil { logger.V(1).Info("object already deleted, nothing to clean up") @@ -59,14 +76,22 @@ func (r *FinalizerReconciler) Reconcile(ctx context.Context, req reconcile.Reque // in simulation/replay, need to remove the object from state if r.Recorder != nil { + if !cleanupEligibleForRemoval(obj) { + logger.V(1).Info("object still blocked by finalizers, skipping REMOVE") + return reconcile.Result{}, nil + } // simulation mode logger.V(1).Info("recording REMOVE effect") - if err := r.Recorder.RecordEffect(ctx, obj, event.REMOVE, nil); err != nil { + remover, ok := r.Client.(interface { + Remove(context.Context, client.Object) error + }) + if !ok { + return reconcile.Result{}, fmt.Errorf("cleanup client does not support Remove") + } + if err := remover.Remove(ctx, obj); err != nil { logger.Error(err, "recording effect") return reconcile.Result{}, fmt.Errorf("recording effect: %w", err) } - // emit the event - r.LogOperation(ctx, obj, event.REMOVE) return reconcile.Result{}, nil } @@ -76,12 +101,10 @@ func (r *FinalizerReconciler) Reconcile(ctx context.Context, req reconcile.Reque // Remove our finalizer to allow actual deletion by the APIServer obj.SetFinalizers(lo.Without(obj.GetFinalizers(), tag.SleeveFinalizer)) // Update object to remove finalizer and trigger removal - if err := r.Update(ctx, obj); err != nil { + if err := r.Client.Update(ctx, obj); err != nil { logger.Error(err, "failed to update object") return reconcile.Result{}, fmt.Errorf("failed to 
update object: %w", err) } - logger.V(2).Info("Emitting REMOVE event") - r.Client.LogOperation(ctx, obj, event.REMOVE) logger.V(1).Info("Finalizer removed") } } diff --git a/sleevectrl/pkg/controller/sleeve_finalizer_controller_unit_test.go b/sleevectrl/pkg/controller/sleeve_finalizer_controller_unit_test.go new file mode 100644 index 0000000..19b1c54 --- /dev/null +++ b/sleevectrl/pkg/controller/sleeve_finalizer_controller_unit_test.go @@ -0,0 +1,57 @@ +package controller + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" +) + +func makeDeletionCandidate(finalizers []string, withDeletionTimestamp bool) *unstructured.Unstructured { + obj := &unstructured.Unstructured{} + obj.SetAPIVersion("v1") + obj.SetKind("Pod") + obj.SetNamespace("default") + obj.SetName("pod-1") + obj.SetFinalizers(finalizers) + if withDeletionTimestamp { + ts := metav1.NewTime(time.Date(2026, time.January, 1, 0, 0, 0, 0, time.UTC)) + obj.SetDeletionTimestamp(&ts) + } + return obj +} + +func TestCleanupEligibleForRemovalRequiresDeletionTimestamp(t *testing.T) { + obj := makeDeletionCandidate(nil, false) + assert.False(t, cleanupEligibleForRemoval(obj)) +} + +func TestCleanupEligibleForRemovalRequiresNoFinalizers(t *testing.T) { + obj := makeDeletionCandidate([]string{"example.com/finalizer"}, true) + assert.False(t, cleanupEligibleForRemoval(obj)) +} + +func TestCleanupEligibleForRemovalTrueWhenDeleteTimestampAndNoFinalizers(t *testing.T) { + obj := makeDeletionCandidate(nil, true) + assert.True(t, cleanupEligibleForRemoval(obj)) +} + +func TestSetObjectTypeMetaFromCanonicalKind_CustomGroup(t *testing.T) { + obj := &unstructured.Unstructured{} + + err := setObjectTypeMetaFromCanonicalKind(obj, "example.promise.syntasso.io/EasyApp") + assert.NoError(t, err) + assert.Equal(t, "EasyApp", obj.GetKind()) + assert.Equal(t, "example.promise.syntasso.io/v1", obj.GetAPIVersion()) 
+} + +func TestSetObjectTypeMetaFromCanonicalKind_CoreGroup(t *testing.T) { + obj := &unstructured.Unstructured{} + + err := setObjectTypeMetaFromCanonicalKind(obj, "core/Pod") + assert.NoError(t, err) + assert.Equal(t, "Pod", obj.GetKind()) + assert.Equal(t, "v1", obj.GetAPIVersion()) +} From 084b503824086d3cc4a399f69f8a1279011d6395 Mon Sep 17 00:00:00 2001 From: Tim Goodwin Date: Thu, 26 Feb 2026 07:15:50 -0800 Subject: [PATCH 29/29] create dedicated CleanupStrategy implementation to reduce coupling --- pkg/tracecheck/explorebuilder.go | 7 +- pkg/tracecheck/reconciler.go | 50 +++++++---- .../reconciler_cleanup_strategy_test.go | 87 +++++++++++++++++++ 3 files changed, 125 insertions(+), 19 deletions(-) create mode 100644 pkg/tracecheck/reconciler_cleanup_strategy_test.go diff --git a/pkg/tracecheck/explorebuilder.go b/pkg/tracecheck/explorebuilder.go index 097e451..3e08ce3 100644 --- a/pkg/tracecheck/explorebuilder.go +++ b/pkg/tracecheck/explorebuilder.go @@ -635,8 +635,11 @@ func (b *ExplorerBuilder) instantiateCleanupReconciler(mgr *manager) *Reconciler Recorder: mgr, } container := &ReconcilerContainer{ - Name: cleanupReconcilerID, - Strategy: &ControllerRuntimeStrategy{Reconciler: r, frameInserter: fm, name: cleanupReconcilerID, effectReader: mgr}, + Name: cleanupReconcilerID, + Strategy: newCleanupRuntimeStrategy( + &ControllerRuntimeStrategy{Reconciler: r, frameInserter: fm, name: cleanupReconcilerID, effectReader: mgr}, + fm, + ), effectReader: mgr, versionManager: mgr, } diff --git a/pkg/tracecheck/reconciler.go b/pkg/tracecheck/reconciler.go index 18ebdf2..3716351 100644 --- a/pkg/tracecheck/reconciler.go +++ b/pkg/tracecheck/reconciler.go @@ -30,6 +30,10 @@ type frameInserter interface { InsertCacheFrame(id string, data replay.CacheFrame) } +type frameReader interface { + GetCacheFrame(id string) (replay.CacheFrame, error) +} + type Strategy interface { PrepareState(ctx context.Context, state []runtime.Object) (context.Context, func(), error) 
ReconcileAtState(ctx context.Context, name types.NamespacedName) (reconcile.Result, error) @@ -61,27 +65,39 @@ func (s *ControllerRuntimeStrategy) PrepareState(ctx context.Context, state []ru } func (s *ControllerRuntimeStrategy) ReconcileAtState(ctx context.Context, name types.NamespacedName) (reconcile.Result, error) { - // our cleanup reconciler implementation needs to know what kind of object it is reconciling - // as reconcile.Request is only namespace/name. so we inject it through the context. - // TODO factor this cleanup-specific stuff out into a dedicated strategy - if s.name == cleanupReconcilerID { - frameID := replay.FrameIDFromContext(ctx) - frameData, err := s.frameInserter.(*replay.FrameManager).GetCacheFrame(frameID) - if err != nil { - return reconcile.Result{}, err - } - for kind, objs := range frameData { - for nn := range objs { - if nn.Name == name.Name && nn.Namespace == name.Namespace { - ctx = context.WithValue(ctx, tag.CleanupKindKey{}, kind) - } - } - } - } req := reconcile.Request{NamespacedName: name} return s.Reconciler.Reconcile(ctx, req) } +type cleanupRuntimeStrategy struct { + *ControllerRuntimeStrategy + frameReader frameReader +} + +func newCleanupRuntimeStrategy(base *ControllerRuntimeStrategy, frameReader frameReader) *cleanupRuntimeStrategy { + return &cleanupRuntimeStrategy{ + ControllerRuntimeStrategy: base, + frameReader: frameReader, + } +} + +func (s *cleanupRuntimeStrategy) ReconcileAtState(ctx context.Context, name types.NamespacedName) (reconcile.Result, error) { + frameID := replay.FrameIDFromContext(ctx) + frameData, err := s.frameReader.GetCacheFrame(frameID) + if err != nil { + return reconcile.Result{}, err + } + + for kind, objs := range frameData { + if _, ok := objs[name]; ok { + ctx = context.WithValue(ctx, tag.CleanupKindKey{}, kind) + break + } + } + + return s.ControllerRuntimeStrategy.ReconcileAtState(ctx, name) +} + // runtimeObjectsToCacheFrame converts a slice of runtime objects into a replay cache 
frame. func runtimeObjectsToCacheFrame(objects []runtime.Object, scheme *runtime.Scheme) replay.CacheFrame { out := make(replay.CacheFrame) diff --git a/pkg/tracecheck/reconciler_cleanup_strategy_test.go b/pkg/tracecheck/reconciler_cleanup_strategy_test.go new file mode 100644 index 0000000..205ad34 --- /dev/null +++ b/pkg/tracecheck/reconciler_cleanup_strategy_test.go @@ -0,0 +1,87 @@ +package tracecheck + +import ( + "context" + "testing" + + "github.com/tgoodwin/kamera/pkg/replay" + "github.com/tgoodwin/kamera/pkg/tag" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +type cleanupKindCaptureReconciler struct { + gotKind any +} + +func (r *cleanupKindCaptureReconciler) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) { + r.gotKind = ctx.Value(tag.CleanupKindKey{}) + return reconcile.Result{}, nil +} + +type noopEffects struct{} + +func (noopEffects) GetEffects(_ context.Context) (Changes, error) { return Changes{}, nil } + +func TestControllerRuntimeStrategy_DoesNotInjectCleanupKind(t *testing.T) { + fm := replay.NewFrameManager(nil) + strategy := &ControllerRuntimeStrategy{ + Reconciler: &cleanupKindCaptureReconciler{}, + frameInserter: fm, + effectReader: noopEffects{}, + name: cleanupReconcilerID, + scheme: runtime.NewScheme(), + } + + frameID := "frame-1" + reqName := types.NamespacedName{Namespace: "default", Name: "obj-1"} + fm.InsertCacheFrame(frameID, replay.CacheFrame{ + "apps/Deployment": { + reqName: &unstructured.Unstructured{}, + }, + }) + + ctx := replay.WithFrameID(context.Background(), frameID) + _, err := strategy.ReconcileAtState(ctx, reqName) + if err != nil { + t.Fatalf("reconcile: %v", err) + } + + r := strategy.Reconciler.(*cleanupKindCaptureReconciler) + if r.gotKind != nil { + t.Fatalf("expected no cleanup kind to be injected by base strategy, got %v", r.gotKind) + } +} + +func 
TestCleanupRuntimeStrategy_InjectsCleanupKind(t *testing.T) { + fm := replay.NewFrameManager(nil) + base := &ControllerRuntimeStrategy{ + Reconciler: &cleanupKindCaptureReconciler{}, + frameInserter: fm, + effectReader: noopEffects{}, + name: cleanupReconcilerID, + scheme: runtime.NewScheme(), + } + strategy := newCleanupRuntimeStrategy(base, fm) + + frameID := "frame-2" + reqName := types.NamespacedName{Namespace: "default", Name: "obj-2"} + fm.InsertCacheFrame(frameID, replay.CacheFrame{ + "apps/Deployment": { + reqName: &unstructured.Unstructured{}, + }, + }) + + ctx := replay.WithFrameID(context.Background(), frameID) + _, err := strategy.ReconcileAtState(ctx, reqName) + if err != nil { + t.Fatalf("reconcile: %v", err) + } + + r := base.Reconciler.(*cleanupKindCaptureReconciler) + if r.gotKind != "apps/Deployment" { + t.Fatalf("expected cleanup kind apps/Deployment, got %v", r.gotKind) + } +}