From 25d06dd13798737ecdfdf4864746a70c361703cf Mon Sep 17 00:00:00 2001 From: Moritz Johner Date: Mon, 5 Jan 2026 21:10:14 +0100 Subject: [PATCH 1/3] feat: precompute + cache metadata api response Signed-off-by: Moritz Johner --- internal/api/handlers/metadata_handler.go | 27 +++- internal/api/handlers/register.go | 3 +- internal/api/metadata_cache.go | 169 ++++++++++++++++++++++ internal/apiserver/routes.go | 1 + internal/apiserver/server.go | 35 ++++- 5 files changed, 232 insertions(+), 3 deletions(-) create mode 100644 internal/api/metadata_cache.go diff --git a/internal/api/handlers/metadata_handler.go b/internal/api/handlers/metadata_handler.go index 6cbb500..d5f10cc 100644 --- a/internal/api/handlers/metadata_handler.go +++ b/internal/api/handlers/metadata_handler.go @@ -15,14 +15,17 @@ import ( // MetadataHandler handles /v1/metadata requests type MetadataHandler struct { queryExecutor api.QueryExecutor + metadataCache *api.MetadataCache logger *logging.Logger tracer trace.Tracer } // NewMetadataHandler creates a new metadata handler -func NewMetadataHandler(queryExecutor api.QueryExecutor, logger *logging.Logger, tracer trace.Tracer) *MetadataHandler { +// metadataCache is optional - if nil, queries will go directly to the executor +func NewMetadataHandler(queryExecutor api.QueryExecutor, metadataCache *api.MetadataCache, logger *logging.Logger, tracer trace.Tracer) *MetadataHandler { return &MetadataHandler{ queryExecutor: queryExecutor, + metadataCache: metadataCache, logger: logger, tracer: tracer, } @@ -55,6 +58,27 @@ func (mh *MetadataHandler) Handle(w http.ResponseWriter, r *http.Request) { startTimeNs := startTime * 1e9 endTimeNs := endTime * 1e9 + // If cache is available and no time filtering requested, use cache + // Cache contains all metadata regardless of time range + // Note: startTime=0 and endTime=current time means "all data" + useCache := mh.metadataCache != nil && startTime == 0 + + if useCache { + mh.logger.Debug("Using metadata cache for 
request") + cachedData, err := mh.metadataCache.Get() + if err == nil { + // Successfully got cached data - return it immediately + w.Header().Set("Content-Type", "application/json") + w.Header().Set("X-Cache", "HIT") + w.WriteHeader(http.StatusOK) + _ = api.WriteJSON(w, cachedData) + return + } + + // Cache failed - log and fall through to direct query + mh.logger.Warn("Metadata cache unavailable, falling back to direct query: %v", err) + } + // Try to use efficient metadata query if available var namespacesList, kindsList []string var minTime, maxTime int64 @@ -132,6 +156,7 @@ func (mh *MetadataHandler) Handle(w http.ResponseWriter, r *http.Request) { } w.Header().Set("Content-Type", "application/json") + w.Header().Set("X-Cache", "MISS") w.WriteHeader(http.StatusOK) _ = api.WriteJSON(w, response) } diff --git a/internal/api/handlers/register.go b/internal/api/handlers/register.go index 46c9b0c..102b668 100644 --- a/internal/api/handlers/register.go +++ b/internal/api/handlers/register.go @@ -18,6 +18,7 @@ func RegisterHandlers( querySource api.TimelineQuerySource, graphClient graph.Client, graphPipeline sync.Pipeline, + metadataCache *api.MetadataCache, logger *logging.Logger, tracer trace.Tracer, withMethod func(string, http.HandlerFunc) http.HandlerFunc, @@ -56,7 +57,7 @@ func RegisterHandlers( logger.Info("Metadata handler using STORAGE query executor") metadataExecutor = storageExecutor } - metadataHandler := NewMetadataHandler(metadataExecutor, logger, tracer) + metadataHandler := NewMetadataHandler(metadataExecutor, metadataCache, logger, tracer) router.HandleFunc("/v1/search", withMethod(http.MethodGet, searchHandler.Handle)) router.HandleFunc("/v1/timeline", withMethod(http.MethodGet, timelineHandler.Handle)) diff --git a/internal/api/metadata_cache.go b/internal/api/metadata_cache.go new file mode 100644 index 0000000..fac843a --- /dev/null +++ b/internal/api/metadata_cache.go @@ -0,0 +1,169 @@ +package api + +import ( + "context" + "sync" + "time" + + 
"github.com/moolen/spectre/internal/logging" + "github.com/moolen/spectre/internal/models" +) + +// MetadataCache provides fast in-memory access to cluster metadata +// It periodically refreshes the data from the query executor in the background +type MetadataCache struct { + executor QueryExecutor + logger *logging.Logger + refreshPeriod time.Duration + + mu sync.RWMutex + metadata *models.MetadataResponse + lastErr error + + stopCh chan struct{} + wg sync.WaitGroup +} + +// NewMetadataCache creates a new metadata cache +// refreshPeriod: how often to refresh the cache (e.g., 30 seconds) +func NewMetadataCache(executor QueryExecutor, logger *logging.Logger, refreshPeriod time.Duration) *MetadataCache { + return &MetadataCache{ + executor: executor, + logger: logger, + refreshPeriod: refreshPeriod, + stopCh: make(chan struct{}), + } +} + +// Start initializes the cache and starts the background refresh loop +// It performs an initial synchronous load before returning +func (mc *MetadataCache) Start(ctx context.Context) error { + mc.logger.Info("Starting metadata cache with refresh period: %v", mc.refreshPeriod) + + // Perform initial load synchronously + if err := mc.refresh(ctx); err != nil { + mc.logger.Error("Failed initial metadata cache load: %v", err) + return err + } + + mc.logger.Info("Metadata cache initialized successfully") + + // Start background refresh loop + mc.wg.Add(1) + go mc.refreshLoop() + + return nil +} + +// Stop gracefully stops the background refresh loop +func (mc *MetadataCache) Stop() { + mc.logger.Info("Stopping metadata cache") + close(mc.stopCh) + mc.wg.Wait() + mc.logger.Info("Metadata cache stopped") +} + +// Get returns the cached metadata +// If the cache is empty or has an error, it returns the error +func (mc *MetadataCache) Get() (*models.MetadataResponse, error) { + mc.mu.RLock() + defer mc.mu.RUnlock() + + if mc.lastErr != nil { + return nil, mc.lastErr + } + + if mc.metadata == nil { + return nil, ErrCacheNotReady + } + + // 
Return a copy to prevent mutation + result := &models.MetadataResponse{ + Namespaces: append([]string{}, mc.metadata.Namespaces...), + Kinds: append([]string{}, mc.metadata.Kinds...), + TimeRange: mc.metadata.TimeRange, + } + + return result, nil +} + +// refresh queries the executor and updates the cache +func (mc *MetadataCache) refresh(ctx context.Context) error { + start := time.Now() + + // Query all metadata (no time filtering - get everything) + // Use 0 for start and a far future time for end to get all data + startTimeNs := int64(0) + endTimeNs := time.Now().Add(24 * time.Hour).UnixNano() + + // Check if executor supports efficient metadata query + metadataExecutor, ok := mc.executor.(interface { + QueryDistinctMetadata(ctx context.Context, startTimeNs, endTimeNs int64) (namespaces []string, kinds []string, minTime int64, maxTime int64, err error) + }) + + if !ok { + mc.mu.Lock() + mc.lastErr = ErrMetadataQueryNotSupported + mc.mu.Unlock() + return ErrMetadataQueryNotSupported + } + + namespaces, kinds, minTime, maxTime, err := metadataExecutor.QueryDistinctMetadata(ctx, startTimeNs, endTimeNs) + if err != nil { + mc.logger.Error("Failed to refresh metadata cache: %v", err) + mc.mu.Lock() + mc.lastErr = err + mc.mu.Unlock() + return err + } + + elapsed := time.Since(start) + + // Update cache + mc.mu.Lock() + mc.metadata = &models.MetadataResponse{ + Namespaces: namespaces, + Kinds: kinds, + TimeRange: models.TimeRangeInfo{ + Earliest: minTime / 1e9, + Latest: maxTime / 1e9, + }, + } + mc.lastErr = nil + mc.mu.Unlock() + + mc.logger.DebugWithFields("Metadata cache refreshed", + logging.Field("namespaces", len(namespaces)), + logging.Field("kinds", len(kinds)), + logging.Field("duration_ms", elapsed.Milliseconds())) + + return nil +} + +// refreshLoop runs in the background and periodically refreshes the cache +func (mc *MetadataCache) refreshLoop() { + defer mc.wg.Done() + + ticker := time.NewTicker(mc.refreshPeriod) + defer ticker.Stop() + + for { + 
select { + case <-ticker.C: + ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + if err := mc.refresh(ctx); err != nil { + mc.logger.Error("Background metadata cache refresh failed: %v", err) + } + cancel() + + case <-mc.stopCh: + return + } + } +} + +// Errors +var ( + ErrCacheNotReady = &APIError{Code: "CACHE_NOT_READY", Message: "Metadata cache is not ready"} + ErrMetadataQueryNotSupported = &APIError{Code: "NOT_SUPPORTED", Message: "Executor does not support metadata queries"} +) diff --git a/internal/apiserver/routes.go b/internal/apiserver/routes.go index d5675d3..8140fb1 100644 --- a/internal/apiserver/routes.go +++ b/internal/apiserver/routes.go @@ -57,6 +57,7 @@ func (s *Server) registerHTTPHandlers() { s.querySource, s.graphClient, s.graphPipeline, + s.metadataCache, s.logger, tracer, s.withMethod, diff --git a/internal/apiserver/server.go b/internal/apiserver/server.go index 7074215..106e706 100644 --- a/internal/apiserver/server.go +++ b/internal/apiserver/server.go @@ -36,7 +36,8 @@ type Server struct { graphExecutor api.QueryExecutor // Graph-based query executor querySource api.TimelineQuerySource // Which executor to use for timeline queries graphClient graph.Client - graphPipeline sync.Pipeline // Graph sync pipeline for imports + graphPipeline sync.Pipeline // Graph sync pipeline for imports + metadataCache *api.MetadataCache // In-memory metadata cache for fast responses router *http.ServeMux readinessChecker ReadinessChecker tracingProvider interface { @@ -92,6 +93,21 @@ func NewWithStorageGraphAndPipeline( tracingProvider: tracingProvider, } + // Create metadata cache if we have a query executor + // Use graph executor if available (more efficient), otherwise storage executor + var metadataExecutor api.QueryExecutor + if graphExecutor != nil { + metadataExecutor = graphExecutor + } else { + metadataExecutor = storageExecutor + } + + if metadataExecutor != nil { + // Create cache with 30-second refresh period + 
s.metadataCache = api.NewMetadataCache(metadataExecutor, s.logger, 30*time.Second) + s.logger.Info("Metadata cache created (will initialize on server start)") + } + // Register all routes and handlers s.registerHandlers() @@ -129,6 +145,18 @@ func (s *Server) Start(ctx context.Context) error { default: } + // Start metadata cache if available + if s.metadataCache != nil { + s.logger.Info("Initializing metadata cache...") + if err := s.metadataCache.Start(ctx); err != nil { + s.logger.Error("Failed to start metadata cache: %v", err) + // Don't fail server startup - cache is optional optimization + // Handlers will fall back to direct queries + } else { + s.logger.Info("Metadata cache started successfully") + } + } + // Start HTTP server in a goroutine go func() { if err := s.server.ListenAndServe(); err != nil && err != http.ErrServerClosed { @@ -145,6 +173,11 @@ func (s *Server) Start(ctx context.Context) error { func (s *Server) Stop(ctx context.Context) error { s.logger.Info("Stopping API server...") + // Stop metadata cache if running + if s.metadataCache != nil { + s.metadataCache.Stop() + } + // Stop HTTP server done := make(chan error, 1) go func() { From 394bca50783609f6a90d9da85b07c0a3435d48a0 Mon Sep 17 00:00:00 2001 From: Moritz Johner Date: Mon, 5 Jan 2026 23:53:59 +0100 Subject: [PATCH 2/3] feat: add root-cause event significance + json diff Signed-off-by: Moritz Johner --- internal/analysis/analyzer.go | 55 ++ internal/analysis/event_significance.go | 272 ++++++++ internal/analysis/event_significance_test.go | 660 +++++++++++++++++++ internal/analysis/json_diff.go | 255 +++++++ internal/analysis/json_diff_test.go | 566 ++++++++++++++++ internal/analysis/types.go | 29 +- internal/api/handlers/root_cause_handler.go | 15 + ui/src/components/RootCauseView.tsx | 238 ++++++- ui/src/services/rootCauseService.ts | 13 +- ui/src/types/rootCause.ts | 32 +- 10 files changed, 2111 insertions(+), 24 deletions(-) create mode 100644 
internal/analysis/event_significance.go create mode 100644 internal/analysis/event_significance_test.go create mode 100644 internal/analysis/json_diff.go create mode 100644 internal/analysis/json_diff_test.go diff --git a/internal/analysis/analyzer.go b/internal/analysis/analyzer.go index a943cf3..3c6a21b 100644 --- a/internal/analysis/analyzer.go +++ b/internal/analysis/analyzer.go @@ -23,6 +23,16 @@ func NewRootCauseAnalyzer(graphClient graph.Client) *RootCauseAnalyzer { } } +// ResponseFormat specifies the format for the API response +type ResponseFormat string + +const ( + // FormatLegacy uses base64-encoded full resource data (backward compatible) + FormatLegacy ResponseFormat = "legacy" + // FormatDiff uses diff-based format with significance scoring + FormatDiff ResponseFormat = "diff" +) + // AnalyzeInput defines input parameters for root cause analysis type AnalyzeInput struct { ResourceUID string @@ -30,6 +40,7 @@ type AnalyzeInput struct { LookbackNs int64 // Lookback window in nanoseconds (default: 10 minutes) MaxDepth int MinConfidence float64 + Format ResponseFormat // Response format: "legacy" or "diff" (default: "diff") } // Analyze performs root cause analysis using the causality-first approach. 
@@ -165,6 +176,29 @@ func (a *RootCauseAnalyzer) Analyze(ctx context.Context, input AnalyzeInput) (*R a.logger.Info("Analysis completed in %v - degraded=%v, symptom_only=%v, confidence=%.2f", totalDuration, quality.IsDegraded, quality.IsSymptomOnly, confidence.Score) + // Apply format-specific transformations + format := input.Format + if format == "" { + format = FormatDiff // Default to new format + } + + if format == FormatDiff { + a.logger.Debug("Applying diff format transformations") + // Extract error patterns from symptom for correlation + errorPatterns := ExtractErrorPatterns(symptom.ErrorMessage) + + // Apply significance scoring and diff conversion to all nodes + a.applyDiffFormat(&graph, time.Unix(0, input.FailureTimestamp), errorPatterns) + + // Also process root cause event + if rootCause.ChangeEvent.Data != nil { + rootCause.ChangeEvent.Significance = CalculateChangeEventSignificance( + &rootCause.ChangeEvent, true, time.Unix(0, input.FailureTimestamp), errorPatterns, + ) + ConvertSingleEventToDiff(&rootCause.ChangeEvent, nil, true) + } + } + return &RootCauseAnalysisV2{ Incident: IncidentAnalysis{ ObservedSymptom: *symptom, @@ -183,3 +217,24 @@ func (a *RootCauseAnalyzer) Analyze(ctx context.Context, input AnalyzeInput) (*R }, }, nil } + +// applyDiffFormat applies significance scoring and diff conversion to all graph nodes +func (a *RootCauseAnalyzer) applyDiffFormat(graph *CausalGraph, failureTime time.Time, errorPatterns []string) { + for i := range graph.Nodes { + node := &graph.Nodes[i] + isOnSpine := node.NodeType == "SPINE" + + // Score all events + ScoreEvents(node, isOnSpine, failureTime, errorPatterns) + + // Convert AllEvents to diff format + if len(node.AllEvents) > 0 { + node.AllEvents = ConvertEventsToDiffFormat(node.AllEvents, true) + } + + // Convert ChangeEvent to diff format + if node.ChangeEvent != nil && node.ChangeEvent.Data != nil { + ConvertSingleEventToDiff(node.ChangeEvent, nil, true) + } + } +} diff --git 
a/internal/analysis/event_significance.go b/internal/analysis/event_significance.go new file mode 100644 index 0000000..57fdd81 --- /dev/null +++ b/internal/analysis/event_significance.go @@ -0,0 +1,272 @@ +package analysis + +import ( + "strings" + "time" +) + +// ============================================================================ +// EVENT SIGNIFICANCE WEIGHTS +// ============================================================================ +// These weights determine the relative importance of different factors +// when calculating event significance scores. + +const ( + // EventWeightCausalSpine is the weight for events on the causal path. + // Events on the causal spine are most relevant for root cause analysis. + EventWeightCausalSpine = 0.15 + + // EventWeightConfigChange is the weight for spec/config modifications. + // Configuration changes are high-signal events. + EventWeightConfigChange = 0.35 + + // EventWeightStatusChange is the weight for status changes. + // Status changes are less impactful but still relevant. + EventWeightStatusChange = 0.05 + + // EventWeightTemporalProximity is the weight for temporal closeness to failure. + // Events closer to the failure are more likely to be relevant. + EventWeightTemporalProximity = 0.25 + + // EventWeightErrorCorrelation is the weight for error pattern matching. + // Events that correlate with the error message are important. + EventWeightErrorCorrelation = 0.15 + + // EventWeightEventType is the weight for event type significance. + // DELETE and CREATE events have higher impact than UPDATE. + EventWeightEventType = 0.05 +) + +// SignificantK8sEventReasons maps Kubernetes event reasons to their significance boosts. +// These represent well-known failure indicators in Kubernetes. 
+var SignificantK8sEventReasons = map[string]float64{
+	// Critical failures
+	"FailedScheduling":   0.5,
+	"ImagePullBackOff":   0.5,
+	"CrashLoopBackOff":   0.5,
+	"OOMKilled":          0.5,
+	"Failed":             0.4,
+	"Unhealthy":          0.4,
+	"BackOff":            0.4,
+	"FailedMount":        0.4,
+	"FailedAttachVolume": 0.4,
+	"NodeNotReady":       0.4,
+	"NetworkNotReady":    0.4,
+
+	// Resource issues
+	"Evicted":             0.3,
+	"FreeDiskSpaceFailed": 0.3,
+	"InsufficientMemory":  0.3,
+	"InsufficientCPU":     0.3,
+
+	// Normal operational events with lower significance
+	"Killing":          0.2,
+	"Preempting":       0.2,
+	"SuccessfulCreate": 0.1,
+	"Scheduled":        0.1,
+	"Pulled":           0.05,
+	"Started":          0.05,
+	"Created":          0.05,
+}
+
+// CalculateChangeEventSignificance scores a change event based on multiple factors.
+// The score is a weighted combination of (percentages match the EventWeight* constants):
+// - Causal spine position (15%): Is this event on the causal path?
+// - Config change (35%): Does this event modify the resource spec?
+// - Temporal proximity (25%): How close is this event to the failure time?
+// - Error correlation (15%): Does this event correlate with error patterns?
+// - Event type (5%): Is this a DELETE/CREATE vs UPDATE event?
+func CalculateChangeEventSignificance(
+	event *ChangeEventInfo,
+	isOnCausalSpine bool,
+	failureTime time.Time,
+	errorPatterns []string,
+) *EventSignificance {
+	score := 0.0
+	reasons := []string{}
+
+	// Factor 1: Causal spine position (15%)
+	if isOnCausalSpine {
+		score += EventWeightCausalSpine
+		reasons = append(reasons, "on causal path")
+	}
+
+	// Factor 2: Config change (35%)
+	if event.ConfigChanged {
+		score += EventWeightConfigChange
+		reasons = append(reasons, "spec changed")
+	} else if event.StatusChanged {
+		// Status changes are less impactful but still relevant
+		score += EventWeightConfigChange * 0.15
+		reasons = append(reasons, "status changed")
+	}
+
+	// Factor 3: Temporal proximity (25%)
+	if !failureTime.IsZero() {
+		timeDelta := failureTime.Sub(event.Timestamp)
+		if timeDelta < 0 {
+			timeDelta = -timeDelta // Handle future events (shouldn't happen but be safe)
+		}
+
+		if timeDelta < 5*time.Minute {
+			score += EventWeightTemporalProximity
+			reasons = append(reasons, "within 5min of failure")
+		} else if timeDelta < 30*time.Minute {
+			score += EventWeightTemporalProximity * 0.5
+			reasons = append(reasons, "within 30min of failure")
+		} else if timeDelta < time.Hour {
+			score += EventWeightTemporalProximity * 0.2
+		}
+	}
+
+	// Factor 4: Error correlation (15%)
+	if len(errorPatterns) > 0 && event.Description != "" {
+		descLower := strings.ToLower(event.Description)
+		for _, pattern := range errorPatterns {
+			if strings.Contains(descLower, strings.ToLower(pattern)) {
+				score += EventWeightErrorCorrelation
+				reasons = append(reasons, "matches error pattern")
+				break
+			}
+		}
+	}
+
+	// Factor 5: Event type significance (5%)
+	switch event.EventType {
+	case "DELETE":
+		score += EventWeightEventType
+		reasons = append(reasons, "resource deleted")
+	case "CREATE":
+		score += EventWeightEventType * 0.6
+		reasons = append(reasons, "resource created")
+	}
+
+	// Normalize score to [0, 1]
+	if score > 1.0 {
+		score = 1.0
+	}
+
+	return 
&EventSignificance{ + Score: score, + Reasons: reasons, + } +} + +// CalculateK8sEventSignificance scores a Kubernetes event based on its type, +// reason, and relation to the failure. +func CalculateK8sEventSignificance( + event *K8sEventInfo, + isOnCausalSpine bool, + failureTime time.Time, +) *EventSignificance { + score := 0.0 + reasons := []string{} + + // Warning events are more significant than Normal events + if event.Type == "Warning" { + score += 0.3 + reasons = append(reasons, "warning event") + } else if event.Type == "Error" { + score += 0.4 + reasons = append(reasons, "error event") + } + + // Check for known significant reasons + if boost, ok := SignificantK8sEventReasons[event.Reason]; ok { + score += boost + reasons = append(reasons, event.Reason) + } + + // Causal spine boost + if isOnCausalSpine { + score += 0.15 + reasons = append(reasons, "on causal path") + } + + // Temporal proximity boost + if !failureTime.IsZero() { + timeDelta := failureTime.Sub(event.Timestamp) + if timeDelta < 0 { + timeDelta = -timeDelta + } + + if timeDelta < 5*time.Minute { + score += 0.1 + } else if timeDelta < 30*time.Minute { + score += 0.05 + } + } + + // High event count indicates persistent issues + if event.Count > 5 { + score += 0.1 + reasons = append(reasons, "repeated event") + } + + // Normalize score to [0, 1] + if score > 1.0 { + score = 1.0 + } + + return &EventSignificance{ + Score: score, + Reasons: reasons, + } +} + +// ExtractErrorPatterns extracts keywords from an error message that can be used +// for correlation with events. 
+func ExtractErrorPatterns(errorMessage string) []string { + if errorMessage == "" { + return nil + } + + // Common error keywords to look for in events + keywords := []string{ + "image", "pull", "config", "configmap", "secret", "volume", "mount", + "permission", "timeout", "connection", "refused", "denied", + "memory", "cpu", "oom", "killed", "crash", "restart", + "unhealthy", "probe", "liveness", "readiness", + "schedule", "node", "taint", "affinity", + } + + errorLower := strings.ToLower(errorMessage) + var patterns []string + + for _, kw := range keywords { + if strings.Contains(errorLower, kw) { + patterns = append(patterns, kw) + } + } + + return patterns +} + +// ScoreEvents applies significance scoring to all events in a graph node. +// This is typically called during graph building when the format is "diff". +func ScoreEvents( + node *GraphNode, + isOnCausalSpine bool, + failureTime time.Time, + errorPatterns []string, +) { + // Score change events + if node.ChangeEvent != nil { + node.ChangeEvent.Significance = CalculateChangeEventSignificance( + node.ChangeEvent, isOnCausalSpine, failureTime, errorPatterns, + ) + } + + for i := range node.AllEvents { + node.AllEvents[i].Significance = CalculateChangeEventSignificance( + &node.AllEvents[i], isOnCausalSpine, failureTime, errorPatterns, + ) + } + + // Score K8s events + for i := range node.K8sEvents { + node.K8sEvents[i].Significance = CalculateK8sEventSignificance( + &node.K8sEvents[i], isOnCausalSpine, failureTime, + ) + } +} diff --git a/internal/analysis/event_significance_test.go b/internal/analysis/event_significance_test.go new file mode 100644 index 0000000..d8e3aec --- /dev/null +++ b/internal/analysis/event_significance_test.go @@ -0,0 +1,660 @@ +package analysis + +import ( + "testing" + "time" +) + +func TestCalculateChangeEventSignificance(t *testing.T) { + now := time.Now() + + tests := []struct { + name string + event *ChangeEventInfo + isOnCausalSpine bool + failureTime time.Time + 
errorPatterns []string + minScore float64 + maxScore float64 + expectReasons []string + }{ + { + name: "config change on causal spine within 5 minutes", + event: &ChangeEventInfo{ + EventID: "1", + Timestamp: now.Add(-2 * time.Minute), + EventType: "UPDATE", + ConfigChanged: true, + }, + isOnCausalSpine: true, + failureTime: now, + errorPatterns: nil, + minScore: 0.7, + maxScore: 1.0, + expectReasons: []string{"on causal path", "spec changed", "within 5min of failure"}, + }, + { + name: "status change only", + event: &ChangeEventInfo{ + EventID: "2", + Timestamp: now.Add(-10 * time.Minute), + EventType: "UPDATE", + StatusChanged: true, + }, + isOnCausalSpine: false, + failureTime: now, + errorPatterns: nil, + minScore: 0.05, + maxScore: 0.3, + expectReasons: []string{"status changed"}, + }, + { + name: "DELETE event on spine", + event: &ChangeEventInfo{ + EventID: "3", + Timestamp: now.Add(-1 * time.Minute), + EventType: "DELETE", + }, + isOnCausalSpine: true, + failureTime: now, + errorPatterns: nil, + minScore: 0.3, + maxScore: 0.5, + expectReasons: []string{"on causal path", "within 5min of failure", "resource deleted"}, + }, + { + name: "CREATE event", + event: &ChangeEventInfo{ + EventID: "4", + Timestamp: now.Add(-3 * time.Minute), + EventType: "CREATE", + }, + isOnCausalSpine: false, + failureTime: now, + errorPatterns: nil, + minScore: 0.2, + maxScore: 0.4, + expectReasons: []string{"within 5min of failure", "resource created"}, + }, + { + name: "error pattern match", + event: &ChangeEventInfo{ + EventID: "5", + Timestamp: now.Add(-4 * time.Minute), + EventType: "UPDATE", + Description: "ImagePullBackOff: failed to pull image", + }, + isOnCausalSpine: false, + failureTime: now, + errorPatterns: []string{"image", "pull"}, + minScore: 0.3, + maxScore: 0.6, + expectReasons: []string{"within 5min of failure", "matches error pattern"}, + }, + { + name: "event within 30 minutes", + event: &ChangeEventInfo{ + EventID: "6", + Timestamp: now.Add(-20 * time.Minute), 
+ EventType: "UPDATE", + ConfigChanged: true, + }, + isOnCausalSpine: false, + failureTime: now, + errorPatterns: nil, + minScore: 0.4, + maxScore: 0.6, + expectReasons: []string{"spec changed", "within 30min of failure"}, + }, + { + name: "event older than 1 hour", + event: &ChangeEventInfo{ + EventID: "7", + Timestamp: now.Add(-2 * time.Hour), + EventType: "UPDATE", + }, + isOnCausalSpine: false, + failureTime: now, + errorPatterns: nil, + minScore: 0.0, + maxScore: 0.1, + expectReasons: []string{}, + }, + { + name: "all factors combined - high score", + event: &ChangeEventInfo{ + EventID: "8", + Timestamp: now.Add(-1 * time.Minute), + EventType: "DELETE", + ConfigChanged: true, + Description: "Pod deleted due to OOM killed", + }, + isOnCausalSpine: true, + failureTime: now, + errorPatterns: []string{"oom", "killed"}, + minScore: 0.9, + maxScore: 1.0, + expectReasons: []string{"on causal path", "spec changed", "within 5min of failure", "matches error pattern", "resource deleted"}, + }, + { + name: "zero failure time", + event: &ChangeEventInfo{ + EventID: "9", + Timestamp: now, + EventType: "UPDATE", + ConfigChanged: true, + }, + isOnCausalSpine: true, + failureTime: time.Time{}, // zero time + errorPatterns: nil, + minScore: 0.4, + maxScore: 0.6, + expectReasons: []string{"on causal path", "spec changed"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := CalculateChangeEventSignificance( + tt.event, + tt.isOnCausalSpine, + tt.failureTime, + tt.errorPatterns, + ) + + if result == nil { + t.Fatal("expected non-nil result") + } + + if result.Score < tt.minScore || result.Score > tt.maxScore { + t.Errorf("score %.2f not in expected range [%.2f, %.2f]", result.Score, tt.minScore, tt.maxScore) + } + + for _, expectedReason := range tt.expectReasons { + found := false + for _, reason := range result.Reasons { + if reason == expectedReason { + found = true + break + } + } + if !found { + t.Errorf("expected reason %q not found in 
%v", expectedReason, result.Reasons) + } + } + }) + } +} + +func TestCalculateK8sEventSignificance(t *testing.T) { + now := time.Now() + + tests := []struct { + name string + event *K8sEventInfo + isOnCausalSpine bool + failureTime time.Time + minScore float64 + maxScore float64 + expectReasons []string + }{ + { + name: "Warning event with CrashLoopBackOff", + event: &K8sEventInfo{ + EventID: "1", + Timestamp: now.Add(-2 * time.Minute), + Reason: "CrashLoopBackOff", + Type: "Warning", + Count: 3, + }, + isOnCausalSpine: true, + failureTime: now, + minScore: 0.9, + maxScore: 1.0, + expectReasons: []string{"warning event", "CrashLoopBackOff", "on causal path"}, + }, + { + name: "Warning event with ImagePullBackOff", + event: &K8sEventInfo{ + EventID: "2", + Timestamp: now.Add(-1 * time.Minute), + Reason: "ImagePullBackOff", + Type: "Warning", + Count: 5, + }, + isOnCausalSpine: false, + failureTime: now, + minScore: 0.8, + maxScore: 1.0, + expectReasons: []string{"warning event", "ImagePullBackOff"}, + }, + { + name: "Error event with OOMKilled", + event: &K8sEventInfo{ + EventID: "3", + Timestamp: now.Add(-30 * time.Second), + Reason: "OOMKilled", + Type: "Error", + Count: 1, + }, + isOnCausalSpine: true, + failureTime: now, + minScore: 0.9, + maxScore: 1.0, + expectReasons: []string{"error event", "OOMKilled", "on causal path"}, + }, + { + name: "Normal event with Scheduled", + event: &K8sEventInfo{ + EventID: "4", + Timestamp: now.Add(-5 * time.Minute), + Reason: "Scheduled", + Type: "Normal", + Count: 1, + }, + isOnCausalSpine: false, + failureTime: now, + minScore: 0.1, + maxScore: 0.3, + expectReasons: []string{"Scheduled"}, + }, + { + name: "High count event", + event: &K8sEventInfo{ + EventID: "5", + Timestamp: now.Add(-10 * time.Minute), + Reason: "BackOff", + Type: "Warning", + Count: 10, + }, + isOnCausalSpine: false, + failureTime: now, + minScore: 0.7, + maxScore: 1.0, + expectReasons: []string{"warning event", "BackOff", "repeated event"}, + }, + { + 
name: "FailedScheduling event", + event: &K8sEventInfo{ + EventID: "6", + Timestamp: now.Add(-3 * time.Minute), + Reason: "FailedScheduling", + Type: "Warning", + Count: 2, + }, + isOnCausalSpine: true, + failureTime: now, + minScore: 0.9, + maxScore: 1.0, + expectReasons: []string{"warning event", "FailedScheduling", "on causal path"}, + }, + { + name: "Unknown reason", + event: &K8sEventInfo{ + EventID: "7", + Timestamp: now.Add(-5 * time.Minute), + Reason: "UnknownReason", + Type: "Normal", + Count: 1, + }, + isOnCausalSpine: false, + failureTime: now, + minScore: 0.0, + maxScore: 0.2, + expectReasons: []string{}, + }, + { + name: "Event within 30 minutes", + event: &K8sEventInfo{ + EventID: "8", + Timestamp: now.Add(-20 * time.Minute), + Reason: "Unhealthy", + Type: "Warning", + Count: 3, + }, + isOnCausalSpine: false, + failureTime: now, + minScore: 0.7, + maxScore: 0.9, + expectReasons: []string{"warning event", "Unhealthy"}, + }, + { + name: "FailedMount event", + event: &K8sEventInfo{ + EventID: "9", + Timestamp: now.Add(-2 * time.Minute), + Reason: "FailedMount", + Type: "Warning", + Count: 2, + }, + isOnCausalSpine: false, + failureTime: now, + minScore: 0.7, + maxScore: 0.9, + expectReasons: []string{"warning event", "FailedMount"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := CalculateK8sEventSignificance( + tt.event, + tt.isOnCausalSpine, + tt.failureTime, + ) + + if result == nil { + t.Fatal("expected non-nil result") + } + + if result.Score < tt.minScore || result.Score > tt.maxScore { + t.Errorf("score %.2f not in expected range [%.2f, %.2f]", result.Score, tt.minScore, tt.maxScore) + } + + for _, expectedReason := range tt.expectReasons { + found := false + for _, reason := range result.Reasons { + if reason == expectedReason { + found = true + break + } + } + if !found { + t.Errorf("expected reason %q not found in %v", expectedReason, result.Reasons) + } + } + }) + } +} + +func 
TestExtractErrorPatterns(t *testing.T) { + tests := []struct { + name string + errorMessage string + expectPattern []string + expectNil bool + }{ + { + name: "empty message", + errorMessage: "", + expectPattern: nil, + expectNil: true, + }, + { + name: "image pull error", + errorMessage: "Failed to pull image: unauthorized", + expectPattern: []string{"image", "pull"}, + }, + { + name: "OOM killed", + errorMessage: "Container was OOM killed due to memory pressure", + expectPattern: []string{"memory", "oom", "killed"}, + }, + { + name: "config error", + errorMessage: "ConfigMap not found: app-config", + expectPattern: []string{"config", "configmap"}, + }, + { + name: "volume mount error", + errorMessage: "Failed to mount volume: permission denied", + expectPattern: []string{"volume", "mount", "permission", "denied"}, + }, + { + name: "scheduling error", + errorMessage: "0/3 nodes are available: node affinity mismatch", + expectPattern: []string{"node", "affinity"}, + }, + { + name: "connection error", + errorMessage: "Connection refused to service endpoint", + expectPattern: []string{"connection", "refused"}, + }, + { + name: "probe failure", + errorMessage: "Liveness probe failed: unhealthy", + expectPattern: []string{"liveness", "probe", "unhealthy"}, + }, + { + name: "crash loop", + errorMessage: "Back-off restarting crashed container", + expectPattern: []string{"crash", "restart"}, + }, + { + name: "secret error", + errorMessage: "Secret not found: api-credentials", + expectPattern: []string{"secret"}, + }, + { + name: "timeout error", + errorMessage: "Request timeout after 30s", + expectPattern: []string{"timeout"}, + }, + { + name: "case insensitive", + errorMessage: "IMAGE PULL failed, CONFIG error", + expectPattern: []string{"image", "pull", "config"}, + }, + { + name: "no matching patterns", + errorMessage: "Unknown error occurred", + expectPattern: []string{}, + expectNil: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { 
+ result := ExtractErrorPatterns(tt.errorMessage) + + if tt.expectNil { + if result != nil { + t.Errorf("expected nil, got %v", result) + } + return + } + + for _, expected := range tt.expectPattern { + found := false + for _, pattern := range result { + if pattern == expected { + found = true + break + } + } + if !found { + t.Errorf("expected pattern %q not found in %v", expected, result) + } + } + }) + } +} + +func TestScoreEvents(t *testing.T) { + now := time.Now() + + tests := []struct { + name string + node *GraphNode + isOnCausalSpine bool + failureTime time.Time + errorPatterns []string + }{ + { + name: "scores all event types", + node: &GraphNode{ + ID: "test-node", + ChangeEvent: &ChangeEventInfo{ + EventID: "change-1", + Timestamp: now.Add(-2 * time.Minute), + EventType: "UPDATE", + ConfigChanged: true, + }, + AllEvents: []ChangeEventInfo{ + { + EventID: "change-2", + Timestamp: now.Add(-5 * time.Minute), + EventType: "CREATE", + StatusChanged: true, + }, + }, + K8sEvents: []K8sEventInfo{ + { + EventID: "k8s-1", + Timestamp: now.Add(-1 * time.Minute), + Reason: "BackOff", + Type: "Warning", + Count: 3, + }, + }, + }, + isOnCausalSpine: true, + failureTime: now, + errorPatterns: []string{"error"}, + }, + { + name: "handles nil ChangeEvent", + node: &GraphNode{ + ID: "test-node", + ChangeEvent: nil, + AllEvents: []ChangeEventInfo{ + { + EventID: "change-1", + Timestamp: now, + EventType: "UPDATE", + }, + }, + }, + isOnCausalSpine: false, + failureTime: now, + errorPatterns: nil, + }, + { + name: "handles empty events", + node: &GraphNode{ + ID: "test-node", + ChangeEvent: nil, + AllEvents: nil, + K8sEvents: nil, + }, + isOnCausalSpine: false, + failureTime: now, + errorPatterns: nil, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Should not panic + ScoreEvents(tt.node, tt.isOnCausalSpine, tt.failureTime, tt.errorPatterns) + + // Verify significance is set + if tt.node.ChangeEvent != nil { + if 
tt.node.ChangeEvent.Significance == nil { + t.Error("expected ChangeEvent.Significance to be set") + } + } + + for i, evt := range tt.node.AllEvents { + if evt.Significance == nil { + t.Errorf("expected AllEvents[%d].Significance to be set", i) + } + } + + for i, evt := range tt.node.K8sEvents { + if evt.Significance == nil { + t.Errorf("expected K8sEvents[%d].Significance to be set", i) + } + } + }) + } +} + +func TestSignificantK8sEventReasons(t *testing.T) { + // Test that all expected reasons are in the map with reasonable values + criticalReasons := []string{ + "FailedScheduling", + "ImagePullBackOff", + "CrashLoopBackOff", + "OOMKilled", + } + + for _, reason := range criticalReasons { + boost, ok := SignificantK8sEventReasons[reason] + if !ok { + t.Errorf("critical reason %q not found in SignificantK8sEventReasons", reason) + continue + } + if boost < 0.4 { + t.Errorf("critical reason %q has low boost %.2f, expected >= 0.4", reason, boost) + } + } + + // Test that normal events have low boosts + normalReasons := []string{ + "Pulled", + "Started", + "Created", + } + + for _, reason := range normalReasons { + boost, ok := SignificantK8sEventReasons[reason] + if !ok { + t.Errorf("normal reason %q not found in SignificantK8sEventReasons", reason) + continue + } + if boost > 0.2 { + t.Errorf("normal reason %q has high boost %.2f, expected <= 0.2", reason, boost) + } + } +} + +func TestScoreNormalization(t *testing.T) { + now := time.Now() + + // Create an event that would have very high score without normalization + event := &ChangeEventInfo{ + EventID: "1", + Timestamp: now.Add(-30 * time.Second), + EventType: "DELETE", + ConfigChanged: true, + StatusChanged: true, + Description: "Failed to pull image, config error, memory OOM", + } + + result := CalculateChangeEventSignificance( + event, + true, // on causal spine + now, + []string{"image", "config", "memory", "oom"}, + ) + + if result.Score > 1.0 { + t.Errorf("score %.2f exceeds 1.0, normalization failed", 
result.Score) + } + if result.Score < 0.0 { + t.Errorf("score %.2f is negative, normalization failed", result.Score) + } +} + +func TestK8sScoreNormalization(t *testing.T) { + now := time.Now() + + // Create a K8s event that would have very high score without normalization + event := &K8sEventInfo{ + EventID: "1", + Timestamp: now.Add(-30 * time.Second), + Reason: "CrashLoopBackOff", + Type: "Error", + Count: 100, + } + + result := CalculateK8sEventSignificance(event, true, now) + + if result.Score > 1.0 { + t.Errorf("score %.2f exceeds 1.0, normalization failed", result.Score) + } + if result.Score < 0.0 { + t.Errorf("score %.2f is negative, normalization failed", result.Score) + } +} diff --git a/internal/analysis/json_diff.go b/internal/analysis/json_diff.go new file mode 100644 index 0000000..433997e --- /dev/null +++ b/internal/analysis/json_diff.go @@ -0,0 +1,255 @@ +package analysis + +import ( + "encoding/json" + "reflect" + "sort" + "strings" +) + +// ComputeJSONDiff calculates the differences between two JSON byte slices. +// Returns a slice of EventDiff representing the changes from old to new. +func ComputeJSONDiff(oldData, newData []byte) ([]EventDiff, error) { + var oldObj, newObj map[string]any + + if len(oldData) > 0 { + if err := json.Unmarshal(oldData, &oldObj); err != nil { + return nil, err + } + } + if len(newData) > 0 { + if err := json.Unmarshal(newData, &newObj); err != nil { + return nil, err + } + } + + diffs := diffMaps("", oldObj, newObj) + + // Sort diffs by path for consistent ordering + sort.Slice(diffs, func(i, j int) bool { + return diffs[i].Path < diffs[j].Path + }) + + return diffs, nil +} + +// ParseJSONToMap parses JSON bytes into a map for use in FullSnapshot field. 
+func ParseJSONToMap(data []byte) (map[string]any, error) { + if len(data) == 0 { + return nil, nil + } + var obj map[string]any + if err := json.Unmarshal(data, &obj); err != nil { + return nil, err + } + return obj, nil +} + +// diffMaps recursively computes differences between two maps. +func diffMaps(prefix string, old, new map[string]any) []EventDiff { + var diffs []EventDiff + + // Track keys we've seen + seen := make(map[string]bool) + + // Check for removed/changed keys in old + for k, oldVal := range old { + seen[k] = true + path := joinPath(prefix, k) + newVal, exists := new[k] + + if !exists { + // Key removed + diffs = append(diffs, EventDiff{ + Path: path, + OldValue: simplifyValue(oldVal), + Op: "remove", + }) + } else if !deepEqual(oldVal, newVal) { + // Value changed - check if we should recurse + oldMap, oldIsMap := oldVal.(map[string]any) + newMap, newIsMap := newVal.(map[string]any) + + if oldIsMap && newIsMap { + // Recurse into nested objects + diffs = append(diffs, diffMaps(path, oldMap, newMap)...) + } else { + // Different types or non-objects + diffs = append(diffs, EventDiff{ + Path: path, + OldValue: simplifyValue(oldVal), + NewValue: simplifyValue(newVal), + Op: "replace", + }) + } + } + } + + // Check for added keys in new + for k, newVal := range new { + if !seen[k] { + path := joinPath(prefix, k) + diffs = append(diffs, EventDiff{ + Path: path, + NewValue: simplifyValue(newVal), + Op: "add", + }) + } + } + + return diffs +} + +// joinPath concatenates path segments with dot notation. +func joinPath(prefix, key string) string { + if prefix == "" { + return key + } + return prefix + "." + key +} + +// deepEqual checks if two values are deeply equal. +func deepEqual(a, b any) bool { + return reflect.DeepEqual(a, b) +} + +// simplifyValue converts complex values for diff output. +// For large arrays/objects, it may return a summary to keep diffs readable. 
+func simplifyValue(v any) any { + if v == nil { + return nil + } + + switch val := v.(type) { + case map[string]any: + // For large maps, summarize + if len(val) > 10 { + return map[string]any{ + "_type": "object", + "_keys": len(val), + "_value": val, + } + } + return val + case []any: + // For large arrays, summarize + if len(val) > 10 { + return map[string]any{ + "_type": "array", + "_length": len(val), + "_value": val, + } + } + return val + default: + return v + } +} + +// FilterNoisyPaths removes paths that are typically noisy and not useful for LLM analysis. +// This includes managed fields, resourceVersion, and other auto-generated fields. +func FilterNoisyPaths(diffs []EventDiff) []EventDiff { + noisyPrefixes := []string{ + "metadata.managedFields", + "metadata.resourceVersion", + "metadata.generation", + "metadata.uid", + "metadata.creationTimestamp", + "status.observedGeneration", + } + + var filtered []EventDiff + for _, d := range diffs { + isNoisy := false + for _, prefix := range noisyPrefixes { + if strings.HasPrefix(d.Path, prefix) { + isNoisy = true + break + } + } + if !isNoisy { + filtered = append(filtered, d) + } + } + return filtered +} + +// ConvertEventsToDiffFormat converts a slice of events from legacy format (Data field) +// to diff format (Diff/FullSnapshot fields). +// The first event (chronologically oldest) gets FullSnapshot, subsequent events get Diff from previous. +// IMPORTANT: Input events are assumed to be in REVERSE chronological order (newest first), +// so we reverse them before processing. 
+func ConvertEventsToDiffFormat(events []ChangeEventInfo, filterNoisy bool) []ChangeEventInfo { + if len(events) == 0 { + return events + } + + // Reverse events to get chronological order (oldest first) + reversed := make([]ChangeEventInfo, len(events)) + for i, event := range events { + reversed[len(events)-1-i] = event + } + + result := make([]ChangeEventInfo, len(events)) + var prevData []byte + + for i, event := range reversed { + result[i] = event + result[i].Data = nil // Clear legacy field + + if i == 0 { + // First event (chronologically oldest) gets full snapshot + snapshot, err := ParseJSONToMap(event.Data) + if err == nil && snapshot != nil { + result[i].FullSnapshot = snapshot + } + prevData = event.Data + } else { + // Subsequent events get diff from previous + diffs, err := ComputeJSONDiff(prevData, event.Data) + if err == nil { + if filterNoisy { + diffs = FilterNoisyPaths(diffs) + } + result[i].Diff = diffs + } + prevData = event.Data + } + } + + // Reverse back to original order (newest first) + final := make([]ChangeEventInfo, len(events)) + for i := range result { + final[len(result)-1-i] = result[i] + } + + return final +} + +// ConvertSingleEventToDiff converts a single event to diff format given previous data. +// Returns the modified event with Diff field populated. 
+func ConvertSingleEventToDiff(event *ChangeEventInfo, prevData []byte, filterNoisy bool) { + if event == nil { + return + } + + if len(prevData) == 0 { + // No previous data, this is effectively a CREATE + snapshot, err := ParseJSONToMap(event.Data) + if err == nil && snapshot != nil { + event.FullSnapshot = snapshot + } + } else { + // Compute diff from previous + diffs, err := ComputeJSONDiff(prevData, event.Data) + if err == nil { + if filterNoisy { + diffs = FilterNoisyPaths(diffs) + } + event.Diff = diffs + } + } + + // Clear legacy field in diff format + event.Data = nil +} diff --git a/internal/analysis/json_diff_test.go b/internal/analysis/json_diff_test.go new file mode 100644 index 0000000..a6fefde --- /dev/null +++ b/internal/analysis/json_diff_test.go @@ -0,0 +1,566 @@ +package analysis + +import ( + "testing" + "time" +) + +func TestComputeJSONDiff(t *testing.T) { + tests := []struct { + name string + oldJSON string + newJSON string + expected []EventDiff + wantErr bool + }{ + { + name: "simple field change", + oldJSON: `{"spec":{"replicas":1}}`, + newJSON: `{"spec":{"replicas":3}}`, + expected: []EventDiff{ + {Path: "spec.replicas", OldValue: float64(1), NewValue: float64(3), Op: "replace"}, + }, + }, + { + name: "field added", + oldJSON: `{"metadata":{"name":"test"}}`, + newJSON: `{"metadata":{"name":"test","namespace":"default"}}`, + expected: []EventDiff{ + {Path: "metadata.namespace", NewValue: "default", Op: "add"}, + }, + }, + { + name: "field removed", + oldJSON: `{"metadata":{"name":"test","labels":{"app":"web"}}}`, + newJSON: `{"metadata":{"name":"test"}}`, + expected: []EventDiff{ + {Path: "metadata.labels", OldValue: map[string]any{"app": "web"}, Op: "remove"}, + }, + }, + { + name: "nested object change", + oldJSON: `{"spec":{"template":{"spec":{"containers":[{"name":"app","image":"v1"}]}}}}`, + newJSON: `{"spec":{"template":{"spec":{"containers":[{"name":"app","image":"v2"}]}}}}`, + expected: []EventDiff{ + { + Path: 
"spec.template.spec.containers", + OldValue: []any{map[string]any{"name": "app", "image": "v1"}}, + NewValue: []any{map[string]any{"name": "app", "image": "v2"}}, + Op: "replace", + }, + }, + }, + { + name: "multiple changes", + oldJSON: `{"spec":{"replicas":1,"image":"v1"},"status":{"ready":true}}`, + newJSON: `{"spec":{"replicas":3,"image":"v2"},"status":{"ready":false}}`, + expected: []EventDiff{ + {Path: "spec.image", OldValue: "v1", NewValue: "v2", Op: "replace"}, + {Path: "spec.replicas", OldValue: float64(1), NewValue: float64(3), Op: "replace"}, + {Path: "status.ready", OldValue: true, NewValue: false, Op: "replace"}, + }, + }, + { + name: "spec changes", + oldJSON: `{"spec":{"values": {"a": "f", "x": "y"}}}`, + newJSON: `{"spec":{"values": {"x": "y"}}}`, + expected: []EventDiff{ + {Path: "spec.values.a", Op: "remove"}, + }, + }, + { + name: "empty old (new resource)", + oldJSON: ``, + newJSON: `{"metadata":{"name":"test"}}`, + expected: []EventDiff{ + {Path: "metadata", NewValue: map[string]any{"name": "test"}, Op: "add"}, + }, + }, + { + name: "empty new (deleted resource)", + oldJSON: `{"metadata":{"name":"test"}}`, + newJSON: ``, + expected: []EventDiff{ + {Path: "metadata", OldValue: map[string]any{"name": "test"}, Op: "remove"}, + }, + }, + { + name: "both empty", + oldJSON: ``, + newJSON: ``, + expected: nil, + }, + { + name: "identical objects", + oldJSON: `{"spec":{"replicas":3}}`, + newJSON: `{"spec":{"replicas":3}}`, + expected: nil, + }, + { + name: "type change", + oldJSON: `{"value":"string"}`, + newJSON: `{"value":123}`, + expected: []EventDiff{ + {Path: "value", OldValue: "string", NewValue: float64(123), Op: "replace"}, + }, + }, + { + name: "invalid old JSON", + oldJSON: `{invalid}`, + newJSON: `{"valid":true}`, + wantErr: true, + }, + { + name: "invalid new JSON", + oldJSON: `{"valid":true}`, + newJSON: `{invalid}`, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + diffs, err := 
ComputeJSONDiff([]byte(tt.oldJSON), []byte(tt.newJSON)) + + if tt.wantErr { + if err == nil { + t.Errorf("expected error but got none") + } + return + } + + if err != nil { + t.Errorf("unexpected error: %v", err) + return + } + + if len(diffs) != len(tt.expected) { + t.Errorf("expected %d diffs, got %d: %+v", len(tt.expected), len(diffs), diffs) + return + } + + for i, expected := range tt.expected { + if diffs[i].Path != expected.Path { + t.Errorf("diff[%d].Path = %q, want %q", i, diffs[i].Path, expected.Path) + } + if diffs[i].Op != expected.Op { + t.Errorf("diff[%d].Op = %q, want %q", i, diffs[i].Op, expected.Op) + } + } + }) + } +} + +func TestParseJSONToMap(t *testing.T) { + tests := []struct { + name string + data []byte + wantNil bool + wantErr bool + }{ + { + name: "valid JSON", + data: []byte(`{"key":"value"}`), + wantNil: false, + wantErr: false, + }, + { + name: "empty data", + data: []byte{}, + wantNil: true, + wantErr: false, + }, + { + name: "nil data", + data: nil, + wantNil: true, + wantErr: false, + }, + { + name: "invalid JSON", + data: []byte(`{invalid}`), + wantNil: false, + wantErr: true, + }, + { + name: "nested object", + data: []byte(`{"spec":{"replicas":3,"template":{"containers":[]}}}`), + wantNil: false, + wantErr: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := ParseJSONToMap(tt.data) + + if tt.wantErr { + if err == nil { + t.Errorf("expected error but got none") + } + return + } + + if err != nil { + t.Errorf("unexpected error: %v", err) + return + } + + if tt.wantNil && result != nil { + t.Errorf("expected nil result, got %v", result) + } + if !tt.wantNil && result == nil { + t.Errorf("expected non-nil result, got nil") + } + }) + } +} + +func TestFilterNoisyPaths(t *testing.T) { + tests := []struct { + name string + diffs []EventDiff + expected int // number of diffs after filtering + }{ + { + name: "filter managedFields", + diffs: []EventDiff{ + {Path: "metadata.managedFields", 
Op: "replace"}, + {Path: "spec.replicas", Op: "replace"}, + }, + expected: 1, + }, + { + name: "filter resourceVersion", + diffs: []EventDiff{ + {Path: "metadata.resourceVersion", Op: "replace"}, + {Path: "spec.image", Op: "replace"}, + }, + expected: 1, + }, + { + name: "filter multiple noisy paths", + diffs: []EventDiff{ + {Path: "metadata.managedFields.0", Op: "replace"}, + {Path: "metadata.generation", Op: "replace"}, + {Path: "metadata.uid", Op: "add"}, + {Path: "metadata.creationTimestamp", Op: "add"}, + {Path: "status.observedGeneration", Op: "replace"}, + {Path: "spec.replicas", Op: "replace"}, + {Path: "spec.template", Op: "replace"}, + }, + expected: 2, + }, + { + name: "no noisy paths", + diffs: []EventDiff{ + {Path: "spec.replicas", Op: "replace"}, + {Path: "spec.image", Op: "replace"}, + {Path: "metadata.labels", Op: "add"}, + }, + expected: 3, + }, + { + name: "empty diffs", + diffs: []EventDiff{}, + expected: 0, + }, + { + name: "nil diffs", + diffs: nil, + expected: 0, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := FilterNoisyPaths(tt.diffs) + if len(result) != tt.expected { + t.Errorf("expected %d diffs after filtering, got %d: %+v", tt.expected, len(result), result) + } + }) + } +} + +func TestConvertEventsToDiffFormat(t *testing.T) { + tests := []struct { + name string + events []ChangeEventInfo + filterNoisy bool + checkFirst func(t *testing.T, event ChangeEventInfo) + checkSecond func(t *testing.T, event ChangeEventInfo) + }{ + { + name: "converts first event to fullSnapshot", + events: []ChangeEventInfo{ + { + EventID: "1", + Timestamp: time.Now(), + EventType: "CREATE", + Data: []byte(`{"metadata":{"name":"test"},"spec":{"replicas":1}}`), + }, + }, + filterNoisy: false, + checkFirst: func(t *testing.T, event ChangeEventInfo) { + if event.FullSnapshot == nil { + t.Error("expected FullSnapshot to be set for first event") + } + if event.Data != nil { + t.Error("expected Data to be cleared") + } + if 
event.Diff != nil { + t.Error("expected Diff to be nil for first event") + } + }, + }, + { + name: "converts subsequent events to diff", + // Events in REVERSE chronological order (newest first) + events: []ChangeEventInfo{ + { + EventID: "2", + Timestamp: time.Now().Add(time.Minute), + EventType: "UPDATE", + Data: []byte(`{"spec":{"replicas":3}}`), + }, + { + EventID: "1", + Timestamp: time.Now(), + EventType: "CREATE", + Data: []byte(`{"spec":{"replicas":1}}`), + }, + }, + filterNoisy: false, + checkFirst: func(t *testing.T, event ChangeEventInfo) { + // First in output (newest) should have Diff + if event.Diff == nil { + t.Error("expected Diff to be set for first event (newest)") + } + if len(event.Diff) != 1 { + t.Errorf("expected 1 diff, got %d", len(event.Diff)) + } + if event.FullSnapshot != nil { + t.Error("expected FullSnapshot to be nil for newest event") + } + if event.Data != nil { + t.Error("expected Data to be cleared") + } + }, + checkSecond: func(t *testing.T, event ChangeEventInfo) { + // Second in output (oldest) should have FullSnapshot + if event.FullSnapshot == nil { + t.Error("expected FullSnapshot to be set for oldest event") + } + if event.Diff != nil { + t.Error("expected Diff to be nil for oldest event") + } + if event.Data != nil { + t.Error("expected Data to be cleared") + } + }, + }, + { + name: "filters noisy paths when enabled", + // Events in REVERSE chronological order (newest first) + events: []ChangeEventInfo{ + { + EventID: "2", + Timestamp: time.Now().Add(time.Minute), + EventType: "UPDATE", + Data: []byte(`{"metadata":{"resourceVersion":"2"},"spec":{"replicas":3}}`), + }, + { + EventID: "1", + Timestamp: time.Now(), + EventType: "CREATE", + Data: []byte(`{"metadata":{"resourceVersion":"1"},"spec":{"replicas":1}}`), + }, + }, + filterNoisy: true, + checkFirst: func(t *testing.T, event ChangeEventInfo) { + // First event (newest) should have diff with noisy paths filtered + for _, d := range event.Diff { + if d.Path == 
"metadata.resourceVersion" { + t.Error("expected metadata.resourceVersion to be filtered out") + } + } + }, + }, + { + name: "handles empty events", + events: []ChangeEventInfo{}, + filterNoisy: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := ConvertEventsToDiffFormat(tt.events, tt.filterNoisy) + + if len(result) != len(tt.events) { + t.Errorf("expected %d events, got %d", len(tt.events), len(result)) + return + } + + if len(result) > 0 && tt.checkFirst != nil { + tt.checkFirst(t, result[0]) + } + if len(result) > 1 && tt.checkSecond != nil { + tt.checkSecond(t, result[1]) + } + }) + } +} + +func TestConvertSingleEventToDiff(t *testing.T) { + tests := []struct { + name string + event *ChangeEventInfo + prevData []byte + filterNoisy bool + check func(t *testing.T, event *ChangeEventInfo) + }{ + { + name: "no previous data creates fullSnapshot", + event: &ChangeEventInfo{ + EventID: "1", + Data: []byte(`{"spec":{"replicas":1}}`), + }, + prevData: nil, + filterNoisy: false, + check: func(t *testing.T, event *ChangeEventInfo) { + if event.FullSnapshot == nil { + t.Error("expected FullSnapshot to be set") + } + if event.Diff != nil { + t.Error("expected Diff to be nil") + } + if event.Data != nil { + t.Error("expected Data to be cleared") + } + }, + }, + { + name: "with previous data creates diff", + event: &ChangeEventInfo{ + EventID: "2", + Data: []byte(`{"spec":{"replicas":3}}`), + }, + prevData: []byte(`{"spec":{"replicas":1}}`), + filterNoisy: false, + check: func(t *testing.T, event *ChangeEventInfo) { + if event.Diff == nil { + t.Error("expected Diff to be set") + } + if event.FullSnapshot != nil { + t.Error("expected FullSnapshot to be nil") + } + if event.Data != nil { + t.Error("expected Data to be cleared") + } + }, + }, + { + name: "nil event does not panic", + event: nil, + prevData: nil, + filterNoisy: false, + check: func(t *testing.T, event *ChangeEventInfo) {}, + }, + } + + for _, tt := range tests { + 
t.Run(tt.name, func(t *testing.T) { + ConvertSingleEventToDiff(tt.event, tt.prevData, tt.filterNoisy) + if tt.event != nil { + tt.check(t, tt.event) + } + }) + } +} + +func TestJoinPath(t *testing.T) { + tests := []struct { + prefix string + key string + expected string + }{ + {"", "key", "key"}, + {"prefix", "key", "prefix.key"}, + {"a.b", "c", "a.b.c"}, + {"spec", "replicas", "spec.replicas"}, + } + + for _, tt := range tests { + t.Run(tt.prefix+"_"+tt.key, func(t *testing.T) { + result := joinPath(tt.prefix, tt.key) + if result != tt.expected { + t.Errorf("joinPath(%q, %q) = %q, want %q", tt.prefix, tt.key, result, tt.expected) + } + }) + } +} + +func TestSimplifyValue(t *testing.T) { + tests := []struct { + name string + input any + isLarge bool // if true, expect summarized output + }{ + { + name: "nil value", + input: nil, + isLarge: false, + }, + { + name: "string value", + input: "test", + isLarge: false, + }, + { + name: "number value", + input: float64(42), + isLarge: false, + }, + { + name: "small map", + input: map[string]any{"a": 1, "b": 2}, + isLarge: false, + }, + { + name: "large map (>10 keys)", + input: map[string]any{ + "a": 1, "b": 2, "c": 3, "d": 4, "e": 5, + "f": 6, "g": 7, "h": 8, "i": 9, "j": 10, "k": 11, + }, + isLarge: true, + }, + { + name: "small array", + input: []any{1, 2, 3}, + isLarge: false, + }, + { + name: "large array (>10 elements)", + input: []any{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, + isLarge: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := simplifyValue(tt.input) + + if tt.isLarge { + resultMap, ok := result.(map[string]any) + if !ok { + t.Errorf("expected large value to be summarized as map, got %T", result) + return + } + if _, hasType := resultMap["_type"]; !hasType { + t.Error("expected summarized value to have _type field") + } + } + }) + } +} diff --git a/internal/analysis/types.go b/internal/analysis/types.go index 7dbefba..7e7b399 100644 --- a/internal/analysis/types.go 
+++ b/internal/analysis/types.go @@ -72,6 +72,21 @@ type CausalGraph struct { Edges []GraphEdge `json:"edges"` } +// EventSignificance provides importance scoring for individual events +// to guide LLM attention toward high-impact changes +type EventSignificance struct { + Score float64 `json:"score"` // 0.0 to 1.0 + Reasons []string `json:"reasons"` // Human-readable explanation of significance +} + +// EventDiff represents a single change between consecutive events +type EventDiff struct { + Path string `json:"path"` // JSON path, e.g., "spec.replicas" + OldValue any `json:"old,omitempty"` // Previous value (nil for additions) + NewValue any `json:"new,omitempty"` // New value (nil for removals) + Op string `json:"op"` // "add", "remove", "replace" +} + // ChangeEventInfo represents a change event in the causal chain type ChangeEventInfo struct { EventID string `json:"eventId"` @@ -81,7 +96,16 @@ type ChangeEventInfo struct { ConfigChanged bool `json:"configChanged,omitempty"` StatusChanged bool `json:"statusChanged,omitempty"` Description string `json:"description,omitempty"` // Human-readable summary - Data []byte `json:"data,omitempty"` // Full resource JSON for diff + + // Significance scoring for LLM prioritization + Significance *EventSignificance `json:"significance,omitempty"` + + // Diff-based format (new) - mutually exclusive with Data + Diff []EventDiff `json:"diff,omitempty"` // Changes from previous event + FullSnapshot map[string]any `json:"fullSnapshot,omitempty"` // Only for first event per resource + + // Legacy format - full resource JSON (deprecated, for backward compat) + Data []byte `json:"data,omitempty"` } // K8sEventInfo represents a Kubernetes Event (kind: Event) related to a resource @@ -93,6 +117,9 @@ type K8sEventInfo struct { Type string `json:"type"` // "Warning", "Normal", "Error" Count int `json:"count"` // How many times this event occurred Source string `json:"source"` // Component that generated the event + + // Significance 
scoring for LLM prioritization + Significance *EventSignificance `json:"significance,omitempty"` } // RootCauseHypothesis identifies the most likely root cause diff --git a/internal/api/handlers/root_cause_handler.go b/internal/api/handlers/root_cause_handler.go index 892291e..3062795 100644 --- a/internal/api/handlers/root_cause_handler.go +++ b/internal/api/handlers/root_cause_handler.go @@ -59,6 +59,7 @@ func (h *RootCauseHandler) Handle(w http.ResponseWriter, r *http.Request) { attribute.String("resource_uid", input.ResourceUID), attribute.Int64("failure_timestamp", input.FailureTimestamp), attribute.Int("max_depth", input.MaxDepth), + attribute.String("format", string(input.Format)), ) } @@ -169,12 +170,26 @@ func (h *RootCauseHandler) parseInput(r *http.Request) (analysis.AnalyzeInput, e lookbackNs = lookbackMs * 1_000_000 } + // Optional: format (default "diff" for new format with significance) + format := analysis.FormatDiff + if formatStr := query.Get("format"); formatStr != "" { + switch formatStr { + case "legacy": + format = analysis.FormatLegacy + case "diff": + format = analysis.FormatDiff + default: + return analysis.AnalyzeInput{}, api.NewValidationError("invalid format: must be 'legacy' or 'diff'") + } + } + return analysis.AnalyzeInput{ ResourceUID: resourceUID, FailureTimestamp: failureTimestamp, LookbackNs: lookbackNs, MaxDepth: maxDepth, MinConfidence: minConfidence, + Format: format, }, nil } diff --git a/ui/src/components/RootCauseView.tsx b/ui/src/components/RootCauseView.tsx index 4956173..c57f1a8 100644 --- a/ui/src/components/RootCauseView.tsx +++ b/ui/src/components/RootCauseView.tsx @@ -1,5 +1,5 @@ import React, { useEffect, useRef, useState, useMemo } from 'react'; -import { RootCauseAnalysisV2, GraphNode, ChangeEventInfo, K8sEventInfo } from '../types/rootCause'; +import { RootCauseAnalysisV2, GraphNode, ChangeEventInfo, K8sEventInfo, EventSignificance } from '../types/rootCause'; import * as d3 from 'd3'; import { useSettings } from 
'../hooks/useSettings'; import { diffJsonWithContext, DiffLine, detectChangeCategories, ChangeCategory } from '../utils/jsonDiff'; @@ -57,6 +57,41 @@ const DiffLineView = ({ line }: { line: DiffLine }) => { ); }; +// Significance badge component for displaying event importance +const SignificanceBadge = ({ significance }: { significance?: EventSignificance }) => { + if (!significance) return null; + + const score = significance.score; + const percentage = Math.round(score * 100); + + // Determine color based on significance level + const getColorClasses = () => { + if (score >= 0.7) return 'bg-red-500/20 text-red-400 border-red-500/30'; + if (score >= 0.4) return 'bg-amber-500/20 text-amber-400 border-amber-500/30'; + return 'bg-gray-500/20 text-gray-400 border-gray-500/30'; + }; + + const getLabel = () => { + if (score >= 0.7) return 'High'; + if (score >= 0.4) return 'Med'; + return 'Low'; + }; + + // Show only the first reason as a tooltip hint + const tooltip = significance.reasons.length > 0 + ? significance.reasons.join(', ') + : `Significance: ${percentage}%`; + + return ( +
+ {getLabel()} {percentage}% +
+ ); +}; + export const RootCauseView: React.FC = ({ analysis, initialResourceUID, @@ -403,6 +438,127 @@ export const RootCauseView: React.FC = ({ }; }, [isResizing]); + // Helper to convert pre-computed EventDiff[] to DiffLine[] + const convertEventDiffToDiffLines = (diffs: Array<{ path: string; old?: unknown; new?: unknown; op: string }>): DiffLine[] => { + const lines: DiffLine[] = []; + + for (const d of diffs) { + if (d.op === 'remove' || d.op === 'replace') { + lines.push({ + type: 'remove', + content: `${d.path}: ${JSON.stringify(d.old)}`, + }); + } + if (d.op === 'add' || d.op === 'replace') { + lines.push({ + type: 'add', + content: `${d.path}: ${JSON.stringify(d.new)}`, + }); + } + } + + return lines; + }; + + // Helper to convert object to DiffLine[] as context (neutral color) + const convertObjectToContextLines = (obj: Record): DiffLine[] => { + const lines: DiffLine[] = []; + const jsonStr = JSON.stringify(obj, null, 2); + const jsonLines = jsonStr.split('\n'); + + for (const line of jsonLines) { + lines.push({ + type: 'context', + content: line, + }); + } + + return lines; + }; + + // Helper to convert fullSnapshot to DiffLine[] (show all fields as additions) + const convertSnapshotToDiffLines = (snapshot: Record): DiffLine[] => { + const lines: DiffLine[] = []; + const jsonStr = JSON.stringify(snapshot, null, 2); + const jsonLines = jsonStr.split('\n'); + + for (const line of jsonLines) { + lines.push({ + type: 'add', + content: line, + }); + } + + return lines; + }; + + // Helper to apply diffs to reconstruct full resource state + const applyDiffsToSnapshot = ( + snapshot: Record, + diffs: Array<{ path: string; old?: unknown; new?: unknown; op: string }> + ): Record => { + // Deep clone the snapshot + const result = JSON.parse(JSON.stringify(snapshot)); + + for (const diff of diffs) { + const pathParts = diff.path.split('.'); + let current: Record = result; + + // Navigate to parent of the target + for (let i = 0; i < pathParts.length - 1; i++) { + 
const part = pathParts[i]; + if (current[part] === undefined) { + current[part] = {}; + } + current = current[part] as Record; + } + + const lastPart = pathParts[pathParts.length - 1]; + + switch (diff.op) { + case 'add': + case 'replace': + current[lastPart] = diff.new; + break; + case 'remove': + delete current[lastPart]; + break; + } + } + + return result; + }; + + // Helper to reconstruct full resource at a given event index + const reconstructFullResource = (eventIndex: number): Record | null => { + if (!filteredTimeline) return null; + + // Find the first change event with fullSnapshot + let snapshot: Record | null = null; + let snapshotIndex = -1; + + for (let i = 0; i <= eventIndex; i++) { + const item = filteredTimeline[i]; + if (item.type === 'change' && item.changeEvent?.fullSnapshot) { + snapshot = JSON.parse(JSON.stringify(item.changeEvent.fullSnapshot)); + snapshotIndex = i; + break; + } + } + + if (!snapshot) return null; + + // Apply all diffs from after the snapshot up to and including the current event + for (let i = snapshotIndex + 1; i <= eventIndex; i++) { + const item = filteredTimeline[i]; + if (item.type === 'change' && item.changeEvent?.diff) { + snapshot = applyDiffsToSnapshot(snapshot, item.changeEvent.diff); + } + } + + return snapshot; + }; + // Calculate JSON diff for selected event const eventDiff = useMemo(() => { if (!filteredTimeline || filteredTimeline.length === 0) { @@ -412,7 +568,39 @@ export const RootCauseView: React.FC = ({ const selectedItem = filteredTimeline[selectedEventIndex]; // Only calculate diff for change events, not K8s events - if (!selectedItem || selectedItem.type !== 'change' || !selectedItem.changeEvent?.data) { + if (!selectedItem || selectedItem.type !== 'change' || !selectedItem.changeEvent) { + return null; + } + + const changeEvent = selectedItem.changeEvent; + + // New format handling + const hasNewFormat = changeEvent.diff !== undefined || changeEvent.fullSnapshot !== undefined; + + if (hasNewFormat) { 
+ // When "Show Full Resource" is enabled, reconstruct and show the full resource + if (showFullDiff) { + const fullResource = reconstructFullResource(selectedEventIndex); + if (fullResource) { + return convertObjectToContextLines(fullResource); + } + } + + // Show pre-computed diff + if (changeEvent.diff && changeEvent.diff.length > 0) { + return convertEventDiffToDiffLines(changeEvent.diff); + } + + // First event has fullSnapshot - show as additions + if (changeEvent.fullSnapshot) { + return convertSnapshotToDiffLines(changeEvent.fullSnapshot); + } + + return null; + } + + // Legacy format: compute diff from base64-encoded data field + if (!changeEvent.data) { return null; } @@ -426,11 +614,8 @@ export const RootCauseView: React.FC = ({ } // Parse JSON data (handle base64 encoding) - let currentData = null; - let previousData = null; - - currentData = parseEventData(selectedItem.changeEvent.data); - previousData = parseEventData(previousChangeEvent?.data); + const currentData = parseEventData(changeEvent.data); + const previousData = parseEventData(previousChangeEvent?.data); // Calculate diff if (!currentData) { @@ -1277,10 +1462,10 @@ export const RootCauseView: React.FC = ({ - {/* Split panel: diff (left 70%) and event list (right 30%) */} + {/* Split panel: diff (left 65%) and event list (right 35%) */}
{/* Left: Diff view */} -
+
{/* Toolbar */}
@@ -1290,7 +1475,7 @@ export const RootCauseView: React.FC = ({ onClick={() => setShowFullDiff(!showFullDiff)} className="text-[10px] px-2 py-1 rounded bg-[var(--color-surface-active)] hover:bg-[var(--color-surface-hover)] text-[var(--color-text-primary)] transition-colors" > - {showFullDiff ? 'Show Changes Only' : 'Show Full Diff'} + {showFullDiff ? 'Show Changes Only' : 'Show Full Resource'}
@@ -1307,7 +1492,18 @@ export const RootCauseView: React.FC = ({
) : filteredTimeline[selectedEventIndex].type === 'change' ? (
- {selectedEventIndex === 0 ? 'First event - no previous state to compare' : 'No data available for diff'} + {(() => { + const evt = filteredTimeline[selectedEventIndex].changeEvent; + // Check if it's an empty diff (no changes) + if (evt?.diff && evt.diff.length === 0) { + return 'No changes detected in this event'; + } + // First event without snapshot + if (selectedEventIndex === 0) { + return 'First event - no previous state to compare'; + } + return 'No data available for diff'; + })()}
) : null} @@ -1348,8 +1544,8 @@ export const RootCauseView: React.FC = ({
- {/* Right: Event timeline list */} -
+ {/* Right: Event timeline list (fixed 35% width) */} +
Event Timeline @@ -1402,12 +1598,15 @@ export const RootCauseView: React.FC = ({ {item.type === 'change' && item.changeEvent ? ( <>
-
- {item.changeEvent.eventType} +
+
+ {item.changeEvent.eventType} +
+
{formatTime(item.timestamp)} @@ -1450,6 +1649,7 @@ export const RootCauseView: React.FC = ({ }`}> {item.k8sEvent.type}
+
{formatTime(item.timestamp)} diff --git a/ui/src/services/rootCauseService.ts b/ui/src/services/rootCauseService.ts index 4a346c4..ab9bf7e 100644 --- a/ui/src/services/rootCauseService.ts +++ b/ui/src/services/rootCauseService.ts @@ -15,21 +15,25 @@ interface CacheEntry { const cache = new Map(); -function getCacheKey(resourceUID: string, timestamp: Date, lookbackMs?: number): string { +function getCacheKey(resourceUID: string, timestamp: Date, lookbackMs?: number, format?: ResponseFormat): string { // Round to nearest second to improve cache hit rate const timestampSec = Math.floor(timestamp.getTime() / 1000); const lookbackKey = lookbackMs ? `:${lookbackMs}` : ''; - return `${resourceUID}:${timestampSec}${lookbackKey}`; + const formatKey = format ? `:${format}` : ':diff'; + return `${resourceUID}:${timestampSec}${lookbackKey}${formatKey}`; } function isCacheValid(entry: CacheEntry): boolean { return Date.now() - entry.timestamp < CACHE_TTL_MS; } +export type ResponseFormat = 'legacy' | 'diff'; + export interface RootCauseOptions { maxDepth?: number; minConfidence?: number; lookbackMs?: number; // Lookback in milliseconds + format?: ResponseFormat; // Response format: 'legacy' or 'diff' (default: 'diff') } /** @@ -41,7 +45,7 @@ export async function fetchRootCauseAnalysis( options?: RootCauseOptions ): Promise { // Check cache first - const cacheKey = getCacheKey(resourceUID, failureTimestamp, options?.lookbackMs); + const cacheKey = getCacheKey(resourceUID, failureTimestamp, options?.lookbackMs, options?.format); const cached = cache.get(cacheKey); if (cached && isCacheValid(cached)) { @@ -67,6 +71,9 @@ export async function fetchRootCauseAnalysis( params.set('lookbackMs', options.lookbackMs.toString()); } + // Default to 'diff' format for new format with significance scoring + params.set('format', options?.format || 'diff'); + // Fetch from API with retry logic for network failures const url = `/v1/root-cause?${params.toString()}`; console.log('[RootCause] 
Fetching:', url); diff --git a/ui/src/types/rootCause.ts b/ui/src/types/rootCause.ts index 5e5dccf..b3da69f 100644 --- a/ui/src/types/rootCause.ts +++ b/ui/src/types/rootCause.ts @@ -10,6 +10,24 @@ export interface SymptomResource { name: string; } +/** + * Event significance scoring for LLM prioritization + */ +export interface EventSignificance { + score: number; // 0.0 to 1.0 + reasons: string[]; // Human-readable explanation of significance +} + +/** + * Represents a single change in the diff-based format + */ +export interface EventDiff { + path: string; // JSON path, e.g., "spec.replicas" + old?: unknown; // Previous value (undefined for additions) + new?: unknown; // New value (undefined for removals) + op: 'add' | 'remove' | 'replace'; +} + export interface ChangeEventInfo { eventId: string; timestamp: string; // ISO timestamp @@ -18,7 +36,16 @@ export interface ChangeEventInfo { configChanged?: boolean; statusChanged?: boolean; description?: string; - data?: string; // Full resource JSON for diff calculation + + // Significance scoring for LLM prioritization + significance?: EventSignificance; + + // Diff-based format (new) - mutually exclusive with data + diff?: EventDiff[]; // Changes from previous event + fullSnapshot?: Record; // Only for first event per resource + + // Legacy format - full resource JSON (deprecated) + data?: string; } export interface K8sEventInfo { @@ -29,6 +56,9 @@ export interface K8sEventInfo { type: string; // "Warning", "Normal", "Error" count: number; // How many times this event occurred source: string; // Component that generated the event + + // Significance scoring for LLM prioritization + significance?: EventSignificance; } export interface GraphNode { From 46b679754dae9d7eef2a9fe197c49a6d681bbe5a Mon Sep 17 00:00:00 2001 From: Moritz Johner Date: Tue, 6 Jan 2026 09:02:08 +0100 Subject: [PATCH 3/3] fix: make metadata cache configurable via cli flag Signed-off-by: Moritz Johner --- chart/templates/deployment.yaml | 3 +++ 
chart/values.yaml | 7 +++++++ cmd/spectre/commands/server.go | 7 +++++++ internal/apiserver/server.go | 9 +++++---- tests/e2e/default_resources_stage_test.go | 6 ++++++ tests/e2e/fixtures/helm-values-test.yaml | 5 +++++ tests/e2e/ui/ui_stage_test.go | 8 ++++++-- 7 files changed, 39 insertions(+), 6 deletions(-) diff --git a/chart/templates/deployment.yaml b/chart/templates/deployment.yaml index ec3c961..8f1a25d 100644 --- a/chart/templates/deployment.yaml +++ b/chart/templates/deployment.yaml @@ -116,6 +116,9 @@ spec: - --graph-rebuild-if-empty={{ .Values.graph.sync.rebuildIfEmptyOnly }} - --graph-rebuild-window-hours={{ .Values.graph.sync.rebuildWindowHours }} {{- end }} + {{- if .Values.metadataCache }} + - --metadata-cache-refresh-seconds={{ .Values.metadataCache.refreshSeconds }} + {{- end }} {{- range .Values.extraArgs }} - {{ . }} {{- end }} diff --git a/chart/values.yaml b/chart/values.yaml index 6e13137..db4223c 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -193,6 +193,13 @@ graph: # Batch timeout (in seconds) batchTimeoutSeconds: 5 +# Metadata cache configuration +metadataCache: + # How often to refresh the metadata cache (in seconds) + # Lower values mean more up-to-date metadata but more database queries + # Higher values reduce database load but may show stale data + refreshSeconds: 30 + # Persistent storage configuration (deprecated - storage package removed) persistence: enabled: false diff --git a/cmd/spectre/commands/server.go b/cmd/spectre/commands/server.go index 6e93e4e..ae8a255 100644 --- a/cmd/spectre/commands/server.go +++ b/cmd/spectre/commands/server.go @@ -51,6 +51,8 @@ var ( graphRebuildWindowHours int // Audit log flag auditLogPath string + // Metadata cache configuration + metadataCacheRefreshSeconds int ) var serverCmd = &cobra.Command{ @@ -91,6 +93,10 @@ func init() { serverCmd.Flags().StringVar(&auditLogPath, "audit-log", "", "Path to write event audit log (JSONL format) for test fixtures. 
"+ "If empty, audit logging is disabled.") + + // Metadata cache configuration + serverCmd.Flags().IntVar(&metadataCacheRefreshSeconds, "metadata-cache-refresh-seconds", 30, + "Metadata cache refresh period in seconds (default: 30)") } func runServer(cmd *cobra.Command, args []string) { @@ -297,6 +303,7 @@ func runServer(cmd *cobra.Command, args []string) { readinessChecker, false, // No demo mode tracingProvider, + time.Duration(metadataCacheRefreshSeconds)*time.Second, ) logger.Info("API server component created (graph-only)") diff --git a/internal/apiserver/server.go b/internal/apiserver/server.go index 106e706..0894131 100644 --- a/internal/apiserver/server.go +++ b/internal/apiserver/server.go @@ -61,7 +61,7 @@ func NewWithStorageAndGraph( IsEnabled() bool }, ) *Server { - return NewWithStorageGraphAndPipeline(port, storageExecutor, graphExecutor, querySource, storage, graphClient, nil, readinessChecker, demoMode, tracingProvider) + return NewWithStorageGraphAndPipeline(port, storageExecutor, graphExecutor, querySource, storage, graphClient, nil, readinessChecker, demoMode, tracingProvider, 30*time.Second) } // NewWithStorageGraphAndPipeline creates a new API server with graph query executor and pipeline support @@ -79,6 +79,7 @@ func NewWithStorageGraphAndPipeline( GetTracer(string) trace.Tracer IsEnabled() bool }, + metadataRefreshPeriod time.Duration, // How often to refresh the metadata cache ) *Server { s := &Server{ port: port, @@ -103,9 +104,9 @@ func NewWithStorageGraphAndPipeline( } if metadataExecutor != nil { - // Create cache with 30-second refresh period - s.metadataCache = api.NewMetadataCache(metadataExecutor, s.logger, 30*time.Second) - s.logger.Info("Metadata cache created (will initialize on server start)") + // Create cache with configurable refresh period + s.metadataCache = api.NewMetadataCache(metadataExecutor, s.logger, metadataRefreshPeriod) + s.logger.Info("Metadata cache created with refresh period %v (will initialize on server 
start)", metadataRefreshPeriod) } // Register all routes and handlers diff --git a/tests/e2e/default_resources_stage_test.go b/tests/e2e/default_resources_stage_test.go index 893f39d..9aac557 100644 --- a/tests/e2e/default_resources_stage_test.go +++ b/tests/e2e/default_resources_stage_test.go @@ -149,6 +149,12 @@ func (s *DefaultResourcesStage) metadata_contains_expected_data() *DefaultResour ctx, cancel := s.ctxHelper.WithDefaultTimeout() defer cancel() + // Wait for metadata cache to refresh + // The cache refreshes every 2 seconds in test configuration + // Wait 3 seconds to ensure at least one refresh has occurred + s.T.Log("Waiting 3s for metadata cache to refresh...") + time.Sleep(3 * time.Second) + metadata, err := s.APIClient.GetMetadata(ctx, nil, nil) s.Require.NoError(err) diff --git a/tests/e2e/fixtures/helm-values-test.yaml b/tests/e2e/fixtures/helm-values-test.yaml index f5ead71..20e2623 100644 --- a/tests/e2e/fixtures/helm-values-test.yaml +++ b/tests/e2e/fixtures/helm-values-test.yaml @@ -172,3 +172,8 @@ graph: # Use empty storageClassName for immediate binding in test environments storageClassName: "" +# Metadata cache configuration for tests +# Use a very short refresh period to avoid race conditions in E2E tests +metadataCache: + refreshSeconds: 2 + diff --git a/tests/e2e/ui/ui_stage_test.go b/tests/e2e/ui/ui_stage_test.go index 7b03a50..5c5e8da 100644 --- a/tests/e2e/ui/ui_stage_test.go +++ b/tests/e2e/ui/ui_stage_test.go @@ -143,8 +143,12 @@ func (s *UIStage) resources_are_available() *UIStage { for _, dep := range s.deployments { helpers.EventuallyResourceCreated(s.t, s.apiClient, dep.Namespace, "Deployment", dep.Name, helpers.DefaultEventuallyOption) } - // Wait additional time for storage to fully index and UI to update - // This is needed for UI rendering, not backend indexing (different from removed backend sleeps) + // Wait additional time for: + // 1. Storage to fully index + // 2. 
Metadata cache to refresh (refreshes every 2s in test config) + // 3. UI to update + // Using 5 seconds to ensure at least 2 cache refresh cycles have occurred + s.t.Log("Waiting 5s for resources to be indexed and metadata cache to refresh...") time.Sleep(5 * time.Second) return s }