Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions chart/templates/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ spec:
- --graph-rebuild-if-empty={{ .Values.graph.sync.rebuildIfEmptyOnly }}
- --graph-rebuild-window-hours={{ .Values.graph.sync.rebuildWindowHours }}
{{- end }}
{{- if .Values.metadataCache }}
- --metadata-cache-refresh-seconds={{ .Values.metadataCache.refreshSeconds }}
{{- end }}
{{- range .Values.extraArgs }}
- {{ . }}
{{- end }}
Expand Down
7 changes: 7 additions & 0 deletions chart/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,13 @@ graph:
# Batch timeout (in seconds)
batchTimeoutSeconds: 5

# Metadata cache configuration
metadataCache:
# How often to refresh the metadata cache (in seconds)
# Lower values mean more up-to-date metadata but more database queries
# Higher values reduce database load but may show stale data
refreshSeconds: 30

# Persistent storage configuration (deprecated - storage package removed)
persistence:
enabled: false
Expand Down
7 changes: 7 additions & 0 deletions cmd/spectre/commands/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ var (
graphRebuildWindowHours int
// Audit log flag
auditLogPath string
// Metadata cache configuration
metadataCacheRefreshSeconds int
)

var serverCmd = &cobra.Command{
Expand Down Expand Up @@ -91,6 +93,10 @@ func init() {
serverCmd.Flags().StringVar(&auditLogPath, "audit-log", "",
"Path to write event audit log (JSONL format) for test fixtures. "+
"If empty, audit logging is disabled.")

// Metadata cache configuration
serverCmd.Flags().IntVar(&metadataCacheRefreshSeconds, "metadata-cache-refresh-seconds", 30,
"Metadata cache refresh period in seconds (default: 30)")
}

func runServer(cmd *cobra.Command, args []string) {
Expand Down Expand Up @@ -297,6 +303,7 @@ func runServer(cmd *cobra.Command, args []string) {
readinessChecker,
false, // No demo mode
tracingProvider,
time.Duration(metadataCacheRefreshSeconds)*time.Second,
)
logger.Info("API server component created (graph-only)")

Expand Down
55 changes: 55 additions & 0 deletions internal/analysis/analyzer.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,24 @@ func NewRootCauseAnalyzer(graphClient graph.Client) *RootCauseAnalyzer {
}
}

// ResponseFormat names the shape in which analysis results are returned by
// the API. Its underlying type is string, so values compare directly against
// string literals.
type ResponseFormat string

// Supported response formats.
const (
	// FormatDiff is the diff-based format with significance scoring.
	FormatDiff = ResponseFormat("diff")
	// FormatLegacy is the backward-compatible format carrying base64-encoded
	// full resource data.
	FormatLegacy = ResponseFormat("legacy")
)

// AnalyzeInput defines input parameters for root cause analysis.
// It is the value passed to RootCauseAnalyzer.Analyze.
type AnalyzeInput struct {
// NOTE(review): presumably identifies the resource whose failure is being
// analyzed — confirm against Analyze.
ResourceUID string
// Analyze interprets this via time.Unix(0, FailureTimestamp) when it applies
// the diff format.
FailureTimestamp int64 // Unix nanoseconds
LookbackNs int64 // Lookback window in nanoseconds (default: 10 minutes)
// NOTE(review): how MaxDepth and MinConfidence are consumed is not visible
// in this excerpt — confirm their semantics and defaults against Analyze.
MaxDepth int
MinConfidence float64
// An empty Format is treated as FormatDiff by Analyze.
Format ResponseFormat // Response format: "legacy" or "diff" (default: "diff")
}

// Analyze performs root cause analysis using the causality-first approach.
Expand Down Expand Up @@ -165,6 +176,29 @@ func (a *RootCauseAnalyzer) Analyze(ctx context.Context, input AnalyzeInput) (*R
a.logger.Info("Analysis completed in %v - degraded=%v, symptom_only=%v, confidence=%.2f",
totalDuration, quality.IsDegraded, quality.IsSymptomOnly, confidence.Score)

// Apply format-specific transformations
format := input.Format
if format == "" {
format = FormatDiff // Default to new format
}

if format == FormatDiff {
a.logger.Debug("Applying diff format transformations")
// Extract error patterns from symptom for correlation
errorPatterns := ExtractErrorPatterns(symptom.ErrorMessage)

// Apply significance scoring and diff conversion to all nodes
a.applyDiffFormat(&graph, time.Unix(0, input.FailureTimestamp), errorPatterns)

// Also process root cause event
if rootCause.ChangeEvent.Data != nil {
rootCause.ChangeEvent.Significance = CalculateChangeEventSignificance(
&rootCause.ChangeEvent, true, time.Unix(0, input.FailureTimestamp), errorPatterns,
)
ConvertSingleEventToDiff(&rootCause.ChangeEvent, nil, true)
}
}

return &RootCauseAnalysisV2{
Incident: IncidentAnalysis{
ObservedSymptom: *symptom,
Expand All @@ -183,3 +217,24 @@ func (a *RootCauseAnalyzer) Analyze(ctx context.Context, input AnalyzeInput) (*R
},
}, nil
}

// applyDiffFormat rewrites every node of the causal graph in place: it scores
// the node's events, then converts both the node's event list and its change
// event to diff format.
//
// failureTime and errorPatterns are forwarded unchanged to ScoreEvents.
func (a *RootCauseAnalyzer) applyDiffFormat(graph *CausalGraph, failureTime time.Time, errorPatterns []string) {
	for idx := range graph.Nodes {
		// Take the address of the slice element so updates land in the graph
		// itself rather than in a copy.
		n := &graph.Nodes[idx]

		// Scoring happens before any conversion; nodes typed "SPINE" are
		// flagged as on-spine for the scorer.
		ScoreEvents(n, n.NodeType == "SPINE", failureTime, errorPatterns)

		// Replace the event list with its diff-format equivalent, skipping
		// nodes that carry no events.
		if len(n.AllEvents) != 0 {
			n.AllEvents = ConvertEventsToDiffFormat(n.AllEvents, true)
		}

		// The node's own change event is converted only when it is present
		// and has data attached.
		if ev := n.ChangeEvent; ev != nil && ev.Data != nil {
			ConvertSingleEventToDiff(ev, nil, true)
		}
	}
}
Loading