From b67c761d658f50992470424b3f813d0832da8d0b Mon Sep 17 00:00:00 2001
From: Erik
Date: Sun, 1 Feb 2026 00:15:47 +0100
Subject: [PATCH 1/2] Add detailed debugging and error handling documentation,
 including a comprehensive `Debugging Guide`, `Error Scenarios`, and an
 `Operations Runbook`.

---
 docs/operations/debugging-guide.md     | 502 +++++++++++++++++++
 docs/operations/error-scenarios.md     | 459 ++++++++++++++++++
 docs/operations/index.md               | 128 +++++
 docs/operations/monitoring-alerting.md | 646 +++++++++++++++++++++++++
 docs/operations/recovery-procedures.md | 583 ++++++++++++++++++++++
 docs/troubleshooting/index.md          |   6 +
 6 files changed, 2324 insertions(+)
 create mode 100644 docs/operations/debugging-guide.md
 create mode 100644 docs/operations/error-scenarios.md
 create mode 100644 docs/operations/index.md
 create mode 100644 docs/operations/monitoring-alerting.md
 create mode 100644 docs/operations/recovery-procedures.md

diff --git a/docs/operations/debugging-guide.md b/docs/operations/debugging-guide.md
new file mode 100644
index 0000000..ae8a0f6
--- /dev/null
+++ b/docs/operations/debugging-guide.md
@@ -0,0 +1,502 @@
# Debugging Guide

Systematic approach to diagnosing migration issues in Aether Datafixers.

## Quick Reference

| Need               | Tool                | Configuration              |
|--------------------|---------------------|----------------------------|
| Basic logs         | SLF4J               | Set level to DEBUG         |
| Detailed trace     | `DiagnosticContext` | Enable with options        |
| Per-fix snapshots  | `DiagnosticOptions` | `captureSnapshots(true)`   |
| Rule-level detail  | `DiagnosticOptions` | `captureRuleDetails(true)` |
| Production minimal | `DiagnosticOptions` | `minimal()` preset         |

---

## SLF4J Configuration

### Default Logger Name

The default logger name for Aether Datafixers is:

```
de.splatgames.aether.datafixers
```

### Using Slf4jDataFixerContext

Route datafixer logs through your application's logging framework:

```java
import de.splatgames.aether.datafixers.core.fix.Slf4jDataFixerContext;

// Option 1: Default logger name
DataFixerContext context = new Slf4jDataFixerContext();

// Option 2: Custom logger name
DataFixerContext context = new Slf4jDataFixerContext("com.myapp.migrations");

// Option 3: Existing logger
Logger logger = LoggerFactory.getLogger(MyMigrationService.class);
DataFixerContext context = new Slf4jDataFixerContext(logger);

// Use in migration
fixer.update(typeRef, data, fromVersion, toVersion, context);
```

### Logback Configuration

```xml
<configuration>

    <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
        <encoder>
            <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
        </encoder>
    </appender>

    <appender name="MIGRATIONS" class="ch.qos.logback.core.rolling.RollingFileAppender">
        <file>logs/migrations.log</file>
        <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
            <fileNamePattern>logs/migrations.%d{yyyy-MM-dd}.log</fileNamePattern>
            <maxHistory>30</maxHistory>
        </rollingPolicy>
        <encoder>
            <pattern>%d{ISO8601} [%thread] %-5level %logger{36} - %msg%n%ex{full}</pattern>
        </encoder>
    </appender>

    <logger name="de.splatgames.aether.datafixers" level="DEBUG" additivity="false">
        <appender-ref ref="CONSOLE"/>
        <appender-ref ref="MIGRATIONS"/>
    </logger>

    <root level="INFO">
        <appender-ref ref="CONSOLE"/>
    </root>

</configuration>
```

### Log4j2 Configuration

```xml
<?xml version="1.0" encoding="UTF-8"?>
<Configuration status="WARN">

    <Appenders>
        <Console name="Console" target="SYSTEM_OUT">
            <PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
        </Console>
        <RollingFile name="Migrations"
                     fileName="logs/migrations.log"
                     filePattern="logs/migrations.%d{yyyy-MM-dd}.log">
            <PatternLayout pattern="%d{ISO8601} [%t] %-5level %logger{36} - %msg%n%ex{full}"/>
            <Policies>
                <TimeBasedTriggeringPolicy/>
            </Policies>
            <DefaultRolloverStrategy max="30"/>
        </RollingFile>
    </Appenders>

    <Loggers>
        <Logger name="de.splatgames.aether.datafixers" level="DEBUG" additivity="false">
            <AppenderRef ref="Console"/>
            <AppenderRef ref="Migrations"/>
        </Logger>
        <Root level="INFO">
            <AppenderRef ref="Console"/>
        </Root>
    </Loggers>

</Configuration>
```

### Production vs Development Settings

| Environment | Logger Level | Snapshots | Rule Details |
|-------------|--------------|-----------|--------------|
| Development | DEBUG        | Yes       | Yes          |
| Staging     | INFO         | Yes       | No           |
| Production  | WARN         | No        | No           |

---

## MigrationReport Diagnostics

### Enabling Diagnostics

```java
import de.splatgames.aether.datafixers.api.diagnostic.DiagnosticContext;
import de.splatgames.aether.datafixers.api.diagnostic.DiagnosticOptions;
import de.splatgames.aether.datafixers.api.diagnostic.MigrationReport;

// Full diagnostics for debugging
DiagnosticContext context = DiagnosticContext.create(
    DiagnosticOptions.builder()
        .captureSnapshots(true)
        .captureRuleDetails(true)
        .prettyPrintSnapshots(true)
        .build()
);

// Run migration
Dynamic<?> result = fixer.update(typeRef, data, fromVersion, toVersion, context);

// Get the report
MigrationReport report = context.getReport();
```

### Report Fields Reference

| Field           | Method                   | Description             | When to Use           |
|-----------------|--------------------------|-------------------------|-----------------------|
| Type            | `type()`                 | TypeReference migrated  | Always                |
| From Version    | `fromVersion()`          | Source version          | Always                |
| To Version      | `toVersion()`            | Target version          | Always                |
| Duration        | `totalDuration()`        | Total migration time    | Performance issues    |
| Fix Count       | `fixCount()`             | Number of fixes applied | Verify migration path |
| Fix Executions  | `fixExecutions()`        | Detailed fix list       | Tracing issues        |
| Rule Count      | `ruleApplicationCount()` | Total rules evaluated   | Deep debugging        |
| Touched Types   | `touchedTypes()`         | All types processed     | Complex migrations    |
| Warnings        | `warnings()`             | Non-fatal issues        | Data quality          |
| Input Snapshot  | `inputSnapshot()`        | Data before migration   | Transform debugging   |
| Output Snapshot | `outputSnapshot()`       | Data after migration    | Transform debugging   |

### Reading the Report

```java
MigrationReport report = context.getReport();

// Basic summary
System.out.println(report.toSummary());
// Output: "Migration of 'player' from v100 to v200: 150ms, 5 fixes"

// Detailed analysis
System.out.println("Type: " + report.type().getId());
System.out.println("Version: " + report.fromVersion().getVersion()
    + " -> " + report.toVersion().getVersion());
System.out.println("Duration: " + report.totalDuration().toMillis() + "ms");
System.out.println("Fixes Applied: " + report.fixCount());
System.out.println("Rules Evaluated: " + report.ruleApplicationCount());

// Check for warnings
if (report.hasWarnings()) {
    System.out.println("Warnings:");
    for (String warning : report.warnings()) {
        System.out.println("  - " + warning);
    }
}

// Snapshots (if enabled)
report.inputSnapshot().ifPresent(snap ->
    System.out.println("Input:\n" + snap));
report.outputSnapshot().ifPresent(snap ->
    System.out.println("Output:\n" + snap));
```

---

## Tracing Fix Order

### Understanding Fix Application

Fixes are applied in version order, from `fromVersion` to `toVersion`. Each fix transforms data from one version to the next.
+ +``` +v100 ──[Fix A]──> v110 ──[Fix B]──> v150 ──[Fix C]──> v200 +``` + +### Listing Applied Fixes + +```java +MigrationReport report = context.getReport(); + +System.out.println("Applied fixes in order:"); +for (FixExecution fix : report.fixExecutions()) { + System.out.println(fix.toSummary()); + // Output: "rename_field (v100 -> v110): 5ms, 3 rules (2 matched)" +} +``` + +### Detailed Fix Analysis + +```java +for (FixExecution fix : report.fixExecutions()) { + System.out.println("\nFix: " + fix.fixName()); + System.out.println(" Version: " + fix.fromVersion().getVersion() + + " -> " + fix.toVersion().getVersion()); + System.out.println(" Duration: " + fix.durationMillis() + "ms"); + System.out.println(" Rules: " + fix.ruleCount() + + " (" + fix.matchedRuleCount() + " matched)"); + + // Per-fix snapshots + fix.beforeSnapshotOpt().ifPresent(snap -> + System.out.println(" Before: " + snap)); + fix.afterSnapshotOpt().ifPresent(snap -> + System.out.println(" After: " + snap)); + + // Rule-level details (if captureRuleDetails enabled) + for (RuleApplication rule : fix.ruleApplications()) { + System.out.println(" Rule: " + rule.ruleName() + + " on " + rule.typeName() + + " -> " + (rule.matched() ? "MATCHED" : "skipped") + + " (" + rule.durationMillis() + "ms)"); + } +} +``` + +### Finding a Specific Fix + +```java +// Find by name +Optional fix = report.fixExecutions().stream() + .filter(f -> f.fixName().equals("rename_player_field")) + .findFirst(); + +// Find by version +Optional fixAtVersion = report.fixExecutions().stream() + .filter(f -> f.fromVersion().getVersion() == 150) + .findFirst(); +``` + +--- + +## DiagnosticOptions + +### Available Presets + +| Preset | Snapshots | Rule Details | Pretty Print | Use Case | +|--------------|-----------|--------------|--------------|-----------------------| +| `defaults()` | Yes | Yes | Yes | Development debugging | +| `minimal()` | No | No | No | Production monitoring | + +```java +// Full diagnostics (development) +DiagnosticContext devContext = DiagnosticContext.create(DiagnosticOptions.defaults()); + +// Minimal overhead (production) +DiagnosticContext prodContext = DiagnosticContext.create(DiagnosticOptions.minimal()); + +// No diagnostics (maximum performance) +fixer.update(typeRef, data, fromVersion, toVersion); // No context +``` + +### Custom Configuration + +```java +DiagnosticOptions options = DiagnosticOptions.builder() + .captureSnapshots(true) // Enable before/after snapshots + .captureRuleDetails(true) // Enable per-rule tracking + .maxSnapshotLength(10000) // Truncate large snapshots (0 = unlimited) + .prettyPrintSnapshots(true) // Format JSON for readability + .build(); +``` + +### Snapshot Truncation + +Large data structures are truncated to prevent memory issues: + +```java +DiagnosticOptions options = DiagnosticOptions.builder() + .captureSnapshots(true) + .maxSnapshotLength(500) // Truncate to 500 characters + .build(); + +// Truncated snapshots end with "... (truncated)" +``` + +--- + +## Step-by-Step Debugging Workflow + +### 1. Reproduce the Issue + +```java +// Isolate a single problematic record +Dynamic problematicData = loadProblemRecord(); +DataVersion fromVersion = new DataVersion(100); +DataVersion toVersion = new DataVersion(200); +``` + +### 2. Enable Full Diagnostics + +```java +DiagnosticContext context = DiagnosticContext.create( + DiagnosticOptions.builder() + .captureSnapshots(true) + .captureRuleDetails(true) + .prettyPrintSnapshots(true) + .build() +); +``` + +### 3. 
Run Migration with Diagnostics

```java
try {
    Dynamic<?> result = fixer.update(typeRef, problematicData, fromVersion, toVersion, context);
    System.out.println("Migration succeeded");
} catch (DataFixerException e) {
    System.err.println("Migration failed: " + e.getMessage());
} finally {
    // Always get the report (even on failure)
    MigrationReport report = context.getReport();
    analyzeReport(report);
}
```

### 4. Analyze the Report

```java
private void analyzeReport(MigrationReport report) {
    System.out.println("\n=== Migration Report ===");
    System.out.println(report.toSummary());

    // Check warnings
    if (report.hasWarnings()) {
        System.out.println("\nWarnings:");
        report.warnings().forEach(w -> System.out.println("  - " + w));
    }

    // Find slow fixes
    System.out.println("\nFix timing:");
    report.fixExecutions().stream()
        .sorted((a, b) -> Long.compare(b.durationMillis(), a.durationMillis()))
        .forEach(fix -> System.out.println("  " + fix.fixName() + ": " + fix.durationMillis() + "ms"));

    // Check for unmatched rules
    long unmatchedRules = report.fixExecutions().stream()
        .flatMap(fix -> fix.ruleApplications().stream())
        .filter(rule -> !rule.matched())
        .count();
    System.out.println("\nUnmatched rules: " + unmatchedRules);
}
```

### 5. Examine Snapshots

```java
// Compare before/after for the failing fix
for (FixExecution fix : report.fixExecutions()) {
    System.out.println("\n--- " + fix.fixName() + " ---");

    fix.beforeSnapshotOpt().ifPresent(before -> {
        System.out.println("BEFORE:");
        System.out.println(before);
    });

    fix.afterSnapshotOpt().ifPresent(after -> {
        System.out.println("AFTER:");
        System.out.println(after);
    });
}
```

---

## Spring Boot Integration

### Diagnostics via MigrationService

```java
@Autowired
private MigrationService migrationService;

public void migrateWithDiagnostics(TaggedDynamic data) {
    DiagnosticContext context = DiagnosticContext.create(DiagnosticOptions.defaults());

    MigrationResult result = migrationService
        .migrate(data)
        .from(100)
        .to(200)
        .withContext(context)
        .execute();

    // Analyze diagnostics
    MigrationReport report = context.getReport();
    logReport(report);
}

private void logReport(MigrationReport report) {
    logger.info("Migration: {}", report.toSummary());

    for (String warning : report.warnings()) {
        logger.warn("  Warning: {}", warning);
    }

    for (FixExecution fix : report.fixExecutions()) {
        logger.debug("  Fix '{}': {}ms, {} rules ({} matched)",
            fix.fixName(),
            fix.durationMillis(),
            fix.ruleCount(),
            fix.matchedRuleCount());
    }
}
```

### Conditional Diagnostics in Production

```java
@Value("${aether.datafixers.diagnostics.enabled:false}")
private boolean diagnosticsEnabled;

public MigrationResult migrate(TaggedDynamic data) {
    MigrationService.MigrationBuilder builder = migrationService
        .migrate(data)
        .from(100)
        .to(200);

    if (diagnosticsEnabled) {
        DiagnosticContext context = DiagnosticContext.create(DiagnosticOptions.minimal());
        builder.withContext(context);
    }

    return builder.execute();
}
```

---

## Common Debugging Scenarios

### Scenario: Migration Produces Wrong Output

1. Enable snapshots
2. Compare `inputSnapshot` with `outputSnapshot`
3. Check each fix's before/after snapshots
4. Identify which fix introduced the problem

### Scenario: Migration is Slow

1. Enable `DiagnosticOptions.minimal()` (low overhead)
2. Check `report.totalDuration()`
3. Sort fixes by duration
4. 
Profile the slowest fix

```java
report.fixExecutions().stream()
    .sorted((a, b) -> Long.compare(b.durationMillis(), a.durationMillis()))
    .limit(5)
    .forEach(fix -> System.out.println(fix.fixName() + ": " + fix.durationMillis() + "ms"));
```

### Scenario: Warning During Migration

1. Check `report.warnings()`
2. Enable rule details to see which rule emitted the warning
3. Review the fix implementation for `context.warn()` calls

---

## Related

- [Error Scenarios](error-scenarios.md) — Exception handling reference
- [How to Use Diagnostics](../how-to/use-diagnostics.md) — Full API reference
- [How to Debug Migrations](../how-to/debug-migrations.md) — Basic debugging tips
- [Monitoring & Alerting](monitoring-alerting.md) — Production monitoring

diff --git a/docs/operations/error-scenarios.md b/docs/operations/error-scenarios.md
new file mode 100644
index 0000000..2613a62
--- /dev/null
+++ b/docs/operations/error-scenarios.md
@@ -0,0 +1,459 @@
# Error Scenarios

Detailed guide to handling exceptions in Aether Datafixers production environments.

## Exception Hierarchy Quick Reference

| Exception            | Context Fields                                          | Common Causes              |
|----------------------|---------------------------------------------------------|----------------------------|
| `DataFixerException` | `context`                                               | Base class for all errors  |
| `FixException`       | `fixName`, `fromVersion`, `toVersion`, `typeReference`  | Fix logic failure          |
| `DecodeException`    | `typeReference`, `path`                                 | Invalid input data         |
| `EncodeException`    | `typeReference`, `failedValue`                          | Serialization failure      |
| `RegistryException`  | `missingType`, `missingVersion`                         | Missing registration       |

All exceptions extend `RuntimeException` (unchecked) and are immutable/thread-safe.

---

## FixException

Thrown when a DataFix fails to transform data from one version to another.

### Context Fields

| Field           | Accessor             | Description                 |
|-----------------|----------------------|-----------------------------|
| `fixName`       | `getFixName()`       | Name of the fix that failed |
| `fromVersion`   | `getFromVersion()`   | Source version of migration |
| `toVersion`     | `getToVersion()`     | Target version of migration |
| `typeReference` | `getTypeReference()` | Type being transformed      |

### Context String Format

```
fix=rename_player_name, version=100->200, type=player
```

### Common Causes

1. **Invalid input data** — Data doesn't match expected schema
2. **Missing required field** — Fix expects a field that doesn't exist
3. **Type mismatch** — Expected string but found number
4. **Rule application failure** — TypeRewriteRule failed to apply
5. **Null pointer** — Fix logic encountered null unexpectedly

### Resolution Steps

```java
try {
    Dynamic<?> result = fixer.update(typeRef, data, fromVersion, toVersion);
} catch (FixException e) {
    // 1. Log with full context
    logger.error("Migration failed: {} [{}]", e.getMessage(), e.getContext());

    // 2. Extract specific fields for analysis
    if (e.getFixName() != null) {
        logger.error("  Fix: {}", e.getFixName());
    }
    if (e.getFromVersion() != null && e.getToVersion() != null) {
        logger.error("  Version: {} -> {}",
            e.getFromVersion().getVersion(),
            e.getToVersion().getVersion());
    }
    if (e.getTypeReference() != null) {
        logger.error("  Type: {}", e.getTypeReference().getId());
    }

    // 3. 
Check root cause
    if (e.getCause() != null) {
        logger.error("  Root cause: {}", e.getCause().getMessage());
    }
}
```

### Diagnostic Integration

```java
// Use DiagnosticContext to capture snapshots
DiagnosticContext ctx = DiagnosticContext.create(
    DiagnosticOptions.builder()
        .captureSnapshots(true)
        .build()
);

try {
    fixer.update(typeRef, data, fromVersion, toVersion, ctx);
} catch (FixException e) {
    MigrationReport report = ctx.getReport();

    // Find which fix ran last (the one that failed)
    List<FixExecution> fixes = report.fixExecutions();
    if (!fixes.isEmpty()) {
        FixExecution lastFix = fixes.get(fixes.size() - 1);
        logger.error("Last fix before failure: {}", lastFix.fixName());
        lastFix.beforeSnapshotOpt().ifPresent(snap ->
            logger.error("Data before fix: {}", snap));
    }
}
```

---

## DecodeException

Thrown when deserialization from Dynamic to typed Java object fails.

### Context Fields

| Field           | Accessor             | Description                               |
|-----------------|----------------------|-------------------------------------------|
| `typeReference` | `getTypeReference()` | Type being decoded                        |
| `path`          | `getPath()`          | Location in data structure (dot notation) |

### Context String Format

```
type=player, path=inventory[0].item.name
```

### Path Notation

The path uses dot notation with array indices:
- `player.name` — Field `name` in object `player`
- `inventory[0]` — First element of array `inventory`
- `inventory[0].item.damage` — Nested field access

### Common Causes

1. **Missing required field** — Schema expects field that doesn't exist
2. **Invalid field type** — Expected number, got string
3. **Malformed data** — Corrupt or truncated input
4. **Schema mismatch** — Data version doesn't match expected schema
5. **Null value** — Non-nullable field is null

### Resolution Steps

```java
try {
    Typed<?> typed = fixer.decode(version, typeRef, dynamic);
} catch (DecodeException e) {
    logger.error("Decode failed: {} [{}]", e.getMessage(), e.getContext());

    // Path tells you exactly where the problem is
    if (e.getPath() != null) {
        logger.error("Problem location: {}", e.getPath());

        // Navigate to the problematic field
        String[] pathParts = e.getPath().split("\\.");
        Dynamic<?> current = dynamic;
        for (String part : pathParts) {
            if (part.contains("[")) {
                // Array access
                String fieldName = part.substring(0, part.indexOf('['));
                int index = Integer.parseInt(
                    part.substring(part.indexOf('[') + 1, part.indexOf(']')));
                current = current.get(fieldName).get(index);
            } else {
                current = current.get(part);
            }
            logger.debug("  {} = {}", part, current.getValue());
        }
    }
}
```

### Data Inspection

```java
// Inspect the raw data at the failing path
DecodeException e = ...;
if (e.getPath() != null && e.getPath().contains(".")) {
    String parentPath = e.getPath().substring(0, e.getPath().lastIndexOf('.'));
    String fieldName = e.getPath().substring(e.getPath().lastIndexOf('.') + 1);

    logger.error("Parent object fields at '{}': {}", parentPath,
        navigateTo(dynamic, parentPath).asMap().keySet());
}
```

---

## EncodeException

Thrown when serialization from a Java object to its Dynamic representation fails.
### Context Fields

| Field           | Accessor             | Description                 |
|-----------------|----------------------|-----------------------------|
| `typeReference` | `getTypeReference()` | Type being encoded          |
| `failedValue`   | `getFailedValue()`   | Value that failed to encode |

### Context String Format

```
type=player
```

### Common Causes

1. **Null value** — Required field is null
2. **Unsupported type** — Codec doesn't support the value type
3. **Codec misconfiguration** — Encoder not properly set up
4. **Circular reference** — Object graph contains cycles

### Resolution Steps

```java
try {
    Dynamic<?> encoded = fixer.encode(version, typeRef, value, ops);
} catch (EncodeException e) {
    logger.error("Encode failed: {} [{}]", e.getMessage(), e.getContext());

    // Inspect the failed value (be careful with sensitive data)
    if (e.getFailedValue() != null) {
        logger.error("Failed value class: {}", e.getFailedValue().getClass().getName());
        // Only log non-sensitive values
        if (isSafeToLog(e.getFailedValue())) {
            logger.error("Failed value: {}", e.getFailedValue());
        }
    }

    // Check if it's a null issue
    if (e.getCause() instanceof NullPointerException) {
        logger.error("Null value encountered - check required fields");
    }
}
```

### Sensitive Data Warning

The `failedValue` may contain sensitive information (passwords, tokens, PII). Always sanitize before logging:

```java
private boolean isSafeToLog(Object value) {
    // Don't log objects that might contain sensitive data
    if (value instanceof String) {
        String str = (String) value;
        return str.length() < 100 && !str.toLowerCase().contains("password");
    }
    return value instanceof Number || value instanceof Boolean;
}
```

---

## RegistryException

Thrown when a registry lookup fails (type, schema, or codec not found).

### Context Fields

| Field            | Accessor              | Description             |
|------------------|-----------------------|-------------------------|
| `missingType`    | `getMissingType()`    | TypeReference not found |
| `missingVersion` | `getMissingVersion()` | DataVersion not found   |

### Context String Format

```
type=custom_entity, version=150
```

### Common Causes

1. **Type not registered** — Forgot to register type in bootstrap
2. **Schema not registered** — Version not registered in SchemaRegistry
3. **Version gap** — No schema exists for intermediate version
4. 
**Typo in TypeReference** — Type ID doesn't match registration

### Resolution Steps

```java
try {
    Schema schema = schemaRegistry.require(version);
} catch (RegistryException e) {
    logger.error("Registry lookup failed: {} [{}]", e.getMessage(), e.getContext());

    if (e.getMissingVersion() != null) {
        logger.error("Missing schema for version: {}",
            e.getMissingVersion().getVersion());

        // List available versions
        logger.info("Available versions: {}",
            schemaRegistry.getVersions().stream()
                .map(v -> String.valueOf(v.getVersion()))
                .collect(Collectors.joining(", ")));
    }

    if (e.getMissingType() != null) {
        logger.error("Missing type: {}", e.getMissingType().getId());

        // List registered types (at current version if available)
        logger.info("Registered types: {}",
            typeRegistry.getRegisteredTypes().stream()
                .map(TypeReference::getId)
                .collect(Collectors.joining(", ")));
    }
}
```

### Bootstrap Verification Checklist

When encountering RegistryException:

- [ ] Check `registerSchemas()` includes the required version
- [ ] Check type is registered in the schema for that version
- [ ] Verify no gaps in version chain (e.g., 100 -> 200 needs fixes, not just schemas)
- [ ] Check for typos in TypeReference IDs
- [ ] Verify bootstrap is loaded (not null)

---

## Schema Mismatch Scenarios

### Data Version Doesn't Match Expected

**Symptom**: Migration produces unexpected results or fails silently.

**Detection**:

```java
// Check data version before migration
Optional<Integer> dataVersion = dynamic.get("_version").asNumber()
    .map(Number::intValue);

if (dataVersion.isEmpty()) {
    logger.warn("Data has no version field - assuming oldest version");
}

int fromVersion = dataVersion.orElse(OLDEST_VERSION);
if (fromVersion > currentVersion.getVersion()) {
    throw new IllegalStateException(
        "Data version " + fromVersion + " is newer than current " + currentVersion);
}
```

### Type Structure Changed Without Fix

**Symptom**: Fields missing or have wrong type after migration.

**Detection**:

```java
// Use SchemaValidator to detect coverage gaps
ValidationResult result = SchemaValidator.forBootstrap(bootstrap)
    .validateFixCoverage()
    .validate();

if (!result.isValid()) {
    for (String error : result.getErrors()) {
        logger.error("Schema validation error: {}", error);
    }
}
```

**Resolution**: Write a DataFix to handle the schema change.
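For reference, a minimal sketch of such a fix is shown below. It reuses the `SchemaDataFix`/`Rules` style from the compensating-fix example in [Recovery Procedures](recovery-procedures.md); the fix name and field names here are illustrative placeholders.

```java
// Sketch: cover a simple structural change (a field rename) with a DataFix.
// 'score'/'points' are hypothetical fields; TypeReferences.PLAYER follows
// the conventions used in the other examples in these docs.
public class RenameScoreToPointsFix extends SchemaDataFix {

    public RenameScoreToPointsFix(Schema inputSchema, Schema outputSchema) {
        super("rename_score_to_points", inputSchema, outputSchema);
    }

    @Override
    protected TypeRewriteRule makeRule(Schema inputSchema, Schema outputSchema) {
        // Rewrites every 'player' record, renaming 'score' to 'points'
        return Rules.renameField(TypeReferences.PLAYER, "score", "points");
    }
}
```

Registered in the bootstrap between the affected schema versions, the fix becomes part of the normal migration chain.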
+ +--- + +## Extracting Exception Context + +### Complete Context Extraction + +```java +public class ExceptionAnalyzer { + + public static void logException(DataFixerException e) { + StringBuilder sb = new StringBuilder(); + sb.append("Exception: ").append(e.getClass().getSimpleName()).append("\n"); + sb.append("Message: ").append(e.getMessage()).append("\n"); + + if (e.getContext() != null) { + sb.append("Context: ").append(e.getContext()).append("\n"); + } + + // Type-specific extraction + if (e instanceof FixException fix) { + if (fix.getFixName() != null) { + sb.append("Fix Name: ").append(fix.getFixName()).append("\n"); + } + if (fix.getFromVersion() != null) { + sb.append("From Version: ").append(fix.getFromVersion().getVersion()).append("\n"); + } + if (fix.getToVersion() != null) { + sb.append("To Version: ").append(fix.getToVersion().getVersion()).append("\n"); + } + if (fix.getTypeReference() != null) { + sb.append("Type: ").append(fix.getTypeReference().getId()).append("\n"); + } + } else if (e instanceof DecodeException decode) { + if (decode.getTypeReference() != null) { + sb.append("Type: ").append(decode.getTypeReference().getId()).append("\n"); + } + if (decode.getPath() != null) { + sb.append("Path: ").append(decode.getPath()).append("\n"); + } + } else if (e instanceof EncodeException encode) { + if (encode.getTypeReference() != null) { + sb.append("Type: ").append(encode.getTypeReference().getId()).append("\n"); + } + // Be careful with failedValue - may contain sensitive data + } else if (e instanceof RegistryException registry) { + if (registry.getMissingType() != null) { + sb.append("Missing Type: ").append(registry.getMissingType().getId()).append("\n"); + } + if (registry.getMissingVersion() != null) { + sb.append("Missing Version: ").append(registry.getMissingVersion().getVersion()).append("\n"); + } + } + + // Root cause chain + Throwable cause = e.getCause(); + int depth = 0; + while (cause != null && depth < 5) { + sb.append("Caused by: ").append(cause.getClass().getSimpleName()) + .append(": ").append(cause.getMessage()).append("\n"); + cause = cause.getCause(); + depth++; + } + + System.err.println(sb); + } +} +``` + +### Logging Pattern for Production + +```xml + +%d{ISO8601} [%thread] %-5level %logger{36} - %msg%n%ex{full} +``` + +```java +// Structured logging with MDC +import org.slf4j.MDC; + +try { + fixer.update(typeRef, data, fromVersion, toVersion); +} catch (FixException e) { + MDC.put("fix_name", e.getFixName()); + MDC.put("from_version", String.valueOf(e.getFromVersion())); + MDC.put("to_version", String.valueOf(e.getToVersion())); + MDC.put("type", e.getTypeReference() != null ? e.getTypeReference().getId() : "unknown"); + + logger.error("Migration failed", e); + + MDC.clear(); +} +``` + +--- + +## Related + +- [Debugging Guide](debugging-guide.md) — Systematic debugging approach +- [Recovery Procedures](recovery-procedures.md) — How to recover from failures +- [Common Errors](../troubleshooting/common-errors.md) — Quick error reference +- [How to Use Diagnostics](../how-to/use-diagnostics.md) — Diagnostic API reference diff --git a/docs/operations/index.md b/docs/operations/index.md new file mode 100644 index 0000000..9de9d5e --- /dev/null +++ b/docs/operations/index.md @@ -0,0 +1,128 @@ +# Operations Runbook + +Operational guidance for running Aether Datafixers in production environments. This runbook covers error handling, debugging, monitoring, and recovery procedures. 
+ +## Quick Reference + +| Scenario | Document | Key Actions | +|--------------------------------|-------------------------------------------------|---------------------------------------| +| Migration fails with exception | [Error Scenarios](error-scenarios.md) | Extract context, check exception type | +| Need detailed migration trace | [Debugging Guide](debugging-guide.md) | Enable `DiagnosticContext` | +| Set up production monitoring | [Monitoring & Alerting](monitoring-alerting.md) | Configure Micrometer metrics | +| Partial migration / data loss | [Recovery Procedures](recovery-procedures.md) | Restore from backup, retry | + +## Documentation Structure + +### [Error Scenarios](error-scenarios.md) + +Exception handling reference for production troubleshooting: +- Exception hierarchy and context fields +- `FixException` — Migration logic failures +- `DecodeException` — Deserialization failures +- `EncodeException` — Serialization failures +- `RegistryException` — Missing type or version +- Schema mismatch detection and resolution + +### [Debugging Guide](debugging-guide.md) + +Systematic approach to diagnosing migration issues: +- SLF4J configuration (Logback, Log4j2) +- Using `MigrationReport` for diagnostics +- Tracing fix application order +- Step-by-step debugging workflow + +### [Monitoring & Alerting](monitoring-alerting.md) + +Production monitoring setup: +- Micrometer metrics reference +- Recommended alert thresholds +- Prometheus alerting rules +- Grafana dashboard templates +- Actuator health integration + +### [Recovery Procedures](recovery-procedures.md) + +Handling failures and data recovery: +- Backup recommendations +- Partial migration recovery +- Rollback strategies +- Incident response workflows + +--- + +## Emergency Response + +### Migration Completely Failed + +1. **Check metrics** — Look at `aether.datafixers.migrations.failure` counter +2. **Extract context** — See [Error Scenarios](error-scenarios.md#extracting-exception-context) +3. **Enable DEBUG** — Set log level for `de.splatgames.aether.datafixers` +4. **Capture diagnostics** — Use `DiagnosticContext` on a sample record +5. **Restore if needed** — See [Recovery Procedures](recovery-procedures.md) + +### High Failure Rate Alert + +1. **Check error breakdown** — Query failures by `error_type` tag +2. **Identify pattern** — Same exception? Same data version? +3. **Isolate bad records** — Query for records at problematic version +4. **Apply targeted fix** — Fix data or code, retry migration + +### Slow Migration Alert + +1. **Check version span** — Large version jumps take longer +2. **Profile fixes** — Use `MigrationReport.fixExecutions()` timing +3. **Check data size** — Large objects slow down processing +4. 
**Consider batching** — Process in smaller batches

---

## Health Checks

### Actuator Endpoints

| Endpoint               | Purpose                   |
|------------------------|---------------------------|
| `/actuator/health`     | UP/DOWN status per domain |
| `/actuator/info`       | Version information       |
| `/actuator/datafixers` | Detailed domain status    |
| `/actuator/prometheus` | Metrics export            |

### Kubernetes Probes

```yaml
livenessProbe:
  httpGet:
    path: /actuator/health/liveness
    port: 8080
  initialDelaySeconds: 30
  periodSeconds: 10

readinessProbe:
  httpGet:
    path: /actuator/health/readiness
    port: 8080
  initialDelaySeconds: 10
  periodSeconds: 5
```

---

## Key Metrics Overview

| Metric                                       | Type         | Alert Threshold |
|----------------------------------------------|--------------|-----------------|
| `aether.datafixers.migrations.success`       | Counter      | —               |
| `aether.datafixers.migrations.failure`       | Counter      | > 0 per minute  |
| `aether.datafixers.migrations.duration`      | Timer        | p99 > 1s        |
| `aether.datafixers.migrations.version.span`  | Distribution | avg > 50        |

See [Monitoring & Alerting](monitoring-alerting.md) for complete metrics reference.

---

## Related

- [Troubleshooting](../troubleshooting/index.md) — Basic troubleshooting tips
- [Spring Boot Metrics](../spring-boot/metrics.md) — Detailed metrics reference
- [Spring Boot Actuator](../spring-boot/actuator.md) — Actuator integration
- [How to Use Diagnostics](../how-to/use-diagnostics.md) — Diagnostic API reference

diff --git a/docs/operations/monitoring-alerting.md b/docs/operations/monitoring-alerting.md
new file mode 100644
index 0000000..2c0b9ae
--- /dev/null
+++ b/docs/operations/monitoring-alerting.md
@@ -0,0 +1,646 @@
# Monitoring & Alerting

Production monitoring setup for Aether Datafixers using Micrometer, Prometheus, and Grafana.

## Metric Quick Reference

| Metric                                       | Type         | Tags                   | Alert Threshold | Description           |
|----------------------------------------------|--------------|------------------------|-----------------|-----------------------|
| `aether.datafixers.migrations.success`       | Counter      | `domain`               | —               | Successful migrations |
| `aether.datafixers.migrations.failure`       | Counter      | `domain`, `error_type` | > 0/min         | Failed migrations     |
| `aether.datafixers.migrations.duration`      | Timer        | `domain`               | p99 > 1s        | Execution time        |
| `aether.datafixers.migrations.version.span`  | Distribution | `domain`               | avg > 50        | Version distance      |

All metrics use the prefix `aether.datafixers.migrations`.

---

## Metric Details

### Success Counter

Tracks total successful migrations per domain.

**Prometheus format:**
```
aether_datafixers_migrations_success_total{domain="game"} 1234
```

**Use cases:**
- Calculate success rate
- Monitor throughput
- Track migration activity

### Failure Counter

Tracks failed migrations with error type breakdown.

**Prometheus format:**
```
aether_datafixers_migrations_failure_total{domain="game",error_type="FixException"} 5
aether_datafixers_migrations_failure_total{domain="game",error_type="DecodeException"} 2
```

**Tags:**
- `domain` — DataFixer domain name
- `error_type` — Exception class simple name

### Duration Timer

Tracks execution time distribution (includes both success and failure).
+ +**Prometheus format:** +``` +aether_datafixers_migrations_duration_seconds_count{domain="game"} 1239 +aether_datafixers_migrations_duration_seconds_sum{domain="game"} 185.7 +aether_datafixers_migrations_duration_seconds_bucket{domain="game",le="0.01"} 500 +aether_datafixers_migrations_duration_seconds_bucket{domain="game",le="0.1"} 1100 +aether_datafixers_migrations_duration_seconds_bucket{domain="game",le="1.0"} 1230 +aether_datafixers_migrations_duration_seconds_bucket{domain="game",le="+Inf"} 1239 +``` + +### Version Span Distribution + +Tracks the distance between source and target versions (indicates data age). + +**Prometheus format:** +``` +aether_datafixers_migrations_version_span_count{domain="game"} 1234 +aether_datafixers_migrations_version_span_sum{domain="game"} 45600 +aether_datafixers_migrations_version_span_max{domain="game"} 150 +``` + +--- + +## Recommended Alert Thresholds + +### Critical Alerts (Page On-Call) + +| Alert | Condition | Duration | Action | +|------------------------|---------------------|----------|-------------------------| +| High Failure Rate | > 5% failures | 5m | Immediate investigation | +| All Migrations Failing | 100% failure rate | 2m | Emergency response | +| Service Down | No metrics reported | 5m | Check service health | + +### Warning Alerts (Notify Team) + +| Alert | Condition | Duration | Action | +|-----------------------|---------------|----------|-----------------------------------| +| Elevated Failure Rate | > 1% failures | 5m | Investigate during business hours | +| Slow Migrations | p95 > 1s | 5m | Performance review | +| Very Slow Migrations | p99 > 5s | 5m | Profile and optimize | + +### Informational Alerts (Dashboard/Log) + +| Alert | Condition | Duration | Action | +|--------------------|----------------|----------|-----------------------------| +| Large Version Span | avg span > 50 | 1h | Review data freshness | +| Very Large Span | max span > 200 | 1h | Identify stale data sources | +| No Activity | 0 migrations | 1h | Verify expected behavior | + +--- + +## Prometheus Alert Rules + +### Complete Alert Configuration + +```yaml +groups: + - name: aether-datafixers-critical + rules: + # High failure rate - immediate attention + - alert: DataFixerHighFailureRate + expr: | + ( + sum(rate(aether_datafixers_migrations_failure_total[5m])) by (domain) + / ( + sum(rate(aether_datafixers_migrations_success_total[5m])) by (domain) + + sum(rate(aether_datafixers_migrations_failure_total[5m])) by (domain) + ) + ) > 0.05 + for: 5m + labels: + severity: critical + annotations: + summary: "Critical: DataFixer failure rate > 5% in domain {{ $labels.domain }}" + description: "Failure rate is {{ $value | humanizePercentage }}. Check error logs and metrics." + runbook_url: "https://docs.example.com/runbooks/datafixer-high-failure" + + # All migrations failing + - alert: DataFixerAllFailing + expr: | + sum(rate(aether_datafixers_migrations_success_total[2m])) by (domain) == 0 + and sum(rate(aether_datafixers_migrations_failure_total[2m])) by (domain) > 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Critical: All migrations failing in domain {{ $labels.domain }}" + description: "Zero successful migrations with active failures. Immediate attention required." 
+ runbook_url: "https://docs.example.com/runbooks/datafixer-total-failure" + + - name: aether-datafixers-warning + rules: + # Elevated failure rate + - alert: DataFixerElevatedFailureRate + expr: | + ( + sum(rate(aether_datafixers_migrations_failure_total[5m])) by (domain) + / ( + sum(rate(aether_datafixers_migrations_success_total[5m])) by (domain) + + sum(rate(aether_datafixers_migrations_failure_total[5m])) by (domain) + ) + ) > 0.01 + for: 5m + labels: + severity: warning + annotations: + summary: "Warning: DataFixer failure rate > 1% in domain {{ $labels.domain }}" + description: "Failure rate is {{ $value | humanizePercentage }}. Investigate soon." + + # Slow migrations (p95) + - alert: DataFixerSlowMigrations + expr: | + histogram_quantile(0.95, + sum(rate(aether_datafixers_migrations_duration_seconds_bucket[5m])) by (le, domain) + ) > 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Warning: Slow migrations in domain {{ $labels.domain }}" + description: "p95 migration duration is {{ $value | humanizeDuration }}. Review performance." + + # Very slow migrations (p99) + - alert: DataFixerVerySlowMigrations + expr: | + histogram_quantile(0.99, + sum(rate(aether_datafixers_migrations_duration_seconds_bucket[5m])) by (le, domain) + ) > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "Warning: Very slow migrations in domain {{ $labels.domain }}" + description: "p99 migration duration is {{ $value | humanizeDuration }}. Profile and optimize." + + - name: aether-datafixers-info + rules: + # Large version span + - alert: DataFixerLargeVersionSpan + expr: | + ( + rate(aether_datafixers_migrations_version_span_sum[1h]) + / rate(aether_datafixers_migrations_version_span_count[1h]) + ) > 50 + for: 1h + labels: + severity: info + annotations: + summary: "Info: Large version span in domain {{ $labels.domain }}" + description: "Average span is {{ $value }} versions. Consider data freshness review." + + # No migration activity + - alert: DataFixerNoActivity + expr: | + sum(rate(aether_datafixers_migrations_success_total[1h])) by (domain) == 0 + and sum(rate(aether_datafixers_migrations_failure_total[1h])) by (domain) == 0 + for: 1h + labels: + severity: info + annotations: + summary: "Info: No migration activity in domain {{ $labels.domain }}" + description: "No migrations in the last hour. Verify this is expected." 
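  # Optional sketch: a recording rule that precomputes the success ratio the
  # alerts above derive inline. Assumes the same metric names as this document;
  # the rule name is illustrative.
  - name: aether-datafixers-recording
    rules:
      - record: aether:datafixers:success_ratio:rate5m
        expr: |
          sum(rate(aether_datafixers_migrations_success_total[5m])) by (domain)
          / (
            sum(rate(aether_datafixers_migrations_success_total[5m])) by (domain)
            + sum(rate(aether_datafixers_migrations_failure_total[5m])) by (domain)
          )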
+``` + +--- + +## Grafana Dashboard + +### Complete Dashboard JSON + +```json +{ + "title": "Aether DataFixers Operations", + "uid": "aether-datafixers-ops", + "tags": ["aether", "datafixers", "migrations"], + "timezone": "browser", + "refresh": "30s", + "panels": [ + { + "title": "Migration Rate", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}, + "targets": [ + { + "expr": "sum(rate(aether_datafixers_migrations_success_total[5m])) by (domain)", + "legendFormat": "Success ({{domain}})" + }, + { + "expr": "sum(rate(aether_datafixers_migrations_failure_total[5m])) by (domain)", + "legendFormat": "Failure ({{domain}})" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + } + } + }, + { + "title": "Success Rate", + "type": "gauge", + "gridPos": {"h": 8, "w": 6, "x": 12, "y": 0}, + "targets": [ + { + "expr": "(sum(rate(aether_datafixers_migrations_success_total[1h])) / (sum(rate(aether_datafixers_migrations_success_total[1h])) + sum(rate(aether_datafixers_migrations_failure_total[1h])))) * 100" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "min": 0, + "max": 100, + "thresholds": { + "steps": [ + {"color": "red", "value": 0}, + {"color": "yellow", "value": 95}, + {"color": "green", "value": 99} + ] + } + } + } + }, + { + "title": "Current Error Rate", + "type": "stat", + "gridPos": {"h": 8, "w": 6, "x": 18, "y": 0}, + "targets": [ + { + "expr": "sum(rate(aether_datafixers_migrations_failure_total[5m])) / (sum(rate(aether_datafixers_migrations_success_total[5m])) + sum(rate(aether_datafixers_migrations_failure_total[5m]))) * 100" + } + ], + "fieldConfig": { + "defaults": { + "unit": "percent", + "thresholds": { + "steps": [ + {"color": "green", "value": 0}, + {"color": "yellow", "value": 1}, + {"color": "red", "value": 5} + ] + } + } + } + }, + { + "title": "Migration Duration Percentiles", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}, + "targets": [ + { + "expr": "histogram_quantile(0.50, sum(rate(aether_datafixers_migrations_duration_seconds_bucket[5m])) by (le, domain))", + "legendFormat": "p50 ({{domain}})" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(aether_datafixers_migrations_duration_seconds_bucket[5m])) by (le, domain))", + "legendFormat": "p95 ({{domain}})" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(aether_datafixers_migrations_duration_seconds_bucket[5m])) by (le, domain))", + "legendFormat": "p99 ({{domain}})" + } + ], + "fieldConfig": { + "defaults": { + "unit": "s" + } + } + }, + { + "title": "Version Span Distribution", + "type": "timeseries", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}, + "targets": [ + { + "expr": "rate(aether_datafixers_migrations_version_span_sum[5m]) / rate(aether_datafixers_migrations_version_span_count[5m])", + "legendFormat": "Avg Span ({{domain}})" + }, + { + "expr": "aether_datafixers_migrations_version_span_max", + "legendFormat": "Max Span ({{domain}})" + } + ] + }, + { + "title": "Failures by Error Type", + "type": "piechart", + "gridPos": {"h": 8, "w": 8, "x": 0, "y": 16}, + "targets": [ + { + "expr": "sum(increase(aether_datafixers_migrations_failure_total[24h])) by (error_type)", + "legendFormat": "{{error_type}}" + } + ] + }, + { + "title": "Failure Rate by Domain", + "type": "timeseries", + "gridPos": {"h": 8, "w": 8, "x": 8, "y": 16}, + "targets": [ + { + "expr": "sum(rate(aether_datafixers_migrations_failure_total[5m])) by (domain)", + "legendFormat": "{{domain}}" + } + ], + "fieldConfig": { + "defaults": { + "unit": "ops" + } + } + }, + { + 
"title": "Recent Errors (Last 1h)", + "type": "table", + "gridPos": {"h": 8, "w": 8, "x": 16, "y": 16}, + "targets": [ + { + "expr": "sum(increase(aether_datafixers_migrations_failure_total[1h])) by (domain, error_type) > 0", + "format": "table", + "instant": true + } + ], + "transformations": [ + { + "id": "organize", + "options": { + "renameByName": { + "domain": "Domain", + "error_type": "Error Type", + "Value": "Count" + } + } + } + ] + } + ] +} +``` + +### Dashboard Import + +1. Go to Grafana > Dashboards > Import +2. Paste the JSON above +3. Select your Prometheus data source +4. Click Import + +--- + +## Actuator Integration + +### Health Endpoint + +The DataFixer health indicator reports UP/DOWN status per domain. + +**Request:** +```bash +curl http://localhost:8080/actuator/health +``` + +**Response:** +```json +{ + "status": "UP", + "components": { + "datafixer": { + "status": "UP", + "details": { + "totalDomains": 2, + "default.status": "UP", + "default.currentVersion": 200, + "game.status": "UP", + "game.currentVersion": 150 + } + } + } +} +``` + +### Custom Endpoint + +Get detailed DataFixer information at `/actuator/datafixers`: + +```bash +curl http://localhost:8080/actuator/datafixers +``` + +```json +{ + "domains": { + "default": { + "currentVersion": 200, + "status": "UP" + }, + "game": { + "currentVersion": 150, + "status": "UP" + } + } +} +``` + +### Kubernetes Probes + +```yaml +apiVersion: v1 +kind: Pod +spec: + containers: + - name: app + livenessProbe: + httpGet: + path: /actuator/health/liveness + port: 8080 + initialDelaySeconds: 30 + periodSeconds: 10 + failureThreshold: 3 + + readinessProbe: + httpGet: + path: /actuator/health/readiness + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + failureThreshold: 3 + + startupProbe: + httpGet: + path: /actuator/health + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 5 + failureThreshold: 30 +``` + +### Prometheus Scraping Actuator + +```yaml +# prometheus.yml +scrape_configs: + - job_name: 'spring-actuator' + metrics_path: '/actuator/prometheus' + static_configs: + - targets: ['app:8080'] + scrape_interval: 15s +``` + +--- + +## Multi-Domain Monitoring + +### Per-Domain Dashboards + +Use Grafana variables to filter by domain: + +```json +{ + "templating": { + "list": [ + { + "name": "domain", + "type": "query", + "query": "label_values(aether_datafixers_migrations_success_total, domain)", + "refresh": 2 + } + ] + } +} +``` + +Then use `domain=~\"$domain\"` in queries: + +```promql +sum(rate(aether_datafixers_migrations_success_total{domain=~"$domain"}[5m])) +``` + +### Cross-Domain Comparison + +Compare performance across domains: + +```promql +# Success rate by domain +( + sum(rate(aether_datafixers_migrations_success_total[1h])) by (domain) + / ( + sum(rate(aether_datafixers_migrations_success_total[1h])) by (domain) + + sum(rate(aether_datafixers_migrations_failure_total[1h])) by (domain) + ) +) * 100 +``` + +--- + +## PagerDuty Integration + +### Alertmanager Configuration + +```yaml +# alertmanager.yml +global: + pagerduty_url: 'https://events.pagerduty.com/v2/enqueue' + +route: + receiver: 'default' + routes: + - match: + severity: critical + receiver: 'pagerduty-critical' + - match: + severity: warning + receiver: 'slack-warning' + +receivers: + - name: 'default' + email_configs: + - to: 'team@example.com' + + - name: 'pagerduty-critical' + pagerduty_configs: + - service_key: '' + description: '{{ .CommonAnnotations.summary }}' + details: + runbook: '{{ .CommonAnnotations.runbook_url }}' + domain: 
  - name: 'slack-warning'
    slack_configs:
      - api_url: '<slack-webhook-url>'
        channel: '#alerts'
        title: '{{ .CommonAnnotations.summary }}'
        text: '{{ .CommonAnnotations.description }}'
```

---

## Application Configuration

### Enable Metrics

```yaml
# application.yml
aether:
  datafixers:
    enabled: true
    metrics:
      timing: true
      counting: true

management:
  endpoints:
    web:
      exposure:
        include: health, info, prometheus, datafixers
  metrics:
    export:
      prometheus:
        enabled: true
  endpoint:
    health:
      show-details: always
```

### Custom Metrics Extension

```java
@Component
public class ExtendedMigrationMetrics extends MigrationMetrics {

    private final Counter largeSpanCounter;

    public ExtendedMigrationMetrics(MeterRegistry registry) {
        super(registry);
        this.largeSpanCounter = Counter.builder("aether.datafixers.migrations.large_span")
            .description("Migrations with version span > 100")
            .register(registry);
    }

    @Override
    public void recordSuccess(String domain, int fromVersion, int toVersion, Duration duration) {
        super.recordSuccess(domain, fromVersion, toVersion, duration);

        // Track large version spans separately
        if (Math.abs(toVersion - fromVersion) > 100) {
            largeSpanCounter.increment();
        }
    }
}
```

---

## Related

- [Spring Boot Metrics](../spring-boot/metrics.md) — Complete metrics reference
- [Spring Boot Actuator](../spring-boot/actuator.md) — Actuator integration
- [Debugging Guide](debugging-guide.md) — Diagnosing issues
- [Recovery Procedures](recovery-procedures.md) — Responding to alerts

diff --git a/docs/operations/recovery-procedures.md b/docs/operations/recovery-procedures.md
new file mode 100644
index 0000000..43b5eac
--- /dev/null
+++ b/docs/operations/recovery-procedures.md
@@ -0,0 +1,583 @@
# Recovery Procedures

How to recover from migration failures, data issues, and production incidents.

## Quick Reference

| Scenario                 | Procedure              | Complexity |
|--------------------------|------------------------|------------|
| Single record failure    | Retry with diagnostics | Low        |
| Batch failure (< 5%)     | Isolate and retry      | Medium     |
| High failure rate (> 5%) | Stop, investigate, fix | High       |
| Data corruption          | Restore from backup    | High       |
| Schema mismatch          | Version alignment      | Medium     |

---

## Backup Recommendations

### Pre-Migration Backup Strategy

**Before major version bumps:**
1. Create full database backup
2. Verify backup integrity (test restore)
3. Document current schema version
4. Keep backup for rollback window (e.g., 7 days)

**Before routine operations:**
1. Enable point-in-time recovery
2. Verify incremental backups are current
3. 
Document migration batch parameters

### Backup Checklist

```markdown
## Pre-Migration Backup Checklist

- [ ] Database backup completed
- [ ] Backup verified (test restore on staging)
- [ ] Backup retention policy confirmed
- [ ] Schema version documented in backup metadata
- [ ] Rollback procedure documented
- [ ] Team notified of migration window
```

### Database Backup Patterns

**PostgreSQL:**
```bash
# Full backup before migration
pg_dump -Fc -f backup_v100_$(date +%Y%m%d).dump mydb

# With version in filename
pg_dump -Fc -f backup_schema_v100_to_v200_$(date +%Y%m%d_%H%M%S).dump mydb
```

**MongoDB:**
```bash
# Full backup
mongodump --db mydb --out ./backup_v100_$(date +%Y%m%d)

# Specific collection
mongodump --db mydb --collection players --out ./backup_players_v100
```

### Application-Level Snapshots

```java
// Create pre-migration snapshot for critical records.
// loadRecord and serializeToJson are application-specific helpers.
public void createMigrationSnapshot(List<String> recordIds) throws IOException {
    Path snapshotDir = Path.of("snapshots",
        "migration_" + System.currentTimeMillis());
    Files.createDirectories(snapshotDir);

    for (String id : recordIds) {
        Dynamic<?> data = loadRecord(id);
        Path file = snapshotDir.resolve(id + ".json");
        Files.writeString(file, serializeToJson(data));
    }

    logger.info("Created snapshot of {} records at {}",
        recordIds.size(), snapshotDir);
}
```

---

## Partial Migration Recovery

### Detecting Partial Migrations

**Symptoms:**
- Some records at old version, some at new version
- Inconsistent data across related entities
- `aether.datafixers.migrations.failure` spike followed by recovery

**Detection Query (SQL):**
```sql
-- Find version distribution
SELECT data_version, COUNT(*) as count
FROM entities
WHERE type = 'player'
GROUP BY data_version
ORDER BY data_version;

-- Find records still at old version
SELECT id, data_version, updated_at
FROM entities
WHERE type = 'player'
  AND data_version < 200
ORDER BY updated_at DESC;
```

**Detection Query (MongoDB):**
```javascript
// Version distribution
db.entities.aggregate([
  { $match: { type: "player" } },
  { $group: { _id: "$dataVersion", count: { $sum: 1 } } },
  { $sort: { _id: 1 } }
]);

// Records at old version
db.entities.find({
  type: "player",
  dataVersion: { $lt: 200 }
}).sort({ updatedAt: -1 });
```

### Recovery Option 1: Retry Failed Records

Best for: Small number of failures, transient errors.
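The sketch below assumes application-side helpers (`loadRecord`, `saveRecord`, `extractVersion`) and a `TypeReferences.PLAYER` constant; adapt these to your own persistence layer and type registry.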
```java
public class MigrationRetryService {

    private final AetherDataFixer fixer;
    private final Logger logger = LoggerFactory.getLogger(getClass());

    public void retryFailedRecords(List<String> failedIds, int targetVersion) {
        int success = 0;
        int failed = 0;

        for (String id : failedIds) {
            try {
                // Load record
                Dynamic<?> data = loadRecord(id);
                int currentVersion = extractVersion(data);

                // Skip if already migrated
                if (currentVersion >= targetVersion) {
                    logger.info("Record {} already at version {}", id, currentVersion);
                    continue;
                }

                // Enable diagnostics for retry
                DiagnosticContext ctx = DiagnosticContext.create(
                    DiagnosticOptions.builder()
                        .captureSnapshots(true)
                        .build()
                );

                // Retry migration
                Dynamic<?> result = fixer.update(
                    TypeReferences.PLAYER,
                    data,
                    new DataVersion(currentVersion),
                    new DataVersion(targetVersion),
                    ctx
                );

                // Save result
                saveRecord(id, result);
                success++;

            } catch (DataFixerException e) {
                failed++;
                logger.error("Retry failed for record {}: {} [{}]",
                    id, e.getMessage(), e.getContext());
            }
        }

        logger.info("Retry complete: {} success, {} failed", success, failed);
    }
}
```

### Recovery Option 2: Isolate and Skip

Best for: Specific data patterns causing failures.

```java
public class MigrationIsolationService {

    public void migrateWithIsolation(Stream<Dynamic<?>> records, int targetVersion) {
        List<String> quarantined = new ArrayList<>();

        records.forEach(data -> {
            String id = extractId(data);
            try {
                Dynamic<?> result = fixer.update(
                    TypeReferences.PLAYER,
                    data,
                    new DataVersion(extractVersion(data)),
                    new DataVersion(targetVersion)
                );
                saveRecord(id, result);
            } catch (DataFixerException e) {
                // Quarantine failed record
                quarantined.add(id);
                saveToQuarantine(id, data, e);
                logger.warn("Quarantined record {}: {}", id, e.getMessage());
            }
        });

        if (!quarantined.isEmpty()) {
            logger.warn("Migration complete with {} quarantined records", quarantined.size());
            notifyTeam(quarantined);
        }
    }

    private void saveToQuarantine(String id, Dynamic<?> data, DataFixerException e) {
        // Save to quarantine table/collection for manual review
        QuarantineRecord record = new QuarantineRecord(
            id,
            serializeToJson(data),
            e.getClass().getSimpleName(),
            e.getMessage(),
            e.getContext(),
            Instant.now()
        );
        quarantineRepository.save(record);
    }
}
```

### Recovery Option 3: Manual Intervention

Best for: Complex data issues requiring human judgment.
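The export/import sketch below assumes application-specific JSON helpers (`prettyJson`, `parseJson`, `createDynamic`) and expects reviewers to write the corrected payload back with a `fixedVersion` field.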
```java
public class ManualRecoveryService {

    public void exportForManualReview(List<String> recordIds) throws IOException {
        Path exportDir = Path.of("manual_review",
            LocalDate.now().toString());
        Files.createDirectories(exportDir);

        for (String id : recordIds) {
            Dynamic<?> data = loadRecord(id);

            // Export with metadata
            Map<String, Object> export = new LinkedHashMap<>();
            export.put("id", id);
            export.put("currentVersion", extractVersion(data));
            export.put("targetVersion", CURRENT_VERSION);
            export.put("data", data.getValue());
            export.put("exportedAt", Instant.now().toString());

            Path file = exportDir.resolve(id + ".json");
            Files.writeString(file, prettyJson(export));
        }

        logger.info("Exported {} records to {} for manual review",
            recordIds.size(), exportDir);
    }

    public void importManualFixes(Path fixesDir) throws IOException {
        try (Stream<Path> files = Files.list(fixesDir)) {
            files.filter(p -> p.toString().endsWith(".json"))
                .forEach(file -> {
                    try {
                        Map<String, Object> fixed = parseJson(Files.readString(file));
                        String id = (String) fixed.get("id");
                        Object data = fixed.get("data");
                        int version = ((Number) fixed.get("fixedVersion")).intValue();

                        saveRecord(id, createDynamic(data, version));
                        logger.info("Imported manual fix for record {}", id);
                    } catch (Exception e) {
                        logger.error("Failed to import {}: {}", file, e.getMessage());
                    }
                });
        }
    }
}
```

---

## Rollback Strategies

### Important: Forward-Only Design

Aether Datafixers is designed for **forward migration only**. True rollback requires:

1. **Restore from backup** (recommended)
2. **Write compensating fixes** (complex, not recommended)

### Restore from Backup

**Full Restore:**

```bash
# PostgreSQL
pg_restore -d mydb backup_v100_20240115.dump

# MongoDB
mongorestore --db mydb ./backup_v100_20240115/mydb
```

**Selective Restore (specific records):**

```sql
-- PostgreSQL: Restore specific records from backup
-- 1. Create a temporary schema, then restore into it
--    (run from the shell): pg_restore -d mydb -n backup_restore backup_v100.dump
CREATE SCHEMA backup_restore;

-- 2. Copy specific records
INSERT INTO entities (id, type, data, data_version)
SELECT id, type, data, data_version
FROM backup_restore.entities
WHERE id IN ('record1', 'record2', 'record3')
ON CONFLICT (id) DO UPDATE
SET data = EXCLUDED.data, data_version = EXCLUDED.data_version;

-- 3. Clean up
DROP SCHEMA backup_restore CASCADE;
```

### Compensating Fixes (Advanced)

Only use when backup is unavailable and you understand the exact transformations to reverse.

```java
// Example: Reverse a field rename (name -> displayName back to name)
public class ReverseRenameDisplayNameFix extends SchemaDataFix {

    public ReverseRenameDisplayNameFix(Schema inputSchema, Schema outputSchema) {
        super("reverse_rename_display_name", inputSchema, outputSchema);
    }

    @Override
    protected TypeRewriteRule makeRule(Schema inputSchema, Schema outputSchema) {
        return Rules.renameField(
            TypeReferences.PLAYER,
            "displayName",  // current name
            "name"          // original name
        );
    }
}
```

**Warning:** Compensating fixes are error-prone. Prefer backup restoration.

---

## Error Recovery Workflows

### Workflow 1: FixException Recovery

```
1. Extract exception context
   └─ Get fixName, fromVersion, toVersion, typeReference

2. Enable DiagnosticContext
   └─ captureSnapshots(true), captureRuleDetails(true)

3. Reproduce with single record
   └─ Run migration on isolated test record

4. 
---

## Error Recovery Workflows

### Workflow 1: FixException Recovery

```
1. Extract exception context
   └─ Get fixName, fromVersion, toVersion, typeReference

2. Enable DiagnosticContext
   └─ captureSnapshots(true), captureRuleDetails(true)

3. Reproduce with single record
   └─ Run migration on isolated test record

4. Analyze MigrationReport
   └─ Check fix.beforeSnapshot vs fix.afterSnapshot
   └─ Find exact rule that failed

5. Identify root cause
   ├─ Missing field? → Check input data
   ├─ Wrong type? → Check codec/schema
   └─ Logic error? → Check fix implementation

6. Fix data or code
   ├─ Data issue → Clean/transform data
   └─ Code issue → Fix code and redeploy

7. Retry migration
   └─ Process failed records
```

### Workflow 2: DecodeException Recovery

```
1. Get path from exception
   └─ e.getPath() returns "player.inventory[0].item"

2. Navigate to problematic field
   └─ Use path to find exact location in data

3. Determine expected vs actual type
   └─ Check schema definition
   └─ Compare with actual data

4. Clean/transform data
   ├─ Missing field? → Add default value
   ├─ Wrong type? → Convert or remove
   └─ Malformed? → Parse and fix

5. Retry migration
```
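
Step 4 is usually a one-off cleanup outside the datafixer itself. As an illustration only (assuming records are stored as JSON and Jackson is on the classpath; `"air"` is a made-up domain default, not part of the library), this sketch adds a default for the missing `player.inventory[n].item` field from the example path above:

```java
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;

public final class InventoryItemCleanup {

    private static final ObjectMapper MAPPER = new ObjectMapper();

    /** Adds a default "item" to inventory slots that are missing one. */
    public static String addDefaultItem(String rawJson) throws JsonProcessingException {
        JsonNode root = MAPPER.readTree(rawJson);
        JsonNode inventory = root.path("player").path("inventory");

        // Nothing to clean if the path from DecodeException#getPath() is absent
        if (!inventory.isArray()) {
            return rawJson;
        }

        for (JsonNode slot : inventory) {
            if (slot.isObject() && !slot.has("item")) {
                ((ObjectNode) slot).put("item", "air"); // hypothetical default value
            }
        }
        return MAPPER.writeValueAsString(root);
    }
}
```

After the cleanup, retry the migration (step 5), for example through the retry service shown earlier.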
### Workflow 3: RegistryException Recovery

```
1. Check missing type/version
   └─ e.getMissingType() or e.getMissingVersion()

2. Verify bootstrap registration
   └─ Check DataFixerBootstrap implementation

3. Check version chain completeness
   └─ Ensure no gaps in version sequence

4. Add missing registrations
   └─ Register missing type or schema

5. Redeploy and retry
```

---

## Incident Response

### Severity Levels

| Level | Criteria               | Response Time | Escalation               |
|-------|------------------------|---------------|--------------------------|
| P1    | All migrations failing | Immediate     | On-call + Lead + Manager |
| P2    | > 5% failure rate      | 15 min        | On-call + Lead           |
| P3    | > 1% failure rate      | 1 hour        | On-call                  |
| P4    | Isolated failures      | 4 hours       | Next business day        |

### Incident Response Checklist

#### Initial Response (0-5 min)

```markdown
## Initial Response Checklist

- [ ] Acknowledge alert
- [ ] Check metrics dashboard
  - Current failure rate
  - Error type breakdown
  - Affected domains
- [ ] Review recent deployments (last 24h)
- [ ] Check actuator health endpoint
- [ ] Initial assessment posted to incident channel
```

#### Investigation (5-30 min)

```markdown
## Investigation Checklist

- [ ] Enable DEBUG logging for de.splatgames.aether.datafixers
- [ ] Capture sample failures (3-5 records)
- [ ] Enable DiagnosticContext on sample records
- [ ] Analyze MigrationReport for patterns
- [ ] Check database/storage health
- [ ] Check upstream service health
- [ ] Root cause hypothesis documented
```

#### Resolution

```markdown
## Resolution Checklist

- [ ] Root cause confirmed
- [ ] Fix identified
  - [ ] Data fix (transformation/cleanup)
  - [ ] Code fix (bug fix)
  - [ ] Configuration fix (settings change)
- [ ] Fix tested in staging
- [ ] Fix deployed to production
- [ ] Metrics returning to normal
- [ ] Failed records reprocessed
```

#### Post-Incident

```markdown
## Post-Incident Checklist

- [ ] Timeline documented
- [ ] Root cause analysis complete
- [ ] Post-mortem scheduled (within 48h)
- [ ] Runbook updated if needed
- [ ] Preventive measures identified
- [ ] Follow-up tasks created
```

---

## Data Validation After Recovery

### Consistency Checks

```sql
-- Check for version consistency
SELECT
    type,
    MIN(data_version) AS min_version,
    MAX(data_version) AS max_version,
    COUNT(*) AS count
FROM entities
GROUP BY type;

-- Check for orphaned references
SELECT e.id, e.type
FROM entities e
LEFT JOIN entities parent ON e.parent_id = parent.id
WHERE e.parent_id IS NOT NULL AND parent.id IS NULL;
```

### Version Alignment

```java
public void verifyVersionAlignment(int expectedVersion) {
    // Count records at wrong version
    long wrongVersion = entityRepository.countByDataVersionNot(expectedVersion);

    if (wrongVersion > 0) {
        logger.error("Found {} records at wrong version (expected {})",
            wrongVersion, expectedVersion);

        // List samples
        List<Entity> samples = entityRepository
            .findByDataVersionNot(expectedVersion, PageRequest.of(0, 10));

        for (Entity e : samples) {
            logger.error("  {} at version {} (expected {})",
                e.getId(), e.getDataVersion(), expectedVersion);
        }
    } else {
        logger.info("All records at expected version {}", expectedVersion);
    }
}
```

### Functional Verification

```java
@Test
void verifyMigrationSuccess() {
    // Load sample migrated records
    List<Entity> samples = entityRepository.findRandomSample(100);

    for (Entity entity : samples) {
        // Verify the record can be decoded at the current version
        assertDoesNotThrow(() -> {
            Typed<?> typed = fixer.decode(
                CURRENT_VERSION,
                entity.getTypeReference(),
                entity.getData()
            );
            assertNotNull(typed.getValue());
        }, "Failed to decode entity " + entity.getId());

        // Verify key fields are present
        Dynamic<?> data = entity.getData();
        assertTrue(data.get("id").asString().result().isPresent());
        assertTrue(data.get("_version").asNumber().result().isPresent());
    }
}
```

---

## Related

- [Error Scenarios](error-scenarios.md) — Exception handling reference
- [Debugging Guide](debugging-guide.md) — Diagnosing issues
- [Monitoring & Alerting](monitoring-alerting.md) — Detecting problems
- [Troubleshooting](../troubleshooting/index.md) — Quick fixes

diff --git a/docs/troubleshooting/index.md b/docs/troubleshooting/index.md
index 1012d62..98544fa 100644
--- a/docs/troubleshooting/index.md
+++ b/docs/troubleshooting/index.md
@@ -8,6 +8,12 @@ Solutions to common issues with Aether Datafixers.
 - [Debugging Tips](debugging-tips.md) — Strategies for finding issues
 - [FAQ](faq.md) — Frequently asked questions
+## Operations Runbook
+
+For production operations, incident response, and recovery procedures, see the [Operations Runbook](../operations/index.md).
+
+---
+
 ## Quick Fixes

 ### Migration Not Applied

From c6aac37cae8a0b7ee0a3dff66458d2f0ca1b946 Mon Sep 17 00:00:00 2001
From: Erik
Date: Sun, 1 Feb 2026 01:06:16 +0100
Subject: [PATCH 2/2] Introduce `structure.py` for visualizing and exporting folder structures with optional filters and exclusions. Includes support for tree views, colored output, and file pattern matching.
---
 scripts/structure.py | 104 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 scripts/structure.py

diff --git a/scripts/structure.py b/scripts/structure.py
new file mode 100644
index 0000000..fdf1d0f
--- /dev/null
+++ b/scripts/structure.py
@@ -0,0 +1,104 @@
import os
import fnmatch
import argparse

def apply_color(text, color_code, use_colors):
    return f"\033[{color_code}m{text}\033[0m" if use_colors else text

def print_structure(dir_path, depth, file_pattern, use_colors, excluded_folders):
    try:
        entries = os.listdir(dir_path)
    except OSError:
        # Unreadable directory (permissions, broken symlink); skip it
        return

    full_paths = [os.path.join(dir_path, entry) for entry in entries]
    # Directories first, then files, each group sorted alphabetically
    full_paths.sort(key=lambda p: (os.path.isfile(p), os.path.basename(p).lower()))

    for path in full_paths:
        name = os.path.basename(path)
        if name in {".", ".."} or name in excluded_folders:
            continue
        indent = " " * (depth * 4)
        if os.path.isdir(path):
            print(f"{indent}" + apply_color(f"📁 {name}", "34", use_colors))
            print_structure(path, depth + 1, file_pattern, use_colors, excluded_folders)
        elif fnmatch.fnmatch(name, file_pattern):
            print(f"{indent} " + apply_color(f"📄 {name}", "32", use_colors))

def print_dependency_tree(dir_path, depth=0, file_pattern="*.java", use_colors=True, excluded_folders=frozenset()):
    try:
        entries = os.listdir(dir_path)
    except OSError:
        return

    entries.sort()
    for entry in entries:
        full_path = os.path.join(dir_path, entry)
        if entry in excluded_folders or entry in {".", ".."}:
            continue
        indent = "| " * depth + "|-- "
        if os.path.isdir(full_path):
            print(indent + apply_color(f"{entry}/", "34", use_colors))
            print_dependency_tree(full_path, depth + 1, file_pattern, use_colors, excluded_folders)
        elif fnmatch.fnmatch(entry, file_pattern):
            print(indent + apply_color(entry, "32", use_colors))

def export_structure_to_file(output_file, mode, file_pattern, root_path, excluded_folders):
    with open(output_file, "w", encoding="utf-8") as f:
        def write_structure(dir_path, depth):
            try:
                entries = os.listdir(dir_path)
            except OSError:
                return

            entries.sort()
            for entry in entries:
                full_path = os.path.join(dir_path, entry)
                if entry in excluded_folders or entry in {".", ".."}:
                    continue
                indent = " " * depth
                if os.path.isdir(full_path):
                    f.write(f"{indent}{entry}/\n")
                    write_structure(full_path, depth + 1)
                elif fnmatch.fnmatch(entry, file_pattern):
                    f.write(f"{indent}{entry}\n")

        f.write("Package Structure:\n")
        # Both modes currently share the same indented export layout; the
        # mode parameter is kept so a tree-style export can be added later.
        f.write(f"ROOT ({root_path})\n")
        write_structure(root_path, 1)

def print_package_structure(root_path, mode="default", file_pattern="*.java", output_file=None, use_colors=True, excluded_folders=frozenset()):
    # isdir() is also False for paths that do not exist
    if not os.path.isdir(root_path):
        print("❌ Root directory does not exist: " + root_path)
        return

    print(f"\n📂 Package Structure ({root_path}):")
    if mode == "tree":
        print("|-- ROOT")
        print_dependency_tree(root_path, 1, file_pattern, use_colors, excluded_folders)
    else:
        print("📁 ROOT")
        print_structure(root_path, 1, file_pattern, use_colors, excluded_folders)

    if output_file:
        export_structure_to_file(output_file, mode, file_pattern, root_path, excluded_folders)
        print(f"\n✅ Structure exported to {output_file}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Print the package structure of a Java project.")
    parser.add_argument("root_path", 
type=str, help="Root directory of the project") + parser.add_argument("mode", nargs="?", choices=["default", "tree"], default="default", help="Output mode") + parser.add_argument("--filter", type=str, default="*.java", help="Filter files by wildcard pattern (e.g., '*.java')") + parser.add_argument("--output", type=str, help="Export output to a file") + parser.add_argument("--no-color", action="store_true", help="Disable colored output") + parser.add_argument("--exclude", type=str, nargs="*", default=["target", "build", ".git", "node_modules"], help="Folders to exclude") + + args = parser.parse_args() + print_package_structure(args.root_path, args.mode, args.filter, args.output, not args.no_color, set(args.exclude))
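
    # Example invocations (matching the arguments defined above; paths are
    # illustrative, assuming the script lives at scripts/structure.py):
    #
    #   python scripts/structure.py src/main/java
    #   python scripts/structure.py . tree --filter "*.md" --no-color
    #   python scripts/structure.py . --output structure.txt --exclude target build .git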