diff --git a/docs/operations/debugging-guide.md b/docs/operations/debugging-guide.md
new file mode 100644
index 0000000..ae8a0f6
--- /dev/null
+++ b/docs/operations/debugging-guide.md
@@ -0,0 +1,502 @@
+# Debugging Guide
+
+Systematic approach to diagnosing migration issues in Aether Datafixers.
+
+## Quick Reference
+
+| Need | Tool | Configuration |
+|--------------------|---------------------|----------------------------|
+| Basic logs | SLF4J | Set level to DEBUG |
+| Detailed trace | `DiagnosticContext` | Enable with options |
+| Per-fix snapshots | `DiagnosticOptions` | `captureSnapshots(true)` |
+| Rule-level detail | `DiagnosticOptions` | `captureRuleDetails(true)` |
+| Production minimal | `DiagnosticOptions` | `minimal()` preset |
+
+---
+
+## SLF4J Configuration
+
+### Default Logger Name
+
+The default logger name for Aether Datafixers is:
+
+```
+de.splatgames.aether.datafixers
+```
+
+### Using Slf4jDataFixerContext
+
+Route datafixer logs through your application's logging framework:
+
+```java
+import de.splatgames.aether.datafixers.core.fix.Slf4jDataFixerContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+// Option 1: Default logger name
+DataFixerContext context = new Slf4jDataFixerContext();
+
+// Option 2: Custom logger name
+DataFixerContext context = new Slf4jDataFixerContext("com.myapp.migrations");
+
+// Option 3: Existing logger
+Logger logger = LoggerFactory.getLogger(MyMigrationService.class);
+DataFixerContext context = new Slf4jDataFixerContext(logger);
+
+// Use in migration
+fixer.update(typeRef, data, fromVersion, toVersion, context);
+```
+
+### Logback Configuration
+
+```xml
+<configuration>
+
+    <!-- Console output for local development -->
+    <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender">
+        <encoder>
+            <pattern>%d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n</pattern>
+        </encoder>
+    </appender>
+
+    <!-- Daily-rolling file for migration logs, kept for 30 days -->
+    <appender name="MIGRATIONS" class="ch.qos.logback.core.rolling.RollingFileAppender">
+        <file>logs/migrations.log</file>
+        <rollingPolicy class="ch.qos.logback.core.rolling.TimeBasedRollingPolicy">
+            <fileNamePattern>logs/migrations.%d{yyyy-MM-dd}.log</fileNamePattern>
+            <maxHistory>30</maxHistory>
+        </rollingPolicy>
+        <encoder>
+            <pattern>%d{ISO8601} [%thread] %-5level %logger{36} - %msg%n%ex{full}</pattern>
+        </encoder>
+    </appender>
+
+    <!-- Datafixer logs at DEBUG, routed to console and file -->
+    <logger name="de.splatgames.aether.datafixers" level="DEBUG" additivity="false">
+        <appender-ref ref="CONSOLE"/>
+        <appender-ref ref="MIGRATIONS"/>
+    </logger>
+
+    <root level="INFO">
+        <appender-ref ref="CONSOLE"/>
+    </root>
+
+</configuration>
+```
+
+### Log4j2 Configuration
+
+```xml
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Equivalent to the Logback setup above -->
+<Configuration status="WARN">
+    <Appenders>
+        <Console name="Console" target="SYSTEM_OUT">
+            <PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
+        </Console>
+        <RollingFile name="Migrations" fileName="logs/migrations.log"
+                     filePattern="logs/migrations.%d{yyyy-MM-dd}.log">
+            <PatternLayout pattern="%d{ISO8601} [%t] %-5level %logger{36} - %msg%n%ex{full}"/>
+            <Policies>
+                <TimeBasedTriggeringPolicy/>
+            </Policies>
+            <DefaultRolloverStrategy max="30"/>
+        </RollingFile>
+    </Appenders>
+    <Loggers>
+        <Logger name="de.splatgames.aether.datafixers" level="DEBUG" additivity="false">
+            <AppenderRef ref="Console"/>
+            <AppenderRef ref="Migrations"/>
+        </Logger>
+        <Root level="INFO">
+            <AppenderRef ref="Console"/>
+        </Root>
+    </Loggers>
+</Configuration>
+```
+
+### Production vs Development Settings
+
+| Environment | Logger Level | Snapshots | Rule Details |
+|-------------|--------------|-----------|--------------|
+| Development | DEBUG | Yes | Yes |
+| Staging | INFO | Yes | No |
+| Production | WARN | No | No |
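+
+If the environment decides these settings, a small factory keeps the choice in one place. A minimal sketch, assuming a plain environment string (adapt to Spring profiles or environment variables as needed); the `DiagnosticOptions` calls are the ones documented later in this guide:
+
+```java
+// Sketch: map environment -> DiagnosticOptions. The environment string is
+// an assumption of this example, not part of the library API.
+public static DiagnosticOptions optionsFor(String environment) {
+    return switch (environment) {
+        case "development" -> DiagnosticOptions.defaults();   // snapshots + rule details
+        case "staging" -> DiagnosticOptions.builder()
+                .captureSnapshots(true)
+                .captureRuleDetails(false)
+                .build();
+        default -> DiagnosticOptions.minimal();                // production
+    };
+}
+```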
+
+---
+
+## MigrationReport Diagnostics
+
+### Enabling Diagnostics
+
+```java
+import de.splatgames.aether.datafixers.api.diagnostic.DiagnosticContext;
+import de.splatgames.aether.datafixers.api.diagnostic.DiagnosticOptions;
+import de.splatgames.aether.datafixers.api.diagnostic.MigrationReport;
+
+// Full diagnostics for debugging
+DiagnosticContext context = DiagnosticContext.create(
+ DiagnosticOptions.builder()
+ .captureSnapshots(true)
+ .captureRuleDetails(true)
+ .prettyPrintSnapshots(true)
+ .build()
+);
+
+// Run migration
+Dynamic<?> result = fixer.update(typeRef, data, fromVersion, toVersion, context);
+
+// Get the report
+MigrationReport report = context.getReport();
+```
+
+### Report Fields Reference
+
+| Field | Method | Description | When to Use |
+|-----------------|--------------------------|-------------------------|-----------------------|
+| Type | `type()` | TypeReference migrated | Always |
+| From Version | `fromVersion()` | Source version | Always |
+| To Version | `toVersion()` | Target version | Always |
+| Duration | `totalDuration()` | Total migration time | Performance issues |
+| Fix Count | `fixCount()` | Number of fixes applied | Verify migration path |
+| Fix Executions | `fixExecutions()` | Detailed fix list | Tracing issues |
+| Rule Count | `ruleApplicationCount()` | Total rules evaluated | Deep debugging |
+| Touched Types | `touchedTypes()` | All types processed | Complex migrations |
+| Warnings | `warnings()` | Non-fatal issues | Data quality |
+| Input Snapshot | `inputSnapshot()` | Data before migration | Transform debugging |
+| Output Snapshot | `outputSnapshot()` | Data after migration | Transform debugging |
+
+### Reading the Report
+
+```java
+MigrationReport report = context.getReport();
+
+// Basic summary
+System.out.println(report.toSummary());
+// Output: "Migration of 'player' from v100 to v200: 150ms, 5 fixes"
+
+// Detailed analysis
+System.out.println("Type: " + report.type().getId());
+System.out.println("Version: " + report.fromVersion().getVersion() +
+ " -> " + report.toVersion().getVersion());
+System.out.println("Duration: " + report.totalDuration().toMillis() + "ms");
+System.out.println("Fixes Applied: " + report.fixCount());
+System.out.println("Rules Evaluated: " + report.ruleApplicationCount());
+
+// Check for warnings
+if (report.hasWarnings()) {
+ System.out.println("Warnings:");
+ for (String warning : report.warnings()) {
+ System.out.println(" - " + warning);
+ }
+}
+
+// Snapshots (if enabled)
+report.inputSnapshot().ifPresent(snap ->
+ System.out.println("Input:\n" + snap));
+report.outputSnapshot().ifPresent(snap ->
+ System.out.println("Output:\n" + snap));
+```
+
+---
+
+## Tracing Fix Order
+
+### Understanding Fix Application
+
+Fixes are applied in version order, from `fromVersion` to `toVersion`. Each fix transforms data from one version to the next.
+
+```
+v100 ──[Fix A]──> v110 ──[Fix B]──> v150 ──[Fix C]──> v200
+```
+
+### Listing Applied Fixes
+
+```java
+MigrationReport report = context.getReport();
+
+System.out.println("Applied fixes in order:");
+for (FixExecution fix : report.fixExecutions()) {
+ System.out.println(fix.toSummary());
+ // Output: "rename_field (v100 -> v110): 5ms, 3 rules (2 matched)"
+}
+```
+
+### Detailed Fix Analysis
+
+```java
+for (FixExecution fix : report.fixExecutions()) {
+ System.out.println("\nFix: " + fix.fixName());
+ System.out.println(" Version: " + fix.fromVersion().getVersion() +
+ " -> " + fix.toVersion().getVersion());
+ System.out.println(" Duration: " + fix.durationMillis() + "ms");
+ System.out.println(" Rules: " + fix.ruleCount() +
+ " (" + fix.matchedRuleCount() + " matched)");
+
+ // Per-fix snapshots
+ fix.beforeSnapshotOpt().ifPresent(snap ->
+ System.out.println(" Before: " + snap));
+ fix.afterSnapshotOpt().ifPresent(snap ->
+ System.out.println(" After: " + snap));
+
+ // Rule-level details (if captureRuleDetails enabled)
+ for (RuleApplication rule : fix.ruleApplications()) {
+ System.out.println(" Rule: " + rule.ruleName() +
+ " on " + rule.typeName() +
+ " -> " + (rule.matched() ? "MATCHED" : "skipped") +
+ " (" + rule.durationMillis() + "ms)");
+ }
+}
+```
+
+### Finding a Specific Fix
+
+```java
+// Find by name
+Optional<FixExecution> fix = report.fixExecutions().stream()
+ .filter(f -> f.fixName().equals("rename_player_field"))
+ .findFirst();
+
+// Find by version
+Optional<FixExecution> fixAtVersion = report.fixExecutions().stream()
+ .filter(f -> f.fromVersion().getVersion() == 150)
+ .findFirst();
+```
+
+---
+
+## DiagnosticOptions
+
+### Available Presets
+
+| Preset | Snapshots | Rule Details | Pretty Print | Use Case |
+|--------------|-----------|--------------|--------------|-----------------------|
+| `defaults()` | Yes | Yes | Yes | Development debugging |
+| `minimal()` | No | No | No | Production monitoring |
+
+```java
+// Full diagnostics (development)
+DiagnosticContext devContext = DiagnosticContext.create(DiagnosticOptions.defaults());
+
+// Minimal overhead (production)
+DiagnosticContext prodContext = DiagnosticContext.create(DiagnosticOptions.minimal());
+
+// No diagnostics (maximum performance)
+fixer.update(typeRef, data, fromVersion, toVersion); // No context
+```
+
+### Custom Configuration
+
+```java
+DiagnosticOptions options = DiagnosticOptions.builder()
+ .captureSnapshots(true) // Enable before/after snapshots
+ .captureRuleDetails(true) // Enable per-rule tracking
+ .maxSnapshotLength(10000) // Truncate large snapshots (0 = unlimited)
+ .prettyPrintSnapshots(true) // Format JSON for readability
+ .build();
+```
+
+### Snapshot Truncation
+
+Large data structures are truncated to prevent memory issues:
+
+```java
+DiagnosticOptions options = DiagnosticOptions.builder()
+ .captureSnapshots(true)
+ .maxSnapshotLength(500) // Truncate to 500 characters
+ .build();
+
+// Truncated snapshots end with "... (truncated)"
+```
+
+---
+
+## Step-by-Step Debugging Workflow
+
+### 1. Reproduce the Issue
+
+```java
+// Isolate a single problematic record
+Dynamic<?> problematicData = loadProblemRecord();
+DataVersion fromVersion = new DataVersion(100);
+DataVersion toVersion = new DataVersion(200);
+```
+
+### 2. Enable Full Diagnostics
+
+```java
+DiagnosticContext context = DiagnosticContext.create(
+ DiagnosticOptions.builder()
+ .captureSnapshots(true)
+ .captureRuleDetails(true)
+ .prettyPrintSnapshots(true)
+ .build()
+);
+```
+
+### 3. Run Migration with Diagnostics
+
+```java
+try {
+ Dynamic<?> result = fixer.update(typeRef, problematicData, fromVersion, toVersion, context);
+ System.out.println("Migration succeeded");
+} catch (DataFixerException e) {
+ System.err.println("Migration failed: " + e.getMessage());
+} finally {
+ // Always get the report (even on failure)
+ MigrationReport report = context.getReport();
+ analyzeReport(report);
+}
+```
+
+### 4. Analyze the Report
+
+```java
+private void analyzeReport(MigrationReport report) {
+ System.out.println("\n=== Migration Report ===");
+ System.out.println(report.toSummary());
+
+ // Check warnings
+ if (report.hasWarnings()) {
+ System.out.println("\nWarnings:");
+ report.warnings().forEach(w -> System.out.println(" - " + w));
+ }
+
+ // Find slow fixes
+ System.out.println("\nFix timing:");
+ report.fixExecutions().stream()
+ .sorted((a, b) -> Long.compare(b.durationMillis(), a.durationMillis()))
+ .forEach(fix -> System.out.println(" " + fix.fixName() + ": " + fix.durationMillis() + "ms"));
+
+ // Check for unmatched rules
+ long unmatchedRules = report.fixExecutions().stream()
+ .flatMap(fix -> fix.ruleApplications().stream())
+ .filter(rule -> !rule.matched())
+ .count();
+ System.out.println("\nUnmatched rules: " + unmatchedRules);
+}
+```
+
+### 5. Examine Snapshots
+
+```java
+// Compare before/after for the failing fix
+for (FixExecution fix : report.fixExecutions()) {
+ System.out.println("\n--- " + fix.fixName() + " ---");
+
+ fix.beforeSnapshotOpt().ifPresent(before -> {
+ System.out.println("BEFORE:");
+ System.out.println(before);
+ });
+
+ fix.afterSnapshotOpt().ifPresent(after -> {
+ System.out.println("AFTER:");
+ System.out.println(after);
+ });
+}
+```
+
+---
+
+## Spring Boot Integration
+
+### Diagnostics via MigrationService
+
+```java
+@Autowired
+private MigrationService migrationService;
+
+public void migrateWithDiagnostics(TaggedDynamic<?> data) {
+ DiagnosticContext context = DiagnosticContext.create(DiagnosticOptions.defaults());
+
+ MigrationResult result = migrationService
+ .migrate(data)
+ .from(100)
+ .to(200)
+ .withContext(context)
+ .execute();
+
+ // Analyze diagnostics
+ MigrationReport report = context.getReport();
+ logReport(report);
+}
+
+private void logReport(MigrationReport report) {
+ logger.info("Migration: {}", report.toSummary());
+
+ for (String warning : report.warnings()) {
+ logger.warn(" Warning: {}", warning);
+ }
+
+ for (FixExecution fix : report.fixExecutions()) {
+ logger.debug(" Fix '{}': {}ms, {} rules ({} matched)",
+ fix.fixName(),
+ fix.durationMillis(),
+ fix.ruleCount(),
+ fix.matchedRuleCount());
+ }
+}
+```
+
+### Conditional Diagnostics in Production
+
+```java
+@Value("${aether.datafixers.diagnostics.enabled:false}")
+private boolean diagnosticsEnabled;
+
+public MigrationResult migrate(TaggedDynamic<?> data) {
+ MigrationService.MigrationBuilder builder = migrationService
+ .migrate(data)
+ .from(100)
+ .to(200);
+
+ if (diagnosticsEnabled) {
+ DiagnosticContext context = DiagnosticContext.create(DiagnosticOptions.minimal());
+ builder.withContext(context);
+ }
+
+ return builder.execute();
+}
+```
+
+---
+
+## Common Debugging Scenarios
+
+### Scenario: Migration Produces Wrong Output
+
+1. Enable snapshots
+2. Compare `inputSnapshot` with `outputSnapshot`
+3. Check each fix's before/after snapshots
+4. Identify which fix introduced the problem
+
+### Scenario: Migration is Slow
+
+1. Enable `DiagnosticOptions.minimal()` (low overhead)
+2. Check `report.totalDuration()`
+3. Sort fixes by duration
+4. Profile the slowest fix
+
+```java
+report.fixExecutions().stream()
+ .sorted((a, b) -> Long.compare(b.durationMillis(), a.durationMillis()))
+ .limit(5)
+ .forEach(fix -> System.out.println(fix.fixName() + ": " + fix.durationMillis() + "ms"));
+```
+
+### Scenario: Warning During Migration
+
+1. Check `report.warnings()`
+2. Enable rule details to see which rule emitted the warning
+3. Review the fix implementation for `context.warn()` calls
+
+---
+
+## Related
+
+- [Error Scenarios](error-scenarios.md) — Exception handling reference
+- [How to Use Diagnostics](../how-to/use-diagnostics.md) — Full API reference
+- [How to Debug Migrations](../how-to/debug-migrations.md) — Basic debugging tips
+- [Monitoring & Alerting](monitoring-alerting.md) — Production monitoring
diff --git a/docs/operations/error-scenarios.md b/docs/operations/error-scenarios.md
new file mode 100644
index 0000000..2613a62
--- /dev/null
+++ b/docs/operations/error-scenarios.md
@@ -0,0 +1,459 @@
+# Error Scenarios
+
+Detailed guide to handling exceptions in Aether Datafixers production environments.
+
+## Exception Hierarchy Quick Reference
+
+| Exception | Context Fields | Common Causes |
+|----------------------|--------------------------------------------------------|---------------------------|
+| `DataFixerException` | `context` | Base class for all errors |
+| `FixException` | `fixName`, `fromVersion`, `toVersion`, `typeReference` | Fix logic failure |
+| `DecodeException` | `typeReference`, `path` | Invalid input data |
+| `EncodeException` | `typeReference`, `failedValue` | Serialization failure |
+| `RegistryException` | `missingType`, `missingVersion` | Missing registration |
+
+All exceptions extend `RuntimeException` (unchecked) and are immutable/thread-safe.
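+
+Because every error shares the `DataFixerException` base class, a single catch block can handle any migration failure generically:
+
+```java
+try {
+    fixer.update(typeRef, data, fromVersion, toVersion);
+} catch (DataFixerException e) {
+    // Covers FixException, DecodeException, EncodeException, RegistryException
+    logger.error("Migration error: {} [{}]", e.getMessage(), e.getContext());
+}
+```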
+
+---
+
+## FixException
+
+Thrown when a DataFix fails to transform data from one version to another.
+
+### Context Fields
+
+| Field | Accessor | Description |
+|-----------------|----------------------|-----------------------------|
+| `fixName` | `getFixName()` | Name of the fix that failed |
+| `fromVersion` | `getFromVersion()` | Source version of migration |
+| `toVersion` | `getToVersion()` | Target version of migration |
+| `typeReference` | `getTypeReference()` | Type being transformed |
+
+### Context String Format
+
+```
+fix=rename_player_name, version=100->200, type=player
+```
+
+### Common Causes
+
+1. **Invalid input data** — Data doesn't match expected schema
+2. **Missing required field** — Fix expects a field that doesn't exist
+3. **Type mismatch** — Expected string but found number
+4. **Rule application failure** — TypeRewriteRule failed to apply
+5. **Null pointer** — Fix logic encountered null unexpectedly
+
+### Resolution Steps
+
+```java
+try {
+ Dynamic<?> result = fixer.update(typeRef, data, fromVersion, toVersion);
+} catch (FixException e) {
+ // 1. Log with full context
+ logger.error("Migration failed: {} [{}]", e.getMessage(), e.getContext());
+
+ // 2. Extract specific fields for analysis
+ if (e.getFixName() != null) {
+ logger.error(" Fix: {}", e.getFixName());
+ }
+ if (e.getFromVersion() != null && e.getToVersion() != null) {
+ logger.error(" Version: {} -> {}",
+ e.getFromVersion().getVersion(),
+ e.getToVersion().getVersion());
+ }
+ if (e.getTypeReference() != null) {
+ logger.error(" Type: {}", e.getTypeReference().getId());
+ }
+
+ // 3. Check root cause
+ if (e.getCause() != null) {
+ logger.error(" Root cause: {}", e.getCause().getMessage());
+ }
+}
+```
+
+### Diagnostic Integration
+
+```java
+// Use DiagnosticContext to capture snapshots
+DiagnosticContext ctx = DiagnosticContext.create(
+ DiagnosticOptions.builder()
+ .captureSnapshots(true)
+ .build()
+);
+
+try {
+ fixer.update(typeRef, data, fromVersion, toVersion, ctx);
+} catch (FixException e) {
+ MigrationReport report = ctx.getReport();
+
+ // Find which fix ran last (the one that failed)
+ List<FixExecution> fixes = report.fixExecutions();
+ if (!fixes.isEmpty()) {
+ FixExecution lastFix = fixes.get(fixes.size() - 1);
+ logger.error("Last fix before failure: {}", lastFix.fixName());
+ lastFix.beforeSnapshotOpt().ifPresent(snap ->
+ logger.error("Data before fix: {}", snap));
+ }
+}
+```
+
+---
+
+## DecodeException
+
+Thrown when deserialization from Dynamic to typed Java object fails.
+
+### Context Fields
+
+| Field | Accessor | Description |
+|-----------------|----------------------|-------------------------------------------|
+| `typeReference` | `getTypeReference()` | Type being decoded |
+| `path` | `getPath()` | Location in data structure (dot notation) |
+
+### Context String Format
+
+```
+type=player, path=inventory[0].item.name
+```
+
+### Path Notation
+
+The path uses dot notation with array indices:
+- `player.name` — Field `name` in object `player`
+- `inventory[0]` — First element of array `inventory`
+- `inventory[0].item.damage` — Nested field access
+
+### Common Causes
+
+1. **Missing required field** — Schema expects field that doesn't exist
+2. **Invalid field type** — Expected number, got string
+3. **Malformed data** — Corrupt or truncated input
+4. **Schema mismatch** — Data version doesn't match expected schema
+5. **Null value** — Non-nullable field is null
+
+### Resolution Steps
+
+```java
+try {
+ Typed<?> typed = fixer.decode(version, typeRef, dynamic);
+} catch (DecodeException e) {
+ logger.error("Decode failed: {} [{}]", e.getMessage(), e.getContext());
+
+ // Path tells you exactly where the problem is
+ if (e.getPath() != null) {
+ logger.error("Problem location: {}", e.getPath());
+
+ // Navigate to the problematic field
+ String[] pathParts = e.getPath().split("\\.");
+ Dynamic<?> current = dynamic;
+ for (String part : pathParts) {
+ if (part.contains("[")) {
+ // Array access
+ String fieldName = part.substring(0, part.indexOf('['));
+ int index = Integer.parseInt(
+ part.substring(part.indexOf('[') + 1, part.indexOf(']')));
+ current = current.get(fieldName).get(index);
+ } else {
+ current = current.get(part);
+ }
+ logger.debug(" {} = {}", part, current.getValue());
+ }
+ }
+}
+```
+
+### Data Inspection
+
+```java
+// Inspect the raw data at the failing path
+DecodeException e = ...;
+if (e.getPath() != null && e.getPath().contains(".")) {
+ String parentPath = e.getPath().substring(0, e.getPath().lastIndexOf('.'));
+ String fieldName = e.getPath().substring(e.getPath().lastIndexOf('.') + 1);
+
+ // navigateTo(...) is an application-side helper that walks the dot-notation path
+ logger.error("Parent object fields at '{}': {}", parentPath,
+ navigateTo(dynamic, parentPath).asMap().keySet());
+}
+```
+
+---
+
+## EncodeException
+
+Thrown when serialization from Java object to Dynamic representation fails.
+
+### Context Fields
+
+| Field | Accessor | Description |
+|-----------------|----------------------|-----------------------------|
+| `typeReference` | `getTypeReference()` | Type being encoded |
+| `failedValue` | `getFailedValue()` | Value that failed to encode |
+
+### Context String Format
+
+```
+type=player
+```
+
+### Common Causes
+
+1. **Null value** — Required field is null
+2. **Unsupported type** — Codec doesn't support the value type
+3. **Codec misconfiguration** — Encoder not properly set up
+4. **Circular reference** — Object graph contains cycles
+
+### Resolution Steps
+
+```java
+try {
+ Dynamic<?> encoded = fixer.encode(version, typeRef, value, ops);
+} catch (EncodeException e) {
+ logger.error("Encode failed: {} [{}]", e.getMessage(), e.getContext());
+
+ // Inspect the failed value (be careful with sensitive data)
+ if (e.getFailedValue() != null) {
+ logger.error("Failed value class: {}", e.getFailedValue().getClass().getName());
+ // Only log non-sensitive values
+ if (isSafeToLog(e.getFailedValue())) {
+ logger.error("Failed value: {}", e.getFailedValue());
+ }
+ }
+
+ // Check if it's a null issue
+ if (e.getCause() instanceof NullPointerException) {
+ logger.error("Null value encountered - check required fields");
+ }
+}
+```
+
+### Sensitive Data Warning
+
+The `failedValue` may contain sensitive information (passwords, tokens, PII). Always sanitize before logging:
+
+```java
+private boolean isSafeToLog(Object value) {
+ // Don't log objects that might contain sensitive data
+ if (value instanceof String) {
+ String str = (String) value;
+ return str.length() < 100 && !str.toLowerCase().contains("password");
+ }
+ return value instanceof Number || value instanceof Boolean;
+}
+```
+
+---
+
+## RegistryException
+
+Thrown when a registry lookup fails (type, schema, or codec not found).
+
+### Context Fields
+
+| Field | Accessor | Description |
+|------------------|-----------------------|-------------------------|
+| `missingType` | `getMissingType()` | TypeReference not found |
+| `missingVersion` | `getMissingVersion()` | DataVersion not found |
+
+### Context String Format
+
+```
+type=custom_entity, version=150
+```
+
+### Common Causes
+
+1. **Type not registered** — Forgot to register type in bootstrap
+2. **Schema not registered** — Version not registered in SchemaRegistry
+3. **Version gap** — No schema exists for intermediate version
+4. **Typo in TypeReference** — Type ID doesn't match registration
+
+### Resolution Steps
+
+```java
+try {
+ Schema schema = schemaRegistry.require(version);
+} catch (RegistryException e) {
+ logger.error("Registry lookup failed: {} [{}]", e.getMessage(), e.getContext());
+
+ if (e.getMissingVersion() != null) {
+ logger.error("Missing schema for version: {}",
+ e.getMissingVersion().getVersion());
+
+ // List available versions
+ logger.info("Available versions: {}",
+ schemaRegistry.getVersions().stream()
+ .map(v -> String.valueOf(v.getVersion()))
+ .collect(Collectors.joining(", ")));
+ }
+
+ if (e.getMissingType() != null) {
+ logger.error("Missing type: {}", e.getMissingType().getId());
+
+ // List registered types (at current version if available)
+ logger.info("Registered types: {}",
+ typeRegistry.getRegisteredTypes().stream()
+ .map(TypeReference::getId)
+ .collect(Collectors.joining(", ")));
+ }
+}
+```
+
+### Bootstrap Verification Checklist
+
+When encountering `RegistryException`, work through this checklist; a registration sketch follows the list:
+
+- [ ] Check `registerSchemas()` includes the required version
+- [ ] Check type is registered in the schema for that version
+- [ ] Verify no gaps in version chain (e.g., 100 -> 200 needs fixes, not just schemas)
+- [ ] Check for typos in TypeReference IDs
+- [ ] Verify bootstrap is loaded (not null)
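+
+As a sanity check, the registration side usually looks like the sketch below. Treat it as illustrative only: `registerSchemas()` is named in the checklist above, but the registry call signatures are assumptions — compare against your actual bootstrap:
+
+```java
+// Hypothetical sketch — the registry call signatures are assumptions, not the
+// library's confirmed API. The point: every version the migration path crosses
+// must have a schema, and every type must be registered in it.
+public class GameBootstrap implements DataFixerBootstrap {
+    @Override
+    public void registerSchemas(SchemaRegistry registry) {
+        registry.register(new DataVersion(100), this::schemaV100); // registers "player"
+        registry.register(new DataVersion(150), this::schemaV150);
+        registry.register(new DataVersion(200), this::schemaV200); // no gaps in the chain
+    }
+}
+```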
+
+---
+
+## Schema Mismatch Scenarios
+
+### Data Version Doesn't Match Expected
+
+**Symptom**: Migration produces unexpected results or fails silently.
+
+**Detection**:
+
+```java
+// Check data version before migration
+Optional<Integer> dataVersion = dynamic.get("_version").asNumber()
+ .map(Number::intValue);
+
+if (dataVersion.isEmpty()) {
+ logger.warn("Data has no version field - assuming oldest version");
+}
+
+int fromVersion = dataVersion.orElse(OLDEST_VERSION);
+if (fromVersion > currentVersion.getVersion()) {
+ throw new IllegalStateException(
+ "Data version " + fromVersion + " is newer than current " + currentVersion);
+}
+```
+
+### Type Structure Changed Without Fix
+
+**Symptom**: Fields missing or have wrong type after migration.
+
+**Detection**:
+
+```java
+// Use SchemaValidator to detect coverage gaps
+ValidationResult result = SchemaValidator.forBootstrap(bootstrap)
+ .validateFixCoverage()
+ .validate();
+
+if (!result.isValid()) {
+ for (String error : result.getErrors()) {
+ logger.error("Schema validation error: {}", error);
+ }
+}
+```
+
+**Resolution**: Write a DataFix to handle the schema change.
+
+---
+
+## Extracting Exception Context
+
+### Complete Context Extraction
+
+```java
+public class ExceptionAnalyzer {
+
+ public static void logException(DataFixerException e) {
+ StringBuilder sb = new StringBuilder();
+ sb.append("Exception: ").append(e.getClass().getSimpleName()).append("\n");
+ sb.append("Message: ").append(e.getMessage()).append("\n");
+
+ if (e.getContext() != null) {
+ sb.append("Context: ").append(e.getContext()).append("\n");
+ }
+
+ // Type-specific extraction
+ if (e instanceof FixException fix) {
+ if (fix.getFixName() != null) {
+ sb.append("Fix Name: ").append(fix.getFixName()).append("\n");
+ }
+ if (fix.getFromVersion() != null) {
+ sb.append("From Version: ").append(fix.getFromVersion().getVersion()).append("\n");
+ }
+ if (fix.getToVersion() != null) {
+ sb.append("To Version: ").append(fix.getToVersion().getVersion()).append("\n");
+ }
+ if (fix.getTypeReference() != null) {
+ sb.append("Type: ").append(fix.getTypeReference().getId()).append("\n");
+ }
+ } else if (e instanceof DecodeException decode) {
+ if (decode.getTypeReference() != null) {
+ sb.append("Type: ").append(decode.getTypeReference().getId()).append("\n");
+ }
+ if (decode.getPath() != null) {
+ sb.append("Path: ").append(decode.getPath()).append("\n");
+ }
+ } else if (e instanceof EncodeException encode) {
+ if (encode.getTypeReference() != null) {
+ sb.append("Type: ").append(encode.getTypeReference().getId()).append("\n");
+ }
+ // Be careful with failedValue - may contain sensitive data
+ } else if (e instanceof RegistryException registry) {
+ if (registry.getMissingType() != null) {
+ sb.append("Missing Type: ").append(registry.getMissingType().getId()).append("\n");
+ }
+ if (registry.getMissingVersion() != null) {
+ sb.append("Missing Version: ").append(registry.getMissingVersion().getVersion()).append("\n");
+ }
+ }
+
+ // Root cause chain
+ Throwable cause = e.getCause();
+ int depth = 0;
+ while (cause != null && depth < 5) {
+ sb.append("Caused by: ").append(cause.getClass().getSimpleName())
+ .append(": ").append(cause.getMessage()).append("\n");
+ cause = cause.getCause();
+ depth++;
+ }
+
+ System.err.println(sb);
+ }
+}
+```
+
+### Logging Pattern for Production
+
+```xml
+<!-- %ex{full} appends the complete stack trace to each error entry -->
+<pattern>%d{ISO8601} [%thread] %-5level %logger{36} - %msg%n%ex{full}</pattern>
+```
+
+```java
+// Structured logging with MDC
+import org.slf4j.MDC;
+
+try {
+ fixer.update(typeRef, data, fromVersion, toVersion);
+} catch (FixException e) {
+ MDC.put("fix_name", e.getFixName());
+ MDC.put("from_version", String.valueOf(e.getFromVersion()));
+ MDC.put("to_version", String.valueOf(e.getToVersion()));
+ MDC.put("type", e.getTypeReference() != null ? e.getTypeReference().getId() : "unknown");
+
+ logger.error("Migration failed", e);
+
+ MDC.clear();
+}
+```
+
+---
+
+## Related
+
+- [Debugging Guide](debugging-guide.md) — Systematic debugging approach
+- [Recovery Procedures](recovery-procedures.md) — How to recover from failures
+- [Common Errors](../troubleshooting/common-errors.md) — Quick error reference
+- [How to Use Diagnostics](../how-to/use-diagnostics.md) — Diagnostic API reference
diff --git a/docs/operations/index.md b/docs/operations/index.md
new file mode 100644
index 0000000..9de9d5e
--- /dev/null
+++ b/docs/operations/index.md
@@ -0,0 +1,128 @@
+# Operations Runbook
+
+Operational guidance for running Aether Datafixers in production environments. This runbook covers error handling, debugging, monitoring, and recovery procedures.
+
+## Quick Reference
+
+| Scenario | Document | Key Actions |
+|--------------------------------|-------------------------------------------------|---------------------------------------|
+| Migration fails with exception | [Error Scenarios](error-scenarios.md) | Extract context, check exception type |
+| Need detailed migration trace | [Debugging Guide](debugging-guide.md) | Enable `DiagnosticContext` |
+| Set up production monitoring | [Monitoring & Alerting](monitoring-alerting.md) | Configure Micrometer metrics |
+| Partial migration / data loss | [Recovery Procedures](recovery-procedures.md) | Restore from backup, retry |
+
+## Documentation Structure
+
+### [Error Scenarios](error-scenarios.md)
+
+Exception handling reference for production troubleshooting:
+- Exception hierarchy and context fields
+- `FixException` — Migration logic failures
+- `DecodeException` — Deserialization failures
+- `EncodeException` — Serialization failures
+- `RegistryException` — Missing type or version
+- Schema mismatch detection and resolution
+
+### [Debugging Guide](debugging-guide.md)
+
+Systematic approach to diagnosing migration issues:
+- SLF4J configuration (Logback, Log4j2)
+- Using `MigrationReport` for diagnostics
+- Tracing fix application order
+- Step-by-step debugging workflow
+
+### [Monitoring & Alerting](monitoring-alerting.md)
+
+Production monitoring setup:
+- Micrometer metrics reference
+- Recommended alert thresholds
+- Prometheus alerting rules
+- Grafana dashboard templates
+- Actuator health integration
+
+### [Recovery Procedures](recovery-procedures.md)
+
+Handling failures and data recovery:
+- Backup recommendations
+- Partial migration recovery
+- Rollback strategies
+- Incident response workflows
+
+---
+
+## Emergency Response
+
+### Migration Completely Failed
+
+1. **Check metrics** — Look at `aether.datafixers.migrations.failure` counter
+2. **Extract context** — See [Error Scenarios](error-scenarios.md#extracting-exception-context)
+3. **Enable DEBUG** — Set log level for `de.splatgames.aether.datafixers`
+4. **Capture diagnostics** — Use `DiagnosticContext` on a sample record
+5. **Restore if needed** — See [Recovery Procedures](recovery-procedures.md)
+
+### High Failure Rate Alert
+
+1. **Check error breakdown** — Query failures by `error_type` tag
+2. **Identify pattern** — Same exception? Same data version?
+3. **Isolate bad records** — Query for records at problematic version
+4. **Apply targeted fix** — Fix data or code, retry migration
+
+### Slow Migration Alert
+
+1. **Check version span** — Large version jumps take longer
+2. **Profile fixes** — Use `MigrationReport.fixExecutions()` timing (see the snippet below)
+3. **Check data size** — Large objects slow down processing
+4. **Consider batching** — Process in smaller batches
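+
+For step 2, sorting the report's fix executions by duration surfaces the hot spots (the same pattern used in the [Debugging Guide](debugging-guide.md)):
+
+```java
+// Top 5 slowest fixes from a captured MigrationReport
+report.fixExecutions().stream()
+    .sorted((a, b) -> Long.compare(b.durationMillis(), a.durationMillis()))
+    .limit(5)
+    .forEach(fix -> System.out.println(fix.fixName() + ": " + fix.durationMillis() + "ms"));
+```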
+
+---
+
+## Health Checks
+
+### Actuator Endpoints
+
+| Endpoint | Purpose |
+|------------------------|---------------------------|
+| `/actuator/health` | UP/DOWN status per domain |
+| `/actuator/info` | Version information |
+| `/actuator/datafixers` | Detailed domain status |
+| `/actuator/prometheus` | Metrics export |
+
+### Kubernetes Probes
+
+```yaml
+livenessProbe:
+ httpGet:
+ path: /actuator/health/liveness
+ port: 8080
+ initialDelaySeconds: 30
+ periodSeconds: 10
+
+readinessProbe:
+ httpGet:
+ path: /actuator/health/readiness
+ port: 8080
+ initialDelaySeconds: 10
+ periodSeconds: 5
+```
+
+---
+
+## Key Metrics Overview
+
+| Metric | Type | Alert Threshold |
+|---------------------------------------------|--------------|-----------------|
+| `aether.datafixers.migrations.success` | Counter | — |
+| `aether.datafixers.migrations.failure` | Counter | > 0 per minute |
+| `aether.datafixers.migrations.duration` | Timer | p99 > 1s |
+| `aether.datafixers.migrations.version.span` | Distribution | avg > 50 |
+
+See [Monitoring & Alerting](monitoring-alerting.md) for complete metrics reference.
+
+---
+
+## Related
+
+- [Troubleshooting](../troubleshooting/index.md) — Basic troubleshooting tips
+- [Spring Boot Metrics](../spring-boot/metrics.md) — Detailed metrics reference
+- [Spring Boot Actuator](../spring-boot/actuator.md) — Actuator integration
+- [How to Use Diagnostics](../how-to/use-diagnostics.md) — Diagnostic API reference
diff --git a/docs/operations/monitoring-alerting.md b/docs/operations/monitoring-alerting.md
new file mode 100644
index 0000000..2c0b9ae
--- /dev/null
+++ b/docs/operations/monitoring-alerting.md
@@ -0,0 +1,646 @@
+# Monitoring & Alerting
+
+Production monitoring setup for Aether Datafixers using Micrometer, Prometheus, and Grafana.
+
+## Metric Quick Reference
+
+| Metric | Type | Tags | Alert Threshold | Description |
+|---------------------------------------------|--------------|------------------------|-----------------|-----------------------|
+| `aether.datafixers.migrations.success` | Counter | `domain` | — | Successful migrations |
+| `aether.datafixers.migrations.failure` | Counter | `domain`, `error_type` | > 0/min | Failed migrations |
+| `aether.datafixers.migrations.duration` | Timer | `domain` | p99 > 1s | Execution time |
+| `aether.datafixers.migrations.version.span` | Distribution | `domain` | avg > 50 | Version distance |
+
+All metrics use the prefix `aether.datafixers.migrations`.
+
+---
+
+## Metric Details
+
+### Success Counter
+
+Tracks total successful migrations per domain.
+
+**Prometheus format:**
+```
+aether_datafixers_migrations_success_total{domain="game"} 1234
+```
+
+**Use cases:**
+- Calculate success rate (see the query below)
+- Monitor throughput
+- Track migration activity
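+
+For example, the hourly success rate comes straight out of the two counters:
+
+```promql
+sum(rate(aether_datafixers_migrations_success_total[1h]))
+/ (
+  sum(rate(aether_datafixers_migrations_success_total[1h]))
+  + sum(rate(aether_datafixers_migrations_failure_total[1h]))
+)
+```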
+
+### Failure Counter
+
+Tracks failed migrations with error type breakdown.
+
+**Prometheus format:**
+```
+aether_datafixers_migrations_failure_total{domain="game",error_type="FixException"} 5
+aether_datafixers_migrations_failure_total{domain="game",error_type="DecodeException"} 2
+```
+
+**Tags:**
+- `domain` — DataFixer domain name
+- `error_type` — Exception class simple name
+
+### Duration Timer
+
+Tracks execution time distribution (includes both success and failure).
+
+**Prometheus format:**
+```
+aether_datafixers_migrations_duration_seconds_count{domain="game"} 1239
+aether_datafixers_migrations_duration_seconds_sum{domain="game"} 185.7
+aether_datafixers_migrations_duration_seconds_bucket{domain="game",le="0.01"} 500
+aether_datafixers_migrations_duration_seconds_bucket{domain="game",le="0.1"} 1100
+aether_datafixers_migrations_duration_seconds_bucket{domain="game",le="1.0"} 1230
+aether_datafixers_migrations_duration_seconds_bucket{domain="game",le="+Inf"} 1239
+```
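+
+Percentiles are derived from the histogram buckets with `histogram_quantile` — for example, p95 per domain (the same expression the alert rules below use):
+
+```promql
+histogram_quantile(0.95,
+  sum(rate(aether_datafixers_migrations_duration_seconds_bucket[5m])) by (le, domain)
+)
+```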
+
+### Version Span Distribution
+
+Tracks the distance between source and target versions (indicates data age).
+
+**Prometheus format:**
+```
+aether_datafixers_migrations_version_span_count{domain="game"} 1234
+aether_datafixers_migrations_version_span_sum{domain="game"} 45600
+aether_datafixers_migrations_version_span_max{domain="game"} 150
+```
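+
+The average span is the ratio of `_sum` to `_count` over a window — the same expression the informational alert evaluates:
+
+```promql
+rate(aether_datafixers_migrations_version_span_sum[1h])
+/ rate(aether_datafixers_migrations_version_span_count[1h])
+```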
+
+---
+
+## Recommended Alert Thresholds
+
+### Critical Alerts (Page On-Call)
+
+| Alert | Condition | Duration | Action |
+|------------------------|---------------------|----------|-------------------------|
+| High Failure Rate | > 5% failures | 5m | Immediate investigation |
+| All Migrations Failing | 100% failure rate | 2m | Emergency response |
+| Service Down | No metrics reported | 5m | Check service health |
+
+### Warning Alerts (Notify Team)
+
+| Alert | Condition | Duration | Action |
+|-----------------------|---------------|----------|-----------------------------------|
+| Elevated Failure Rate | > 1% failures | 5m | Investigate during business hours |
+| Slow Migrations | p95 > 1s | 5m | Performance review |
+| Very Slow Migrations | p99 > 5s | 5m | Profile and optimize |
+
+### Informational Alerts (Dashboard/Log)
+
+| Alert | Condition | Duration | Action |
+|--------------------|----------------|----------|-----------------------------|
+| Large Version Span | avg span > 50 | 1h | Review data freshness |
+| Very Large Span | max span > 200 | 1h | Identify stale data sources |
+| No Activity | 0 migrations | 1h | Verify expected behavior |
+
+---
+
+## Prometheus Alert Rules
+
+### Complete Alert Configuration
+
+```yaml
+groups:
+ - name: aether-datafixers-critical
+ rules:
+ # High failure rate - immediate attention
+ - alert: DataFixerHighFailureRate
+ expr: |
+ (
+ sum(rate(aether_datafixers_migrations_failure_total[5m])) by (domain)
+ / (
+ sum(rate(aether_datafixers_migrations_success_total[5m])) by (domain)
+ + sum(rate(aether_datafixers_migrations_failure_total[5m])) by (domain)
+ )
+ ) > 0.05
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Critical: DataFixer failure rate > 5% in domain {{ $labels.domain }}"
+ description: "Failure rate is {{ $value | humanizePercentage }}. Check error logs and metrics."
+ runbook_url: "https://docs.example.com/runbooks/datafixer-high-failure"
+
+ # All migrations failing
+ - alert: DataFixerAllFailing
+ expr: |
+ sum(rate(aether_datafixers_migrations_success_total[2m])) by (domain) == 0
+ and sum(rate(aether_datafixers_migrations_failure_total[2m])) by (domain) > 0
+ for: 2m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Critical: All migrations failing in domain {{ $labels.domain }}"
+ description: "Zero successful migrations with active failures. Immediate attention required."
+ runbook_url: "https://docs.example.com/runbooks/datafixer-total-failure"
+
+ - name: aether-datafixers-warning
+ rules:
+ # Elevated failure rate
+ - alert: DataFixerElevatedFailureRate
+ expr: |
+ (
+ sum(rate(aether_datafixers_migrations_failure_total[5m])) by (domain)
+ / (
+ sum(rate(aether_datafixers_migrations_success_total[5m])) by (domain)
+ + sum(rate(aether_datafixers_migrations_failure_total[5m])) by (domain)
+ )
+ ) > 0.01
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Warning: DataFixer failure rate > 1% in domain {{ $labels.domain }}"
+ description: "Failure rate is {{ $value | humanizePercentage }}. Investigate soon."
+
+ # Slow migrations (p95)
+ - alert: DataFixerSlowMigrations
+ expr: |
+ histogram_quantile(0.95,
+ sum(rate(aether_datafixers_migrations_duration_seconds_bucket[5m])) by (le, domain)
+ ) > 1
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Warning: Slow migrations in domain {{ $labels.domain }}"
+ description: "p95 migration duration is {{ $value | humanizeDuration }}. Review performance."
+
+ # Very slow migrations (p99)
+ - alert: DataFixerVerySlowMigrations
+ expr: |
+ histogram_quantile(0.99,
+ sum(rate(aether_datafixers_migrations_duration_seconds_bucket[5m])) by (le, domain)
+ ) > 5
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Warning: Very slow migrations in domain {{ $labels.domain }}"
+ description: "p99 migration duration is {{ $value | humanizeDuration }}. Profile and optimize."
+
+ - name: aether-datafixers-info
+ rules:
+ # Large version span
+ - alert: DataFixerLargeVersionSpan
+ expr: |
+ (
+ rate(aether_datafixers_migrations_version_span_sum[1h])
+ / rate(aether_datafixers_migrations_version_span_count[1h])
+ ) > 50
+ for: 1h
+ labels:
+ severity: info
+ annotations:
+ summary: "Info: Large version span in domain {{ $labels.domain }}"
+ description: "Average span is {{ $value }} versions. Consider data freshness review."
+
+ # No migration activity
+ - alert: DataFixerNoActivity
+ expr: |
+ sum(rate(aether_datafixers_migrations_success_total[1h])) by (domain) == 0
+ and sum(rate(aether_datafixers_migrations_failure_total[1h])) by (domain) == 0
+ for: 1h
+ labels:
+ severity: info
+ annotations:
+ summary: "Info: No migration activity in domain {{ $labels.domain }}"
+ description: "No migrations in the last hour. Verify this is expected."
+```
+
+---
+
+## Grafana Dashboard
+
+### Complete Dashboard JSON
+
+```json
+{
+ "title": "Aether DataFixers Operations",
+ "uid": "aether-datafixers-ops",
+ "tags": ["aether", "datafixers", "migrations"],
+ "timezone": "browser",
+ "refresh": "30s",
+ "panels": [
+ {
+ "title": "Migration Rate",
+ "type": "timeseries",
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
+ "targets": [
+ {
+ "expr": "sum(rate(aether_datafixers_migrations_success_total[5m])) by (domain)",
+ "legendFormat": "Success ({{domain}})"
+ },
+ {
+ "expr": "sum(rate(aether_datafixers_migrations_failure_total[5m])) by (domain)",
+ "legendFormat": "Failure ({{domain}})"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "ops"
+ }
+ }
+ },
+ {
+ "title": "Success Rate",
+ "type": "gauge",
+ "gridPos": {"h": 8, "w": 6, "x": 12, "y": 0},
+ "targets": [
+ {
+ "expr": "(sum(rate(aether_datafixers_migrations_success_total[1h])) / (sum(rate(aether_datafixers_migrations_success_total[1h])) + sum(rate(aether_datafixers_migrations_failure_total[1h])))) * 100"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "percent",
+ "min": 0,
+ "max": 100,
+ "thresholds": {
+ "steps": [
+ {"color": "red", "value": 0},
+ {"color": "yellow", "value": 95},
+ {"color": "green", "value": 99}
+ ]
+ }
+ }
+ }
+ },
+ {
+ "title": "Current Error Rate",
+ "type": "stat",
+ "gridPos": {"h": 8, "w": 6, "x": 18, "y": 0},
+ "targets": [
+ {
+ "expr": "sum(rate(aether_datafixers_migrations_failure_total[5m])) / (sum(rate(aether_datafixers_migrations_success_total[5m])) + sum(rate(aether_datafixers_migrations_failure_total[5m]))) * 100"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "percent",
+ "thresholds": {
+ "steps": [
+ {"color": "green", "value": 0},
+ {"color": "yellow", "value": 1},
+ {"color": "red", "value": 5}
+ ]
+ }
+ }
+ }
+ },
+ {
+ "title": "Migration Duration Percentiles",
+ "type": "timeseries",
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
+ "targets": [
+ {
+ "expr": "histogram_quantile(0.50, sum(rate(aether_datafixers_migrations_duration_seconds_bucket[5m])) by (le, domain))",
+ "legendFormat": "p50 ({{domain}})"
+ },
+ {
+ "expr": "histogram_quantile(0.95, sum(rate(aether_datafixers_migrations_duration_seconds_bucket[5m])) by (le, domain))",
+ "legendFormat": "p95 ({{domain}})"
+ },
+ {
+ "expr": "histogram_quantile(0.99, sum(rate(aether_datafixers_migrations_duration_seconds_bucket[5m])) by (le, domain))",
+ "legendFormat": "p99 ({{domain}})"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "s"
+ }
+ }
+ },
+ {
+ "title": "Version Span Distribution",
+ "type": "timeseries",
+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
+ "targets": [
+ {
+ "expr": "rate(aether_datafixers_migrations_version_span_sum[5m]) / rate(aether_datafixers_migrations_version_span_count[5m])",
+ "legendFormat": "Avg Span ({{domain}})"
+ },
+ {
+ "expr": "aether_datafixers_migrations_version_span_max",
+ "legendFormat": "Max Span ({{domain}})"
+ }
+ ]
+ },
+ {
+ "title": "Failures by Error Type",
+ "type": "piechart",
+ "gridPos": {"h": 8, "w": 8, "x": 0, "y": 16},
+ "targets": [
+ {
+ "expr": "sum(increase(aether_datafixers_migrations_failure_total[24h])) by (error_type)",
+ "legendFormat": "{{error_type}}"
+ }
+ ]
+ },
+ {
+ "title": "Failure Rate by Domain",
+ "type": "timeseries",
+ "gridPos": {"h": 8, "w": 8, "x": 8, "y": 16},
+ "targets": [
+ {
+ "expr": "sum(rate(aether_datafixers_migrations_failure_total[5m])) by (domain)",
+ "legendFormat": "{{domain}}"
+ }
+ ],
+ "fieldConfig": {
+ "defaults": {
+ "unit": "ops"
+ }
+ }
+ },
+ {
+ "title": "Recent Errors (Last 1h)",
+ "type": "table",
+ "gridPos": {"h": 8, "w": 8, "x": 16, "y": 16},
+ "targets": [
+ {
+ "expr": "sum(increase(aether_datafixers_migrations_failure_total[1h])) by (domain, error_type) > 0",
+ "format": "table",
+ "instant": true
+ }
+ ],
+ "transformations": [
+ {
+ "id": "organize",
+ "options": {
+ "renameByName": {
+ "domain": "Domain",
+ "error_type": "Error Type",
+ "Value": "Count"
+ }
+ }
+ }
+ ]
+ }
+ ]
+}
+```
+
+### Dashboard Import
+
+1. Go to Grafana > Dashboards > Import
+2. Paste the JSON above
+3. Select your Prometheus data source
+4. Click Import
+
+---
+
+## Actuator Integration
+
+### Health Endpoint
+
+The DataFixer health indicator reports UP/DOWN status per domain.
+
+**Request:**
+```bash
+curl http://localhost:8080/actuator/health
+```
+
+**Response:**
+```json
+{
+ "status": "UP",
+ "components": {
+ "datafixer": {
+ "status": "UP",
+ "details": {
+ "totalDomains": 2,
+ "default.status": "UP",
+ "default.currentVersion": 200,
+ "game.status": "UP",
+ "game.currentVersion": 150
+ }
+ }
+ }
+}
+```
+
+### Custom Endpoint
+
+Get detailed DataFixer information at `/actuator/datafixers`:
+
+```bash
+curl http://localhost:8080/actuator/datafixers
+```
+
+```json
+{
+ "domains": {
+ "default": {
+ "currentVersion": 200,
+ "status": "UP"
+ },
+ "game": {
+ "currentVersion": 150,
+ "status": "UP"
+ }
+ }
+}
+```
+
+### Kubernetes Probes
+
+```yaml
+apiVersion: v1
+kind: Pod
+spec:
+ containers:
+ - name: app
+ livenessProbe:
+ httpGet:
+ path: /actuator/health/liveness
+ port: 8080
+ initialDelaySeconds: 30
+ periodSeconds: 10
+ failureThreshold: 3
+
+ readinessProbe:
+ httpGet:
+ path: /actuator/health/readiness
+ port: 8080
+ initialDelaySeconds: 10
+ periodSeconds: 5
+ failureThreshold: 3
+
+ startupProbe:
+ httpGet:
+ path: /actuator/health
+ port: 8080
+ initialDelaySeconds: 5
+ periodSeconds: 5
+ failureThreshold: 30
+```
+
+### Prometheus Scraping Actuator
+
+```yaml
+# prometheus.yml
+scrape_configs:
+ - job_name: 'spring-actuator'
+ metrics_path: '/actuator/prometheus'
+ static_configs:
+ - targets: ['app:8080']
+ scrape_interval: 15s
+```
+
+---
+
+## Multi-Domain Monitoring
+
+### Per-Domain Dashboards
+
+Use Grafana variables to filter by domain:
+
+```json
+{
+ "templating": {
+ "list": [
+ {
+ "name": "domain",
+ "type": "query",
+ "query": "label_values(aether_datafixers_migrations_success_total, domain)",
+ "refresh": 2
+ }
+ ]
+ }
+}
+```
+
+Then use `domain=~"$domain"` in queries:
+
+```promql
+sum(rate(aether_datafixers_migrations_success_total{domain=~"$domain"}[5m]))
+```
+
+### Cross-Domain Comparison
+
+Compare performance across domains:
+
+```promql
+# Success rate by domain
+(
+ sum(rate(aether_datafixers_migrations_success_total[1h])) by (domain)
+ / (
+ sum(rate(aether_datafixers_migrations_success_total[1h])) by (domain)
+ + sum(rate(aether_datafixers_migrations_failure_total[1h])) by (domain)
+ )
+) * 100
+```
+
+---
+
+## PagerDuty Integration
+
+### Alertmanager Configuration
+
+```yaml
+# alertmanager.yml
+global:
+ pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'
+
+route:
+ receiver: 'default'
+ routes:
+ - match:
+ severity: critical
+ receiver: 'pagerduty-critical'
+ - match:
+ severity: warning
+ receiver: 'slack-warning'
+
+receivers:
+ - name: 'default'
+ email_configs:
+ - to: 'team@example.com'
+
+ - name: 'pagerduty-critical'
+ pagerduty_configs:
+ - service_key: '<your-pagerduty-integration-key>' # placeholder
+ description: '{{ .CommonAnnotations.summary }}'
+ details:
+ runbook: '{{ .CommonAnnotations.runbook_url }}'
+ domain: '{{ .CommonLabels.domain }}'
+
+ - name: 'slack-warning'
+ slack_configs:
+ - api_url: '<your-slack-webhook-url>' # placeholder
+ channel: '#alerts'
+ title: '{{ .CommonAnnotations.summary }}'
+ text: '{{ .CommonAnnotations.description }}'
+```
+
+---
+
+## Application Configuration
+
+### Enable Metrics
+
+```yaml
+# application.yml
+aether:
+ datafixers:
+ enabled: true
+ metrics:
+ timing: true
+ counting: true
+
+management:
+ endpoints:
+ web:
+ exposure:
+ include: health, info, prometheus, datafixers
+ metrics:
+ export:
+ prometheus:
+ enabled: true
+ endpoint:
+ health:
+ show-details: always
+```
+
+### Custom Metrics Extension
+
+```java
+@Component
+public class ExtendedMigrationMetrics extends MigrationMetrics {
+
+ private final Counter largeSpanCounter;
+
+ public ExtendedMigrationMetrics(MeterRegistry registry) {
+ super(registry);
+ this.largeSpanCounter = Counter.builder("aether.datafixers.migrations.large_span")
+ .description("Migrations with version span > 100")
+ .register(registry);
+ }
+
+ @Override
+ public void recordSuccess(String domain, int fromVersion, int toVersion, Duration duration) {
+ super.recordSuccess(domain, fromVersion, toVersion, duration);
+
+ // Track large version spans separately
+ if (Math.abs(toVersion - fromVersion) > 100) {
+ largeSpanCounter.increment();
+ }
+ }
+}
+```
+
+---
+
+## Related
+
+- [Spring Boot Metrics](../spring-boot/metrics.md) — Complete metrics reference
+- [Spring Boot Actuator](../spring-boot/actuator.md) — Actuator integration
+- [Debugging Guide](debugging-guide.md) — Diagnosing issues
+- [Recovery Procedures](recovery-procedures.md) — Responding to alerts
diff --git a/docs/operations/recovery-procedures.md b/docs/operations/recovery-procedures.md
new file mode 100644
index 0000000..43b5eac
--- /dev/null
+++ b/docs/operations/recovery-procedures.md
@@ -0,0 +1,583 @@
+# Recovery Procedures
+
+How to recover from migration failures, data issues, and production incidents.
+
+## Quick Reference
+
+| Scenario | Procedure | Complexity |
+|--------------------------|------------------------|------------|
+| Single record failure | Retry with diagnostics | Low |
+| Batch failure (< 5%) | Isolate and retry | Medium |
+| High failure rate (> 5%) | Stop, investigate, fix | High |
+| Data corruption | Restore from backup | High |
+| Schema mismatch | Version alignment | Medium |
+
+---
+
+## Backup Recommendations
+
+### Pre-Migration Backup Strategy
+
+**Before major version bumps:**
+1. Create full database backup
+2. Verify backup integrity (test restore)
+3. Document current schema version
+4. Keep backup for rollback window (e.g., 7 days)
+
+**Before routine operations:**
+1. Enable point-in-time recovery
+2. Verify incremental backups are current
+3. Document migration batch parameters
+
+### Backup Checklist
+
+```markdown
+## Pre-Migration Backup Checklist
+
+- [ ] Database backup completed
+- [ ] Backup verified (test restore on staging)
+- [ ] Backup retention policy confirmed
+- [ ] Schema version documented in backup metadata
+- [ ] Rollback procedure documented
+- [ ] Team notified of migration window
+```
+
+### Database Backup Patterns
+
+**PostgreSQL:**
+```bash
+# Full backup before migration
+pg_dump -Fc -f backup_v100_$(date +%Y%m%d).dump mydb
+
+# With version in filename
+pg_dump -Fc -f backup_schema_v100_to_v200_$(date +%Y%m%d_%H%M%S).dump mydb
+```
+
+**MongoDB:**
+```bash
+# Full backup
+mongodump --db mydb --out ./backup_v100_$(date +%Y%m%d)
+
+# Specific collection
+mongodump --db mydb --collection players --out ./backup_players_v100
+```
+
+### Application-Level Snapshots
+
+```java
+// Create pre-migration snapshot for critical records
+public void createMigrationSnapshot(List<String> recordIds) throws IOException {
+ Path snapshotDir = Path.of("snapshots",
+ "migration_" + System.currentTimeMillis());
+ Files.createDirectories(snapshotDir);
+
+ for (String id : recordIds) {
+ Dynamic<?> data = loadRecord(id);
+ Path file = snapshotDir.resolve(id + ".json");
+ Files.writeString(file, serializeToJson(data));
+ }
+
+ logger.info("Created snapshot of {} records at {}",
+ recordIds.size(), snapshotDir);
+}
+```
+
+---
+
+## Partial Migration Recovery
+
+### Detecting Partial Migrations
+
+**Symptoms:**
+- Some records at old version, some at new version
+- Inconsistent data across related entities
+- `aether.datafixers.migrations.failure` spike followed by recovery
+
+**Detection Query (SQL):**
+```sql
+-- Find version distribution
+SELECT data_version, COUNT(*) as count
+FROM entities
+WHERE type = 'player'
+GROUP BY data_version
+ORDER BY data_version;
+
+-- Find records still at old version
+SELECT id, data_version, updated_at
+FROM entities
+WHERE type = 'player'
+ AND data_version < 200
+ORDER BY updated_at DESC;
+```
+
+**Detection Query (MongoDB):**
+```javascript
+// Version distribution
+db.entities.aggregate([
+ { $match: { type: "player" } },
+ { $group: { _id: "$dataVersion", count: { $sum: 1 } } },
+ { $sort: { _id: 1 } }
+]);
+
+// Records at old version
+db.entities.find({
+ type: "player",
+ dataVersion: { $lt: 200 }
+}).sort({ updatedAt: -1 });
+```
+
+### Recovery Option 1: Retry Failed Records
+
+Best for: Small number of failures, transient errors.
+
+```java
+public class MigrationRetryService {
+
+ private final AetherDataFixer fixer;
+ private final Logger logger = LoggerFactory.getLogger(getClass());
+
+ public void retryFailedRecords(List<String> failedIds, int targetVersion) {
+ int success = 0;
+ int failed = 0;
+
+ for (String id : failedIds) {
+ try {
+ // Load record
+ Dynamic<?> data = loadRecord(id);
+ int currentVersion = extractVersion(data);
+
+ // Skip if already migrated
+ if (currentVersion >= targetVersion) {
+ logger.info("Record {} already at version {}", id, currentVersion);
+ continue;
+ }
+
+ // Enable diagnostics for retry
+ DiagnosticContext ctx = DiagnosticContext.create(
+ DiagnosticOptions.builder()
+ .captureSnapshots(true)
+ .build()
+ );
+
+ // Retry migration
+ Dynamic<?> result = fixer.update(
+ TypeReferences.PLAYER,
+ data,
+ new DataVersion(currentVersion),
+ new DataVersion(targetVersion),
+ ctx
+ );
+
+ // Save result
+ saveRecord(id, result);
+ success++;
+
+ } catch (DataFixerException e) {
+ failed++;
+ logger.error("Retry failed for record {}: {} [{}]",
+ id, e.getMessage(), e.getContext());
+ }
+ }
+
+ logger.info("Retry complete: {} success, {} failed", success, failed);
+ }
+}
+```
+
+### Recovery Option 2: Isolate and Skip
+
+Best for: Specific data patterns causing failures.
+
+```java
+public class MigrationIsolationService {
+
+ public void migrateWithIsolation(Stream<Dynamic<?>> records, int targetVersion) {
+ List quarantined = new ArrayList<>();
+
+ records.forEach(data -> {
+ String id = extractId(data);
+ try {
+ Dynamic<?> result = fixer.update(
+ TypeReferences.PLAYER,
+ data,
+ new DataVersion(extractVersion(data)),
+ new DataVersion(targetVersion)
+ );
+ saveRecord(id, result);
+ } catch (DataFixerException e) {
+ // Quarantine failed record
+ quarantined.add(id);
+ saveToQuarantine(id, data, e);
+ logger.warn("Quarantined record {}: {}", id, e.getMessage());
+ }
+ });
+
+ if (!quarantined.isEmpty()) {
+ logger.warn("Migration complete with {} quarantined records", quarantined.size());
+ notifyTeam(quarantined);
+ }
+ }
+
+ private void saveToQuarantine(String id, Dynamic<?> data, DataFixerException e) {
+ // Save to quarantine table/collection for manual review
+ QuarantineRecord record = new QuarantineRecord(
+ id,
+ serializeToJson(data),
+ e.getClass().getSimpleName(),
+ e.getMessage(),
+ e.getContext(),
+ Instant.now()
+ );
+ quarantineRepository.save(record);
+ }
+}
+```
+
+### Recovery Option 3: Manual Intervention
+
+Best for: Complex data issues requiring human judgment.
+
+```java
+public class ManualRecoveryService {
+
+ public void exportForManualReview(List<String> recordIds) throws IOException {
+ Path exportDir = Path.of("manual_review",
+ LocalDate.now().toString());
+ Files.createDirectories(exportDir);
+
+ for (String id : recordIds) {
+ Dynamic<?> data = loadRecord(id);
+
+ // Export with metadata
+ Map<String, Object> export = new LinkedHashMap<>();
+ export.put("id", id);
+ export.put("currentVersion", extractVersion(data));
+ export.put("targetVersion", CURRENT_VERSION);
+ export.put("data", data.getValue());
+ export.put("exportedAt", Instant.now().toString());
+
+ Path file = exportDir.resolve(id + ".json");
+ Files.writeString(file, prettyJson(export));
+ }
+
+ logger.info("Exported {} records to {} for manual review",
+ recordIds.size(), exportDir);
+ }
+
+ public void importManualFixes(Path fixesDir) throws IOException {
+ try (Stream<Path> files = Files.list(fixesDir)) {
+ files.filter(p -> p.toString().endsWith(".json"))
+ .forEach(file -> {
+ try {
+ Map<String, Object> fixed = parseJson(Files.readString(file));
+ String id = (String) fixed.get("id");
+ Object data = fixed.get("data");
+ int version = ((Number) fixed.get("fixedVersion")).intValue();
+
+ saveRecord(id, createDynamic(data, version));
+ logger.info("Imported manual fix for record {}", id);
+ } catch (Exception e) {
+ logger.error("Failed to import {}: {}", file, e.getMessage());
+ }
+ });
+ }
+ }
+}
+```
+
+---
+
+## Rollback Strategies
+
+### Important: Forward-Only Design
+
+Aether Datafixers is designed for **forward migration only**. True rollback requires:
+
+1. **Restore from backup** (recommended)
+2. **Write compensating fixes** (complex, not recommended)
+
+### Restore from Backup
+
+**Full Restore:**
+
+```bash
+# PostgreSQL
+pg_restore -d mydb backup_v100_20240115.dump
+
+# MongoDB
+mongorestore --db mydb ./backup_v100_20240115/mydb
+```
+
+**Selective Restore (specific records):**
+
+```sql
+-- PostgreSQL: Restore specific records from backup
+-- 1. Restore backup to temporary schema
+CREATE SCHEMA backup_restore;
+-- (run from a shell, not psql): pg_restore -d mydb -n backup_restore backup_v100.dump
+
+-- 2. Copy specific records
+INSERT INTO entities (id, type, data, data_version)
+SELECT id, type, data, data_version
+FROM backup_restore.entities
+WHERE id IN ('record1', 'record2', 'record3')
+ON CONFLICT (id) DO UPDATE
+SET data = EXCLUDED.data, data_version = EXCLUDED.data_version;
+
+-- 3. Clean up
+DROP SCHEMA backup_restore CASCADE;
+```
+
+### Compensating Fixes (Advanced)
+
+Only use when backup is unavailable and you understand the exact transformations to reverse.
+
+```java
+// Example: Reverse a field rename (name -> displayName back to name)
+public class ReverseRenameDisplayNameFix extends SchemaDataFix {
+
+ public ReverseRenameDisplayNameFix(Schema inputSchema, Schema outputSchema) {
+ super("reverse_rename_display_name", inputSchema, outputSchema);
+ }
+
+ @Override
+ protected TypeRewriteRule makeRule(Schema inputSchema, Schema outputSchema) {
+ return Rules.renameField(
+ TypeReferences.PLAYER,
+ "displayName", // current name
+ "name" // original name
+ );
+ }
+}
+```
+
+**Warning:** Compensating fixes are error-prone. Prefer backup restoration.
+
+---
+
+## Error Recovery Workflows
+
+### Workflow 1: FixException Recovery
+
+```
+1. Extract exception context
+ └─ Get fixName, fromVersion, toVersion, typeReference
+
+2. Enable DiagnosticContext
+ └─ captureSnapshots(true), captureRuleDetails(true)
+
+3. Reproduce with single record
+ └─ Run migration on isolated test record
+
+4. Analyze MigrationReport
+ └─ Check fix.beforeSnapshot vs fix.afterSnapshot
+ └─ Find exact rule that failed
+
+5. Identify root cause
+ ├─ Missing field? → Check input data
+ ├─ Wrong type? → Check codec/schema
+ └─ Logic error? → Check fix implementation
+
+6. Fix data or code
+ ├─ Data issue → Clean/transform data
+ └─ Code issue → Deploy fix, redeploy
+
+7. Retry migration
+ └─ Process failed records
+```
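+
+The first four steps of this workflow can be scripted. A minimal sketch, assuming `FixException` exposes the context above through accessors (the `getFixName()`-style names are assumptions) and using `loadRecord(...)` / `failedRecordId` as placeholders for your own data access:
+
+```java
+// Sketch only: the FixException accessor names and loadRecord(...) are
+// assumptions; adapt them to the actual API and your storage layer.
+try {
+    fixer.update(typeRef, data, fromVersion, toVersion, context);
+} catch (FixException e) {
+    // Step 1: extract the exception context
+    logger.error("Fix '{}' failed for {} (v{} -> v{})",
+        e.getFixName(), e.getTypeReference(),
+        e.getFromVersion(), e.getToVersion());
+
+    // Steps 2-3: reproduce on a single isolated record with full diagnostics
+    DiagnosticContext diag = DiagnosticContext.create(
+        DiagnosticOptions.builder()
+            .captureSnapshots(true)
+            .captureRuleDetails(true)
+            .build());
+    try {
+        fixer.update(typeRef, loadRecord(failedRecordId),
+            e.getFromVersion(), e.getToVersion(), diag);
+    } catch (FixException reproduced) {
+        // expected to fail again; the report keeps what was captured
+        // up to the failing rule
+    }
+
+    // Step 4: compare per-fix snapshots to locate the failing transformation
+    MigrationReport report = diag.getReport();
+    report.fixExecutions().forEach(fix ->
+        logger.debug("before={} after={}",
+            fix.beforeSnapshot(), fix.afterSnapshot()));
+}
+```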
+
+### Workflow 2: DecodeException Recovery
+
+```
+1. Get path from exception
+ └─ e.getPath() returns "player.inventory[0].item"
+
+2. Navigate to problematic field
+ └─ Use path to find exact location in data
+
+3. Determine expected vs actual type
+ └─ Check schema definition
+ └─ Compare with actual data
+
+4. Clean/transform data
+ ├─ Missing field? → Add default value
+ ├─ Wrong type? → Convert or remove
+ └─ Malformed? → Parse and fix
+
+5. Retry migration
+```
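+
+Step 4 usually reduces to a small, targeted cleanup pass. A sketch using the `Dynamic` accessors shown elsewhere in this guide (`cleanRecord`, the field names, and the default values are illustrative):
+
+```java
+// Illustrative cleanup for a record that fails to decode; the field
+// names and defaults are examples, not prescriptions.
+Dynamic<?> cleanRecord(Dynamic<?> data, DecodeException e) {
+    logger.warn("Decode failed at path {}; applying cleanup", e.getPath());
+
+    // Missing field -> add a default value
+    if (data.get("displayName").result().isEmpty()) {
+        data = data.set("displayName", data.createString("unknown"));
+    }
+
+    // Wrong type -> convert (here: an id stored as a number, not a string)
+    var numericId = data.get("id").asNumber().result();
+    if (numericId.isPresent()) {
+        data = data.set("id", data.createString(numericId.get().toString()));
+    }
+    return data;
+}
+```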
+
+### Workflow 3: RegistryException Recovery
+
+```
+1. Check missing type/version
+ └─ e.getMissingType() or e.getMissingVersion()
+
+2. Verify bootstrap registration
+ └─ Check DataFixerBootstrap implementation
+
+3. Check version chain completeness
+ └─ Ensure no gaps in version sequence
+
+4. Add missing registrations
+ └─ Register missing type or schema
+
+5. Redeploy and retry
+```
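+
+Step 3 can be automated as a guard test. The `registry.getSchema(int)` lookup and the version constants below are assumed names, not the library's exact API; the point is simply to walk the chain and fail on the first gap:
+
+```java
+// Illustrative guard: walk every version in the chain and assert that a
+// schema is registered for each (getSchema(...) is an assumed lookup).
+@Test
+void versionChainHasNoGaps() {
+    for (int version = FIRST_VERSION; version <= CURRENT_VERSION; version++) {
+        assertNotNull(registry.getSchema(version),
+            "No schema registered for version " + version);
+    }
+}
+```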
+
+---
+
+## Incident Response
+
+### Severity Levels
+
+| Level | Criteria | Response Time | Escalation |
+|-------|----------|---------------|------------|
+| P1 | All migrations failing | Immediate | On-call + Lead + Manager |
+| P2 | > 5% failure rate | 15 min | On-call + Lead |
+| P3 | > 1% failure rate | 1 hour | On-call |
+| P4 | Isolated failures | 4 hours | Next business day |
+
+### Incident Response Checklist
+
+#### Initial Response (0-5 min)
+
+```markdown
+## Initial Response Checklist
+
+- [ ] Acknowledge alert
+- [ ] Check metrics dashboard
+ - Current failure rate
+ - Error type breakdown
+ - Affected domains
+- [ ] Review recent deployments (last 24h)
+- [ ] Check actuator health endpoint
+- [ ] Initial assessment posted to incident channel
+```
+
+#### Investigation (5-30 min)
+
+```markdown
+## Investigation Checklist
+
+- [ ] Enable DEBUG logging for de.splatgames.aether.datafixers
+- [ ] Capture sample failures (3-5 records)
+- [ ] Enable DiagnosticContext on sample records
+- [ ] Analyze MigrationReport for patterns
+- [ ] Check database/storage health
+- [ ] Check upstream service health
+- [ ] Root cause hypothesis documented
+```
+
+#### Resolution
+
+```markdown
+## Resolution Checklist
+
+- [ ] Root cause confirmed
+- [ ] Fix identified
+ - [ ] Data fix (transformation/cleanup)
+ - [ ] Code fix (bug fix)
+ - [ ] Configuration fix (settings change)
+- [ ] Fix tested in staging
+- [ ] Fix deployed to production
+- [ ] Metrics returning to normal
+- [ ] Failed records reprocessed
+```
+
+#### Post-Incident
+
+```markdown
+## Post-Incident Checklist
+
+- [ ] Timeline documented
+- [ ] Root cause analysis complete
+- [ ] Post-mortem scheduled (within 48h)
+- [ ] Runbook updated if needed
+- [ ] Preventive measures identified
+- [ ] Follow-up tasks created
+```
+
+---
+
+## Data Validation After Recovery
+
+### Consistency Checks
+
+```sql
+-- Check for version consistency
+SELECT
+ type,
+ MIN(data_version) as min_version,
+ MAX(data_version) as max_version,
+ COUNT(*) as count
+FROM entities
+GROUP BY type;
+
+-- Check for orphaned references
+SELECT e.id, e.type
+FROM entities e
+LEFT JOIN entities parent ON e.parent_id = parent.id
+WHERE e.parent_id IS NOT NULL AND parent.id IS NULL;
+```
+
+### Version Alignment
+
+```java
+public void verifyVersionAlignment(int expectedVersion) {
+ // Count records at wrong version
+ long wrongVersion = entityRepository.countByDataVersionNot(expectedVersion);
+
+ if (wrongVersion > 0) {
+ logger.error("Found {} records at wrong version (expected {})",
+ wrongVersion, expectedVersion);
+
+ // List samples
+        List<Entity> samples = entityRepository
+ .findByDataVersionNot(expectedVersion, PageRequest.of(0, 10));
+
+ for (Entity e : samples) {
+ logger.error(" {} at version {} (expected {})",
+ e.getId(), e.getDataVersion(), expectedVersion);
+ }
+ } else {
+ logger.info("All records at expected version {}", expectedVersion);
+ }
+}
+```
+
+### Functional Verification
+
+```java
+@Test
+void verifyMigrationSuccess() {
+ // Load sample migrated records
+    List<Entity> samples = entityRepository.findRandomSample(100);
+
+ for (Entity entity : samples) {
+ // Verify can decode at current version
+ assertDoesNotThrow(() -> {
+            Typed<?> typed = fixer.decode(
+ CURRENT_VERSION,
+ entity.getTypeReference(),
+ entity.getData()
+ );
+ assertNotNull(typed.getValue());
+ }, "Failed to decode entity " + entity.getId());
+
+ // Verify key fields present
+        Dynamic<?> data = entity.getData();
+ assertTrue(data.get("id").asString().result().isPresent());
+ assertTrue(data.get("_version").asNumber().result().isPresent());
+ }
+}
+```
+
+---
+
+## Related
+
+- [Error Scenarios](error-scenarios.md) — Exception handling reference
+- [Debugging Guide](debugging-guide.md) — Diagnosing issues
+- [Monitoring & Alerting](monitoring-alerting.md) — Detecting problems
+- [Troubleshooting](../troubleshooting/index.md) — Quick fixes
diff --git a/docs/troubleshooting/index.md b/docs/troubleshooting/index.md
index 1012d62..98544fa 100644
--- a/docs/troubleshooting/index.md
+++ b/docs/troubleshooting/index.md
@@ -8,6 +8,12 @@ Solutions to common issues with Aether Datafixers.
 - [Debugging Tips](debugging-tips.md) — Strategies for finding issues
 - [FAQ](faq.md) — Frequently asked questions
 
+## Operations Runbook
+
+For production operations, incident response, and recovery procedures, see the [Operations Runbook](../operations/index.md).
+
+---
+
 ## Quick Fixes
 
 ### Migration Not Applied
diff --git a/scripts/structure.py b/scripts/structure.py
new file mode 100644
index 0000000..fdf1d0f
--- /dev/null
+++ b/scripts/structure.py
@@ -0,0 +1,104 @@
+import os
+import fnmatch
+import argparse
+
+def apply_color(text, color_code, use_colors):
+ return f"\033[{color_code}m{text}\033[0m" if use_colors else text
+
+def print_structure(dir_path, depth, file_pattern, use_colors, excluded_folders):
+ try:
+ entries = os.listdir(dir_path)
+ except Exception:
+ return
+
+ if not entries:
+ return
+
+ full_paths = [os.path.join(dir_path, entry) for entry in entries]
+    # directories first, then alphabetical, for deterministic output
+    full_paths.sort(key=lambda p: (os.path.isfile(p), os.path.basename(p).lower()))
+
+ for path in full_paths:
+ name = os.path.basename(path)
+ if name in {".", ".."} or name in excluded_folders:
+ continue
+ indent = " " * (depth * 4)
+ if os.path.isdir(path):
+ print(f"{indent}" + apply_color(f"📁 {name}", "34", use_colors))
+ print_structure(path, depth + 1, file_pattern, use_colors, excluded_folders)
+ elif fnmatch.fnmatch(name, file_pattern):
+ print(f"{indent} " + apply_color(f"📄 {name}", "32", use_colors))
+
+def print_dependency_tree(dir_path, depth=0, file_pattern="*.java", use_colors=True, excluded_folders=frozenset()):
+ try:
+ entries = os.listdir(dir_path)
+ except Exception:
+ return
+
+ entries.sort()
+ for entry in entries:
+ full_path = os.path.join(dir_path, entry)
+ if entry in excluded_folders or entry in {".", ".."}:
+ continue
+ indent = "| " * depth + "|-- "
+ if os.path.isdir(full_path):
+ print(indent + apply_color(f"{entry}/", "34", use_colors))
+ print_dependency_tree(full_path, depth + 1, file_pattern, use_colors, excluded_folders)
+ elif fnmatch.fnmatch(entry, file_pattern):
+ print(indent + apply_color(entry, "32", use_colors))
+
+def export_structure_to_file(output_file, mode, file_pattern, root_path, excluded_folders):
+ with open(output_file, "w", encoding="utf-8") as f:
+ def write_structure(dir_path, depth):
+ try:
+ entries = os.listdir(dir_path)
+ except Exception:
+ return
+
+ entries.sort()
+ for entry in entries:
+ full_path = os.path.join(dir_path, entry)
+ if entry in excluded_folders or entry in {".", ".."}:
+ continue
+ indent = " " * depth
+ if os.path.isdir(full_path):
+ f.write(f"{indent}{entry}/\n")
+ write_structure(full_path, depth + 1)
+ elif fnmatch.fnmatch(entry, file_pattern):
+ f.write(f"{indent}{entry}\n")
+
+ f.write("Package Structure:\n")
+ if mode == "tree":
+ f.write(f"ROOT ({root_path})\n")
+ write_structure(root_path, 1)
+ else:
+ f.write(f"ROOT ({root_path})\n")
+ write_structure(root_path, 1)
+
+def print_package_structure(root_path, mode="default", file_pattern="*.java", output_file=None, use_colors=True, excluded_folders=frozenset()):
+    if not os.path.isdir(root_path):
+        print("❌ Root path does not exist or is not a directory: " + root_path)
+ return
+
+ print(f"\n📂 Package Structure ({root_path}):")
+ if mode == "tree":
+ print("|-- ROOT")
+ print_dependency_tree(root_path, 1, file_pattern, use_colors, excluded_folders)
+ else:
+ print("📁 ROOT")
+ print_structure(root_path, 1, file_pattern, use_colors, excluded_folders)
+
+ if output_file:
+ export_structure_to_file(output_file, mode, file_pattern, root_path, excluded_folders)
+ print(f"\n✅ Structure exported to {output_file}")
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Print the package structure of a Java project.")
+ parser.add_argument("root_path", type=str, help="Root directory of the project")
+ parser.add_argument("mode", nargs="?", choices=["default", "tree"], default="default", help="Output mode")
+ parser.add_argument("--filter", type=str, default="*.java", help="Filter files by wildcard pattern (e.g., '*.java')")
+ parser.add_argument("--output", type=str, help="Export output to a file")
+ parser.add_argument("--no-color", action="store_true", help="Disable colored output")
+ parser.add_argument("--exclude", type=str, nargs="*", default=["target", "build", ".git", "node_modules"], help="Folders to exclude")
+
+ args = parser.parse_args()
+ print_package_structure(args.root_path, args.mode, args.filter, args.output, not args.no_color, set(args.exclude))