diff --git a/README.md b/README.md
index 7a1797b..f0f75c7 100644
--- a/README.md
+++ b/README.md
@@ -149,9 +149,9 @@ See [docs/DLOCKSS_PROTOCOL.md](docs/DLOCKSS_PROTOCOL.md) for protocol details.
 D-LOCKSS acts as a self-healing, sharded storage cluster using the IPFS/Libp2p stack.
 
 ### Key Components
-1.  **Shard Manager:** Dynamically splits responsibilities based on peer count to maintain scalability.
+1.  **Shard Manager:** Dynamically splits responsibilities based on peer count to maintain scalability. Delegates lifecycle decisions (split/merge/discovery) to a `lifecycleManager` and replication to a `replicationManager`.
 2.  **Cluster Manager:** Manages embedded **IPFS Cluster** instances (one per shard) using **CRDTs** for state consensus; nodes in a shard sync and pin content assigned to that shard.
-3.  **File Watcher:** Monitors the data directory to automatically ingest content.
+3.  **File Watcher:** Monitors the data directory to automatically ingest content (via `handleWatcherEvent` / `handleNewDirectory`).
 4.  **Storage Monitor:** Protects nodes from disk exhaustion by rejecting custodial requests when full.
 5.  **BadBits Manager:** Enforces content blocking (e.g., DMCA) based on configured country codes.
 
@@ -186,14 +186,6 @@ go build -o dlockss-monitor ./cmd/dlockss-monitor
 ```
 Open http://localhost:8080. The monitor displays each node's **name** (if configured via `DLOCKSS_NODE_NAME`), falling back to the Peer ID. Names propagate via HEARTBEAT/JOIN messages and appear in the node table, charts, and shard modals. Client-side aliases (EDIT button) override server-side names. Each node has **one peer ID**: when `DLOCKSS_IPFS_CONFIG` is set (e.g. in testnet), D-LOCKSS uses the IPFS repo identity so the same ID appears in the monitor and in `node_x.ipfs.log`.
 
-For geographic region display, optionally provide a GeoIP database:
-```bash
-./dlockss-monitor --geoip-db /path/to/GeoLite2-City.mmdb
-# or via environment variable:
-export DLOCKSS_MONITOR_GEOIP_DB=/path/to/GeoLite2-City.mmdb
-```
-Without a local database, the monitor falls back to the ip-api.com batch API with permanent caching.
-
 The monitor bootstrap-subscribes to all shards up to depth 6 (127 shards) so it can see nodes even when started late. Set `DLOCKSS_MONITOR_BOOTSTRAP_SHARD_DEPTH` (0–12) to tune.
 
 Alternatively use: https://dlockss-monitor.wmcloud.org.
@@ -207,7 +199,7 @@ go test ./... -v
 ```
 
 ### Project Status
-*   **Current Phase:** Production — active refactoring for code quality and operational robustness (see [Code Elegance Plan](docs/CODE_ELEGANCE_PLAN.md)).
+*   **Current Phase:** Production — structural refactoring complete (see [Code Elegance Plan](docs/CODE_ELEGANCE_PLAN.md)). Config uses nested sub-structs (`Sharding`, `Replication`, `Files`, `Security`, `Orphan`). ShardManager delegates to `replicationManager` and `lifecycleManager`.
 
 ---
 
diff --git a/cmd/dlockss-monitor/main.go b/cmd/dlockss-monitor/main.go
index 06e0422..1ecabb4 100644
--- a/cmd/dlockss-monitor/main.go
+++ b/cmd/dlockss-monitor/main.go
@@ -4,7 +4,6 @@ package main
 import (
 	"context"
 	"errors"
-	"flag"
 	"fmt"
 	"log"
 	"log/slog"
@@ -21,9 +20,6 @@ import (
 func main() {
 	slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelInfo})))
 
-	geoipDB := flag.String("geoip-db", "", "Path to a MaxMind/DB-IP .mmdb GeoIP database file")
-	flag.Parse()
-
 	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
 	defer cancel()
 
@@ -49,11 +45,7 @@ func main() {
 		slog.Info("topic name from env", "topic", cfg.TopicName)
 	}
 
-	geoDBPath := *geoipDB
-	if geoDBPath == "" {
-		geoDBPath = os.Getenv("DLOCKSS_MONITOR_GEOIP_DB")
-	}
-	m := monitor.NewMonitor(cfg, geoDBPath)
+	m := monitor.NewMonitor(cfg)
 	defer m.Close()
 
 	h, err := monitor.StartLibP2P(ctx, m)
diff --git a/cmd/dlockss/main.go b/cmd/dlockss/main.go
index ea1d23f..c68d408 100644
--- a/cmd/dlockss/main.go
+++ b/cmd/dlockss/main.go
@@ -22,7 +22,6 @@ import (
 	"dlockss/internal/managers/shard"
 	"dlockss/internal/managers/storage"
 	"dlockss/internal/signing"
-	"dlockss/internal/telemetry"
 	"dlockss/internal/trust"
 	"dlockss/pkg/ipfs"
 	"dlockss/pkg/schema"
@@ -176,8 +175,8 @@ func main() {
 	go discovery.RunPeerFinder(ctx, h, routingDiscovery, cfg.DiscoveryServiceTag)
 
 	// Trust (optional: load peers if file exists)
-	trustMgr := trust.NewTrustManager(cfg.TrustMode)
-	if err := trustMgr.LoadTrustedPeers(cfg.TrustStorePath); err != nil && !os.IsNotExist(err) {
+	trustMgr := trust.NewTrustManager(cfg.Security.TrustMode)
+	if err := trustMgr.LoadTrustedPeers(cfg.Security.TrustStorePath); err != nil && !os.IsNotExist(err) {
 		slog.Warn("trust store load failed", "error", err)
 	}
 
@@ -191,8 +190,7 @@ func main() {
 	defer dstore.Close()
 
 	rateLimiter := common.NewRateLimiter(cfg.RateLimitWindow, cfg.MaxMessagesPerWindow)
-	metrics := telemetry.NewMetricsManager(cfg)
-	storageMgr := storage.NewStorageManager(cfg, dht, metrics, badBitsFilter)
+	storageMgr := storage.NewStorageManager(cfg, dht, badBitsFilter)
 	signer := signing.NewSigner(signing.SignerConfig{
 		Cfg:      cfg,
 		Host:     h,
@@ -213,7 +211,7 @@ func main() {
 		}
 		// Provide manifest in its own goroutine with its own timeout.
 		go func() {
-			pctx, pcancel := context.WithTimeout(ctx, cfg.DHTProvideTimeout)
+			pctx, pcancel := context.WithTimeout(ctx, cfg.Files.DHTProvideTimeout)
 			defer pcancel()
 			storageMgr.ProvideFile(pctx, manifestCIDStr)
 		}()
@@ -224,7 +222,7 @@ func main() {
 		// this call adds the missing pin entry.  Blocks are already local
 		// from the manifest's recursive pin so this returns quickly.
 		go func() {
-			pctx, pcancel := context.WithTimeout(ctx, cfg.DHTProvideTimeout)
+			pctx, pcancel := context.WithTimeout(ctx, cfg.Files.DHTProvideTimeout)
 			defer pcancel()
 			manifestCID, err := cid.Decode(manifestCIDStr)
 			if err != nil {
@@ -274,7 +272,6 @@ func main() {
 		PubSub:      ps,
 		IPFSClient:  ipfsClient,
 		Storage:     storageMgr,
-		Metrics:     metrics,
 		Signer:      signer,
 		RateLimiter: rateLimiter,
 		Cluster:     clusterMgr,
@@ -286,17 +283,7 @@ func main() {
 	clusterMgr.SetShardPeerProvider(shardMgr) // CRDT Peers() and allocations use real shard membership
 	announcePinned = shardMgr.AnnouncePinned
 
-	metrics.RegisterProviders(shardMgr, storageMgr, rateLimiter)
-	metrics.RegisterClusterProvider(clusterMgr) // cluster-style metrics: pins/peers/allocations per shard
-	metrics.SetPeerID(h.ID().String())
-
-	// Telemetry and API
-	tc := telemetry.NewTelemetryClient(cfg, h, ps, metrics)
-	if tc != nil {
-		tc.SetShardPublisher(shardMgr, shardMgr)
-		tc.Start(ctx)
-	}
-	apiServer := api.NewAPIServer(cfg.APIPort, metrics)
+	apiServer := api.NewAPIServer(cfg.APIPort)
 	apiServer.Start()
 
 	// File processor and watcher
@@ -330,6 +317,7 @@ func main() {
 
 	<-ctx.Done()
 	slog.Info("shutting down")
+	fp.Stop()
 	if err := shardMgr.Close(); err != nil {
 		slog.Error("shard manager close error", "error", err)
 	}
diff --git a/docs/DLOCKSS_PROTOCOL.md b/docs/DLOCKSS_PROTOCOL.md
index 65134db..c5b3d77 100644
--- a/docs/DLOCKSS_PROTOCOL.md
+++ b/docs/DLOCKSS_PROTOCOL.md
@@ -171,7 +171,7 @@ Nodes in any shard (not just root) periodically run discovery to join existing d
 
 ### 6.2 Replication & Repair
 
-Replication is handled by the **Cluster Manager** and **LocalPinTracker** per shard:
+Replication is handled by the **Cluster Manager**, **LocalPinTracker**, and **replicationManager** (a delegate of ShardManager):
 
 1.  **Pinning**: When a responsible node ingests or accepts a file, it calls `ClusterManager.Pin(ctx, shardID, cid, ...)` on the shard's embedded cluster. Allocations are chosen deterministically from Peers() (shard mesh via ShardPeerProvider).
 2.  **State Sync**: The CRDT (Merkle-DAG based) propagates pin/unpin to all peers in the shard via PubSub (`dlockss-shard-<id>`).
@@ -179,7 +179,10 @@ Replication is handled by the **Cluster Manager** and **LocalPinTracker** per sh
     *   Each node runs a `LocalPinTracker` per shard that polls CRDT State() (and on TriggerSync).
     *   For each pin in state, if this node is in **Allocations** (or Allocations is empty), it pins the ManifestCID locally via IPFS. The **onPinSynced** callback then: (a) registers the file with StorageManager, (b) announces PINNED, (c) resolves the PayloadCID from the manifest, (d) **pins the PayloadCID as its own root** so Kubo's reprovider (`pinned` strategy) re-announces it, and (e) provides both ManifestCID and PayloadCID to the DHT. On the ingesting node the payload is already a pin root from `ImportFile`; on replicas only the ManifestCID was pinned, so step (d) adds the missing pin entry — blocks are already local from the manifest's recursive pin so this returns quickly.
     *   Pins no longer in state or no longer allocated are unpinned locally and onPinRemoved is called.
-4.  **Repair**: Under-replicated files trigger ReplicationRequest on the shard topic; peers that have the file JoinShard(targetShard), Pin, TriggerSync. CRDT sync and LocalPinTracker then replicate to allocated peers.
+4.  **Repair (replicationManager)**: The `replicationManager` (in `shard_replication.go`) periodically broadcasts `ReplicationRequest` messages for all pinned manifests (cooldown: 5 minutes per manifest). Receiving peers handle requests as follows:
+    *   **Already pinned**: Ensure cluster membership and trigger CRDT sync.
+    *   **Not pinned (auto-replication)**: Fetch the manifest via `PinRecursive` (timeout: 5 minutes), add to cluster via `ClusterPinIfAbsent`, and trigger sync. Concurrency is bounded by a semaphore (default: 5 concurrent auto-replications).
+    *   Legacy manifests (containing a `ts` field) are silently ignored.
 
 ### 6.2.1 Heartbeat-Driven Re-Pin and Re-Provide
 
@@ -260,8 +263,8 @@ volumes:
 
 ### 7.2 Integrity & Authenticity
 *   **Content Addressing**: CIDs guarantee content integrity.
-*   **Signatures**: All `ResearchObjects` and protocol messages are signed by the sender's private key.
-*   **Nonces**: Protocol messages include nonces to prevent replay attacks.
+*   **Signatures**: All `ResearchObjects` and protocol messages are signed by the sender's private key. The `Signer` type (in `internal/signing/`) handles signing and verification, with `verifySignedMessage` decomposed into focused steps: field validation (`validateMessageFields`), timestamp checking (`checkTimestamp`), public key retrieval (`fetchPublicKey`), and cryptographic verification (`verifySignatureBytes`).
+*   **Nonces**: Protocol messages include nonces (generated by the signing package's internal `newNonce` function) to prevent replay attacks. A `nonceStore` tracks recently seen nonces.
 
 ### 7.3 Liar Detection
 *   Nodes verify that the actual file size matches the `TotalSize` claimed in the `ResearchObject` manifest.
diff --git a/docs/REPLICATION_PERFORMANCE.md b/docs/REPLICATION_PERFORMANCE.md
index a28a7a2..b67ad52 100644
--- a/docs/REPLICATION_PERFORMANCE.md
+++ b/docs/REPLICATION_PERFORMANCE.md
@@ -1,136 +1,122 @@
 # Replication Performance Analysis
 
+## Architecture Overview
+
+Replication in D-LOCKSS is driven by two complementary mechanisms:
+
+1. **CRDT Cluster Sync** — Each shard runs an embedded IPFS Cluster with CRDT consensus. When a file is pinned to a shard's cluster, the `LocalPinTracker` on every peer in that shard automatically syncs and pins the content locally.
+
+2. **ReplicationRequest Protocol** — The `replicationManager` (extracted from `ShardManager`) periodically broadcasts `ReplicationRequest` messages for pinned manifests. Peers that don't yet have the file perform **auto-replication**: fetch via `PinRecursive` and add to the cluster.
+
+## Key Constants and Defaults
+
+| Parameter | Default | Env Variable | Location |
+|-----------|---------|-------------|----------|
+| Replication Check Interval | 1 minute | `DLOCKSS_CHECK_INTERVAL` | `config.Replication.CheckInterval` |
+| Root Shard Check Interval | 20 seconds | (hardcoded) | `rootReplicationCheckInterval` |
+| Request Cooldown Per Manifest | 5 minutes | (hardcoded) | `replicationRequestCooldownDuration` |
+| Max Requests Per Cycle | 50 | (hardcoded) | `maxReplicationRequestsPerCycle` |
+| Auto-Replication Enabled | true | `DLOCKSS_AUTO_REPLICATION_ENABLED` | `config.Replication.AutoReplicationEnabled` |
+| Auto-Replication Timeout | 5 minutes | `DLOCKSS_AUTO_REPLICATION_TIMEOUT` | `config.Replication.AutoReplicationTimeout` |
+| Max Concurrent Checks | 5 | `DLOCKSS_MAX_CONCURRENT_CHECKS` | `config.Replication.MaxConcurrentReplicationChecks` |
+| Pin Reannounce Interval | 2 minutes | `DLOCKSS_PIN_REANNOUNCE_INTERVAL` | `config.Replication.PinReannounceInterval` |
+| Min Replication | 5 | `DLOCKSS_MIN_REPLICATION` | `config.Replication.MinReplication` |
+| Max Replication | 10 | `DLOCKSS_MAX_REPLICATION` | `config.Replication.MaxReplication` |
+
+## Convergence Timeline
+
+For a newly ingested file to reach full replication across a shard:
+
+1. **Ingest** (immediate): File pinned locally, `IngestMessage` broadcast to shard, cluster `Pin()` called.
+2. **CRDT Sync** (seconds): Cluster state propagates to peers via PubSub; `LocalPinTracker` detects new pin and starts `PinRecursive`.
+3. **First Replication Check** (up to 20s at root, 1m elsewhere): `replicationManager.runChecker()` sends `ReplicationRequest` for all pinned manifests.
+4. **Auto-Replication** (seconds to minutes): Peers receiving the request that don't have the file fetch it via `PinRecursive` (up to 5-minute timeout).
+5. **Cooldown** (5 minutes): After sending a request for a manifest, no new request is sent for that manifest for 5 minutes.
+
+**Typical convergence**: Most files replicate within 1-2 minutes via CRDT sync alone. Files that fail the initial sync (large DAGs, slow block propagation) recover on the next replication cycle after the 5-minute cooldown.
+
 ## Current Bottlenecks
 
-Based on code analysis, the following factors contribute to slow replication convergence:
-
-### 1. **Replication Check Interval** (Default: 1 minute)
-- **Location**: `CheckInterval = 1*time.Minute`
-- **Impact**: Replication levels are only checked once per minute
-- **Effect**: Minimum delay of 1 minute before detecting under-replication
-
-### 2. **Hysteresis Verification Delay** (Default: 30 seconds)
-- **Location**: `ReplicationVerificationDelay = 30*time.Second`
-- **Impact**: When under-replication is detected, system waits ~30 seconds before triggering replication requests
-- **Effect**: Adds 30+ seconds delay before NEED messages are broadcast
-- **Rationale**: Prevents false alarms from transient DHT issues
-
-### 3. **Replication Check Cooldown** (Default: 15 seconds)
-- **Location**: `ReplicationCheckCooldown = 15*time.Second`
-- **Impact**: Prevents checking the same file more than once every 15 seconds
-- **Effect**: Limits how quickly replication can be re-checked after a change
-
-### 4. **Replication Cache TTL** (Default: 5 minutes)
-- **Location**: `ReplicationCacheTTL = 5*time.Minute`
-- **Impact**: Cached replication counts prevent frequent DHT queries
-- **Effect**: Replication counts may be stale for up to 5 minutes
-- **Trade-off**: Reduces DHT load but slows convergence detection
-
-### 5. **DHT Query Timeout** (Default: 2 minutes)
-- **Location**: `context.WithTimeout(ctx, 2*time.Minute)` in `checkReplication()`
-- **Impact**: DHT queries can take up to 2 minutes to timeout
-- **Effect**: Slow DHT queries delay replication checks
-
-### 6. **DHT Max Sample Size** (Default: 50)
-- **Location**: `DHTMaxSampleSize = 50`
-- **Impact**: Limits how many providers are queried per DHT lookup
-- **Effect**: May underestimate replication count in large networks
-
-### 7. **Worker Pool Limit** (Default: 10 concurrent checks)
-- **Location**: `MaxConcurrentReplicationChecks = 10`
-- **Impact**: Limits parallelism of replication checks
-- **Effect**: With many files, checks are serialized
-
-### 8. **Missing Automatic Replication**
-- **Issue**: When a `ReplicationRequest` is received, nodes only check replication - they don't automatically fetch and pin missing files
-- **Impact**: Nodes must already have the file to replicate it
-- **Effect**: Replication requests don't trigger new replication, only verify existing state
-
-## Total Minimum Delay
-
-For a new file to reach target replication:
-1. **Initial check**: Up to 1 minute (CheckInterval)
-2. **Verification delay**: ~30 seconds (ReplicationVerificationDelay)
-3. **Replication request broadcast**: Immediate
-4. **Other nodes check**: Up to 1 minute (their CheckInterval)
-5. **Re-check after replication**: Up to 1 minute + 15 seconds cooldown
-
-**Minimum time to convergence**: ~3-4 minutes in ideal conditions
-**With DHT delays**: Can be 5-10 minutes or more
+### 1. Request Cooldown (5 minutes)
+
+Once a `ReplicationRequest` is sent for a manifest, `replicationRequestCooldownDuration` prevents resending for 5 minutes. If the first request fails (e.g., the receiving peer's `PinRecursive` times out), the file appears "stuck" until the cooldown expires.
+
+**Trade-off**: The cooldown prevents request flooding at the cost of visible delays for files that fail on the first attempt.
+
+### 2. Auto-Replication Timeout (5 minutes)
+
+`PinRecursive` for large files or over slow links may hit the `AutoReplicationTimeout`. The file remains unreplicated until a later replication cycle re-requests it, which cannot happen before the per-manifest request cooldown expires.
+
+**Mitigation**: Heartbeat-driven re-pin gradually fills in missing blocks (see below).
+
+### 3. Concurrent Replication Limit (5)
+
+The `replicationManager.sem` channel limits concurrent auto-replications to `MaxConcurrentReplicationChecks` (default 5). When all slots are occupied, additional `ReplicationRequest` messages are dropped without being queued; the node logs `"auto-replication skipped, concurrency limit reached"` for each one.
+
+**Mitigation**: Increase `DLOCKSS_MAX_CONCURRENT_CHECKS` for nodes with sufficient bandwidth.
+
+### 4. Max Requests Per Cycle (50)
+
+At most 50 `ReplicationRequest` messages are sent per checker cycle. With thousands of files, not all manifests are requested in a single cycle.
+
+**Mitigation**: Subsequent cycles pick up remaining manifests. The cooldown map ensures already-sent requests aren't duplicated.
+
+## Heartbeat-Driven Gradual DAG Completion (Built-In)
+
+Every heartbeat (~10s), each node picks **one** pinned manifest CID (round-robin) and:
+
+1. **Re-pins the ManifestCID recursively** (`PinRecursive`, 2-minute timeout). Idempotent — returns instantly when the DAG is already complete locally, and incrementally fetches missing blocks otherwise.
+2. **Pins the PayloadCID as its own root** so Kubo's reprovider (`pinned` strategy) re-announces it.
+3. **Provides both CIDs to the DHT** (only if the re-pin succeeded).
+
+A `CompareAndSwap` guard prevents concurrent re-provides from piling up.
+
+**Impact**: Resource-constrained nodes (e.g., Raspberry Pis) that failed the initial `PinRecursive` gradually complete the DAG over successive heartbeats without manual intervention. DHT provider records (which expire after ~24h) are kept fresh.
 
 ## Optimization Options
 
-### Option 1: Reduce Check Interval (Quick Win)
+### Reduce Check Interval (Quick Win)
 ```bash
 export DLOCKSS_CHECK_INTERVAL=15s  # Default: 1m
 ```
-**Pros**: Faster detection of under-replication
-**Cons**: More DHT queries, higher CPU usage
-**Recommendation**: Use 15-30s for testnets
+Faster detection at non-root shards. Root shards already check every 20s.
 
-### Option 3: Reduce Replication Cooldown (Quick Win)
+### Increase Concurrent Checks (Moderate Impact)
 ```bash
-export DLOCKSS_REPLICATION_COOLDOWN=5s  # Default: 15s
+export DLOCKSS_MAX_CONCURRENT_CHECKS=10  # Default: 5
 ```
-**Pros**: Faster re-checking after replication changes
-**Cons**: More frequent checks of same files
-**Recommendation**: Use 5s for testnets
+More parallel auto-replications. Higher bandwidth usage.
 
-### Option 5: Increase Worker Pool (Moderate Impact)
+### Increase Auto-Replication Timeout (Large Files)
 ```bash
-export DLOCKSS_MAX_CONCURRENT_CHECKS=20  # Default: 10
+export DLOCKSS_AUTO_REPLICATION_TIMEOUT=10m  # Default: 5m
 ```
-**Pros**: More parallel replication checks
-**Cons**: Higher CPU/memory usage
-**Recommendation**: Use 20-30 for testnets with many files
-
-### Option 8: Implement Automatic Replication (Major Feature)
-**Code Change Required**: Add logic to fetch and pin files when receiving ReplicationRequest
-**Pros**: Actually triggers replication, not just checks
-**Cons**: Requires IPFS content fetching, bandwidth usage
-**Recommendation**: High priority for production
+Allows more time for large DAG fetches. Ties up semaphore slots longer.
 
 ## Recommended Testnet Configuration
 
-For faster convergence in testnets, use:
+For faster convergence in testnets:
 
 ```bash
 export DLOCKSS_CHECK_INTERVAL=15s
-export DLOCKSS_REPLICATION_VERIFICATION_DELAY=5s
-export DLOCKSS_REPLICATION_COOLDOWN=5s
-export DLOCKSS_REPLICATION_CACHE_TTL=30s
-export DLOCKSS_MAX_CONCURRENT_CHECKS=20
-export DLOCKSS_DHT_MAX_SAMPLE_SIZE=100
+export DLOCKSS_MAX_CONCURRENT_CHECKS=10
 ```
 
-This reduces minimum convergence time from ~3-4 minutes to ~30-60 seconds.
-
-### Heartbeat-Driven Gradual DAG Completion (Built-In)
-
-Every heartbeat (~10s), each node picks one pinned manifest (round-robin) and calls `PinRecursive` with a 2-minute timeout. This is idempotent: if the DAG is already fully local it returns instantly, otherwise it incrementally fetches the missing blocks. On success, the manifest and payload CIDs are re-provided to the DHT.
-
-**Impact on resource-constrained nodes (Raspberry Pis):**
-- Initial `PinRecursive` during replication may time out or OOM before fetching the full DAG.
-- Instead of leaving the file permanently incomplete, subsequent heartbeats gradually fetch the remaining blocks.
-- After all blocks are local, Kubo's reprovider stops emitting "block not found locally, cannot provide" errors.
-- DHT provider records (which expire after ~24h) are kept fresh without relying solely on Kubo's reprovider.
-
-No configuration needed — this runs automatically on every node.
-
 ## Production Considerations
 
-For production networks:
-- Keep `ReplicationVerificationDelay` at 30s to prevent false alarms
-- Keep `ReplicationCacheTTL` at 5m to reduce DHT load
-- Keep `CheckInterval` at 1m for reasonable resource usage
-- Consider implementing Option 8 (automatic replication) for better convergence
+- Keep `CheckInterval` at 1m for reasonable resource usage (root shards already use 20s).
+- Keep `AutoReplicationTimeout` at 5m unless dealing with consistently large files.
+- The 5-minute request cooldown is a deliberate trade-off between convergence speed and network overhead; files that fail on the first attempt self-heal after the cooldown expires.
 
 ## Monitoring
 
-Watch these metrics to understand replication performance:
-- `replicationChecks`: Number of checks performed
-- `dhtQueries`: Number of DHT queries
-- `dhtQueryTimeouts`: DHT query failures
-- `filesAtTargetReplication`: Files with adequate replication
-- `lowReplicationFiles`: Files needing replication
-- `avgReplicationLevel`: Average replication across all files
+The monitor's `replication snapshot` log line reports:
+- `total_manifests`: Number of known manifests
+- `total_at_target`: Files with replica count >= min(MinReplication, shard_peer_count)
+- `avg_replication`: Average replica count across all manifests
+
+Node daemon logs to watch:
+- `"auto-replication: fetched and pinned"` — successful auto-replication
+- `"auto-replication: failed to fetch/pin"` — `PinRecursive` timeout or failure
+- `"auto-replication skipped, concurrency limit reached"` — semaphore full
+- `"ReplicationRequest sent"` — outbound request (debug level)
diff --git a/docs/component_diagram.puml b/docs/component_diagram.puml
index 38463f6..04c664c 100644
--- a/docs/component_diagram.puml
+++ b/docs/component_diagram.puml
@@ -2,6 +2,8 @@
 
 package "Core Components" {
     component "ShardManager" as ShardManager
+    component "replicationManager" as ReplMgr
+    component "lifecycleManager" as LifecycleMgr
     component "ClusterManager" as ClusterManager
     component "StorageManager" as StorageManager
     component "FileProcessor" as FileProcessor
@@ -9,6 +11,8 @@ package "Core Components" {
     component "Rate Limiter" as RateLimiter
 }
 
+ShardManager --> ReplMgr : delegates replication
+ShardManager --> LifecycleMgr : delegates split/merge/discovery
 ShardManager --> ClusterManager : joins/leaves shards
 ClusterManager --> StorageManager : onPinSynced/onPinRemoved
 FileProcessor --> StorageManager : pins files, adds known
@@ -19,10 +23,11 @@ Metrics --> ClusterManager : GetClusterMetrics (pins/peers per shard)
 
 note right of ShardManager
     Manages shard assignment and PubSub topics.
-    Lifecycle decisions (split/merge/discovery) delegated
-    to internal lifecycleManager.
-    Replication logic split into focused files:
-    shard_msg_handlers, shard_reshard, shard_maintenance.
+    Lifecycle decisions delegated to lifecycleManager.
+    Replication requests delegated to replicationManager.
+    Code organized into focused files:
+    shard_loops, shard_msg_handlers, shard_reshard,
+    shard_orphan, shard_legacy, shard_replication.
     Stability (v0.0.3):
     - Move cooldown (30s) after any transition
     - Jittered discovery timers
diff --git a/docs/data_structures.puml b/docs/data_structures.puml
index feab079..50744f2 100644
--- a/docs/data_structures.puml
+++ b/docs/data_structures.puml
@@ -59,10 +59,9 @@ class ShardManager {
     - ipfsClient: ipfs.IPFSClient
     - storageMgr: StorageProvider
     - clusterMgr: ClusterManagerInterface
-    - metrics: *telemetry.MetricsManager
     - signer: MessageAuthenticator
     - rateLimiter: *common.RateLimiter
-    - peers: *PeerTracker
+    - peers: *peerTracker
     - mu: sync.RWMutex
     - currentShard: string
     - shardSubs: map[string]*shardSubscription
@@ -70,8 +69,8 @@ class ShardManager {
     - observerOnlyShards: map[string]struct{}
     - reshardedFiles: *common.KnownFiles
     - orphanHandoffSent: map[...]
-    - autoReplicationSem: chan struct{}
     - reprovideInFlight: atomic.Bool
+    - repl: *replicationManager
     - lifecycle: *lifecycleManager
     --
     + NewShardManager(cfg ShardManagerConfig): (*ShardManager, error)
@@ -91,6 +90,25 @@ class ShardManager {
     + AnnouncePinned(manifestCID)
 }
 
+class replicationManager {
+    - ops: replicationOps
+    - mu: sync.Mutex
+    - cooldown: map[string]time.Time
+    - sem: chan struct{}
+    --
+    + runChecker()
+    - sendReplicationRequests(ctx, cfg, shard, manifests)
+    - handleRequest(msg, rr, shardID)
+    - pruneCooldown()
+}
+note right of replicationManager
+  Extracted from ShardManager.
+  Owns replication request cooldown
+  and auto-replication semaphore.
+  Interacts with ShardManager via
+  replicationOps interface.
+end note
+
 class lifecycleManager {
     - ops: lifecycleOps
     - cfg: *config.Config
@@ -108,14 +126,18 @@ class lifecycleManager {
     - recordSplitAnnouncement(child0, child1)
 }
 
+ShardManager --> replicationManager : delegates replication
 ShardManager --> lifecycleManager : delegates split/merge/discovery
 
 note right of ShardManager
   PubSub topics: dlockss-creative-commons-shard-<id>.
   One subscription per shard (refCount); observer mode for probing.
-  Lifecycle decisions (split/merge/discovery) delegated to lifecycleManager.
-  Replication code split into shard_msg_handlers.go, shard_reshard.go,
-  shard_maintenance.go.
+  Lifecycle decisions delegated to lifecycleManager.
+  Replication requests delegated to replicationManager.
+  Code organized into focused files:
+  shard_loops.go (message dispatch, heartbeat, pin reannounce),
+  shard_msg_handlers.go, shard_reshard.go,
+  shard_orphan.go, shard_legacy.go, shard_replication.go.
   Stability (v0.0.3):
   - ShardMoveCooldown (30s) after ANY transition
   - PROBE response rate limiting (5s)
@@ -250,4 +272,50 @@ class KnownFiles {
 RateLimiter --> peerRateLimit : contains
 BackoffTable --> operationBackoff : contains
 
+class Config {
+    + DiscoveryServiceTag: string
+    + PubsubTopicPrefix: string
+    + TopicName: string
+    + FileWatchFolder: string
+    + NodeName: string
+    + IPFSNodeAddress: string
+    + ...
+    --
+    + Sharding: ShardingConfig
+    + Replication: ReplicationConfig
+    + Files: FileConfig
+    + Security: SecurityConfig
+    + Orphan: OrphanConfig
+    --
+    + DefaultConfig(): *Config
+    + LoadFromEnv(): *Config
+    + Validate(): error
+    + Log()
+}
+
+class ShardingConfig {
+    + MaxPeersPerShard: int
+    + MinPeersPerShard: int
+    + MinPeersAcrossSiblings: int
+    + ShardMoveCooldown: time.Duration
+    + ShardOverlapDuration: time.Duration
+    + ShardPeerCheckInterval: time.Duration
+    + ShardDiscoveryInterval: time.Duration
+    + ShardSplitRebroadcastInterval: time.Duration
+    + ...
+}
+
+class ReplicationConfig {
+    + MinReplication: int
+    + MaxReplication: int
+    + CheckInterval: time.Duration
+    + AutoReplicationEnabled: bool
+    + AutoReplicationTimeout: time.Duration
+    + MaxConcurrentReplicationChecks: int
+    + ...
+}
+
+Config --> ShardingConfig
+Config --> ReplicationConfig
+
 @enduml
diff --git a/docs/file_ingestion_sequence.puml b/docs/file_ingestion_sequence.puml
index bf0c01c..438b2d7 100644
--- a/docs/file_ingestion_sequence.puml
+++ b/docs/file_ingestion_sequence.puml
@@ -14,8 +14,9 @@ participant "PubSub" as PubSub
 participant "Other Nodes" as Nodes
 
 User -> FS: Drop file into ./data
-FS -> Watcher: File created event
-Watcher -> FileOps: processNewFile(path)
+FS -> Watcher: fsnotify.Create event
+Watcher -> Watcher: handleWatcherEvent(event)\n(if directory: handleNewDirectory)
+Watcher -> FileOps: enqueue file via processNewFile(path)
 
 FileOps -> IPFS: Import file (UnixFS)\nPayloadCID
 FileOps -> FileOps: Build ResearchObject (CBOR)\n{meta_ref, ingester_id, payload, size}
diff --git a/docs/file_lifecycle_state.puml b/docs/file_lifecycle_state.puml
index 688ce97..2406d47 100644
--- a/docs/file_lifecycle_state.puml
+++ b/docs/file_lifecycle_state.puml
@@ -11,8 +11,8 @@ Custodial --> InTargetCluster: CRDT syncs; responsible nodes\nLocalPinTracker pi
 
 Responsible --> InCluster: Pin in CRDT; LocalPinTracker\nsyncs to self and peers (allocations)
 
-InCluster --> UnderReplicated: runReplicationChecker:\nallocations < MinReplication
-UnderReplicated --> InCluster: ReplicationRequest -> peers\nPin(targetShard), TriggerSync
+InCluster --> UnderReplicated: repl.runChecker():\nunder target replication
+UnderReplicated --> InCluster: ReplicationRequest -> peers\nauto-replication: PinRecursive, ClusterPinIfAbsent, TriggerSync
 
 InCluster --> Removed: RunReshardPass: we split/moved\nfile belongs to other shard
 Removed --> [*]: Unpin from cluster\nUnpinRecursive, UnpinFile
diff --git a/docs/message_flow.puml b/docs/message_flow.puml
index ae29b69..5d0fb72 100644
--- a/docs/message_flow.puml
+++ b/docs/message_flow.puml
@@ -26,21 +26,19 @@ GS -> C: Forward message
 B -> B: Add to knownFiles\nIf AmIResponsibleFor(PayloadCID): Pin to cluster, AnnouncePinned
 C -> C: Add to knownFiles\nIf responsible: Pin to cluster, AnnouncePinned
 
-== Replication Request ==
+== Replication Request (handled by replicationManager) ==
 A -> GS: ReplicationRequest(CBOR)\n{manifest_cid, priority, deadline, sender_id, ts, nonce, sig} (shard topic)
 GS -> B: Forward message
 GS -> C: Forward message
-B -> B: targetShard = targetShardForManifest(manifest, topicShard)
-B -> B: If we have file: JoinShard(targetShard), EnsureClusterForShard,\nPin(targetShard), TriggerSync(targetShard)
-alt B has file
-    B -> CRDT: Pin(targetShard) via ClusterManager
-    B -> B: TriggerSync(targetShard)
-else B does not have file
-    B -> IPFS: Fetch PinRecursive(manifestCID)
-    B -> CRDT: Pin(targetShard)
-    B -> B: TriggerSync(targetShard)
+B -> B: repl.handleRequest(msg, rr, shardID)\nverify signature, check legacy manifest
+alt B has file (isPinned)
+    B -> B: EnsureClusterForShard, TriggerSync(shardID)
+else B does not have file (auto-replication)
+    B -> IPFS: PinRecursive(manifestCID) with timeout
+    B -> CRDT: ClusterPinIfAbsent(shardID, cid)
+    B -> B: TriggerSync(shardID)
 end
-C -> C: Same logic (target shard = file's shard by PayloadCID)
+C -> C: Same logic via replicationManager
 
 == Custodial (Tourist) ==
 note over A: Local ingest; we are NOT responsible.\nTarget shard = TargetShardForPayload(PayloadCID, depth)
diff --git a/docs/replication_check_sequence.puml b/docs/replication_check_sequence.puml
index de99d4c..fddeb4d 100644
--- a/docs/replication_check_sequence.puml
+++ b/docs/replication_check_sequence.puml
@@ -43,6 +43,6 @@ ClusterMgr -> CRDT: LogUnpin(pin)
 CRDT -> Peers: Sync State
 Peers -> PinTracker: State() on next sync
 PinTracker -> Storage: onPinRemoved(cid)
-Note over PinTracker: Releases tracking only.\nActual IPFS UnpinRecursive is called\nby the reshard/orphan unpin passes\n(shard_reshard.go, shard_maintenance.go).
+Note over PinTracker: Releases tracking only.\nActual IPFS UnpinRecursive is called\nby the reshard/orphan unpin passes\n(shard_reshard.go, shard_orphan.go).
 
 @enduml
diff --git a/docs/shard_split_sequence.puml b/docs/shard_split_sequence.puml
index 9532095..4835962 100644
--- a/docs/shard_split_sequence.puml
+++ b/docs/shard_split_sequence.puml
@@ -10,7 +10,7 @@ participant "IPFS" as IPFS
 == Path 1: Split when current shard is overcrowded ==
 
 ShardMgr -> ShardMgr: runPeerCountChecker()\n(ticker: rootPeerCheckInterval at root, ShardPeerCheckInterval otherwise)
-ShardMgr -> ShardMgr: getShardPeerCountForSplit()\n(ACTIVE peers only from HEARTBEAT/JOIN; no mesh fallback)
+ShardMgr -> ShardMgr: getShardPeerCount(useMeshFallback=false)\n(ACTIVE peers only from HEARTBEAT/JOIN; no mesh fallback)
 
 alt peerCount >= MaxPeersPerShard (2 consecutive checks)\nAND estimatedPerChild >= MinPeersPerShard\nAND (child has >=1 peer OR parent >= 14 to create)\nAND time.Since(lastShardMove) >= ShardMoveCooldown (30s)
     ShardMgr -> ShardMgr: announceSplit(parentShard, targetChild)\nPublish SPLIT:child0:child1 on parent topic
diff --git a/docs/system_architecture.puml b/docs/system_architecture.puml
index 76d39fb..f179bc9 100644
--- a/docs/system_architecture.puml
+++ b/docs/system_architecture.puml
@@ -5,6 +5,8 @@
 package "D-LOCKSS Node" {
     [Main] as main
     [ShardManager] as shardMgr
+    [replicationManager] as replMgr
+    [lifecycleManager] as lifecycleMgr
     [ClusterManager] as clusterMgr
     [FileProcessor] as fileOps
     [StorageManager] as storage
@@ -37,15 +39,12 @@ package "File System" {
 package "D-LOCKSS Monitor" {
     [Dashboard Server] as dashboard
     [Subscription Manager] as subMgr
-    [GeoIP Resolver] as geoIP
     [Replication Tracker] as repTracker
 }
 
 package "External" {
     [Other D-LOCKSS Nodes] as peers
     [IPFS Network] as ipfsNet
-    [ip-api.com API] as geoAPI
-    [Local GeoIP DB (.mmdb)] as geoDB
 }
 
 main --> shardMgr : manages
@@ -56,6 +55,8 @@ main --> rateLimit : uses
 main --> discovery : initializes
 main --> signingTrust : initializes
 
+shardMgr --> replMgr : delegates replication
+shardMgr --> lifecycleMgr : delegates split/merge/discovery
 shardMgr --> clusterMgr : manages clusters
 shardMgr --> pubsub : subscribes/publishes
 shardMgr --> host : uses
@@ -104,9 +105,6 @@ watcher --> dataDir : monitors
 subMgr --> pubsub : subscribes to all shard topics
 subMgr --> dashboard : feeds node/shard state
 subMgr --> repTracker : tracks manifest replication
-geoIP --> geoDB : local lookup (preferred)
-geoIP --> geoAPI : on-demand API fallback (if no local DB)
-geoIP --> dashboard : on-demand region (identify modal)
 dashboard --> repTracker : replication data
 
 @enduml
diff --git a/go.mod b/go.mod
index 1d4caf3..421925e 100644
--- a/go.mod
+++ b/go.mod
@@ -16,8 +16,7 @@ require (
 	github.com/libp2p/go-libp2p-pubsub v0.15.0
 	github.com/multiformats/go-multiaddr v0.16.1
 	github.com/multiformats/go-multihash v0.2.3
-	github.com/oschwald/geoip2-golang v1.13.0
-	github.com/prometheus/client_golang v1.23.2
+	github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58
 )
 
 require (
@@ -104,8 +103,6 @@ require (
 	github.com/multiformats/go-varint v0.1.0 // indirect
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/opentracing/opentracing-go v1.2.0 // indirect
-	github.com/oschwald/maxminddb-golang v1.13.0 // indirect
-	github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 // indirect
 	github.com/pion/datachannel v1.5.10 // indirect
 	github.com/pion/dtls/v2 v2.2.12 // indirect
 	github.com/pion/dtls/v3 v3.0.6 // indirect
@@ -127,6 +124,7 @@ require (
 	github.com/pion/webrtc/v4 v4.1.2 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/polydawn/refmt v0.89.0 // indirect
+	github.com/prometheus/client_golang v1.23.2 // indirect
 	github.com/prometheus/client_model v0.6.2 // indirect
 	github.com/prometheus/common v0.66.1 // indirect
 	github.com/prometheus/procfs v0.17.0 // indirect
diff --git a/go.sum b/go.sum
index e2c6abf..c861756 100644
--- a/go.sum
+++ b/go.sum
@@ -527,10 +527,6 @@ github.com/onsi/gomega v1.36.2/go.mod h1:DdwyADRjrc825LhMEkD76cHR5+pUnjhUN8GlHlR
 github.com/opentracing/opentracing-go v1.0.2/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o=
 github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs=
 github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc=
-github.com/oschwald/geoip2-golang v1.13.0 h1:Q44/Ldc703pasJeP5V9+aFSZFmBN7DKHbNsSFzQATJI=
-github.com/oschwald/geoip2-golang v1.13.0/go.mod h1:P9zG+54KPEFOliZ29i7SeYZ/GM6tfEL+rgSn03hYuUo=
-github.com/oschwald/maxminddb-golang v1.13.0 h1:R8xBorY71s84yO06NgTmQvqvTvlS/bnYZrrWX1MElnU=
-github.com/oschwald/maxminddb-golang v1.13.0/go.mod h1:BU0z8BfFVhi1LQaonTwwGQlsHUEu9pWNdMfmq4ztm0o=
 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 h1:onHthvaw9LFnH4t2DcNVpwGmV9E1BkGknEliJkfwQj0=
 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58/go.mod h1:DXv8WO4yhMYhSNPKjeNKa5WY9YCIEBRbNzFFPJbWO6Y=
 github.com/pion/datachannel v1.5.10 h1:ly0Q26K1i6ZkGf42W7D4hQYR90pZwzFOjTq5AuCKk4o=
diff --git a/internal/api/api.go b/internal/api/api.go
index 0aee89e..fc27142 100644
--- a/internal/api/api.go
+++ b/internal/api/api.go
@@ -2,42 +2,26 @@ package api
 
 import (
 	"context"
-	"encoding/json"
 	"fmt"
 	"log/slog"
 	"net/http"
-
-	"dlockss/internal/telemetry"
-
-	"github.com/prometheus/client_golang/prometheus/promhttp"
 )
 
 // APIServer manages the local observability API
 type APIServer struct {
-	server  *http.Server
-	metrics *telemetry.MetricsManager
+	server *http.Server
 }
 
-func NewAPIServer(port int, metrics *telemetry.MetricsManager) *APIServer {
-	s := &APIServer{
-		metrics: metrics,
-	}
+func NewAPIServer(port int) *APIServer {
+	s := &APIServer{}
 
 	mux := http.NewServeMux()
 
-	// Prometheus metrics endpoint
-	mux.Handle("/metrics", promhttp.Handler())
-
-	// Status endpoint
-	mux.HandleFunc("/status", s.handleStatus)
-
-	// Health check
 	mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
 		w.WriteHeader(http.StatusOK)
 		w.Write([]byte("OK"))
 	})
 
-	// Dashboard UI (simple HTML)
 	mux.HandleFunc("/", s.handleDashboard)
 
 	s.server = &http.Server{
@@ -61,18 +45,6 @@ func (s *APIServer) Shutdown(ctx context.Context) error {
 	return s.server.Shutdown(ctx)
 }
 
-func (s *APIServer) handleStatus(w http.ResponseWriter, r *http.Request) {
-	if s.metrics == nil {
-		http.Error(w, "Metrics not initialized", http.StatusInternalServerError)
-		return
-	}
-
-	status := s.metrics.GetStatus()
-
-	w.Header().Set("Content-Type", "application/json")
-	json.NewEncoder(w).Encode(status)
-}
-
 func (s *APIServer) handleDashboard(w http.ResponseWriter, r *http.Request) {
 	if r.URL.Path != "/" {
 		http.NotFound(w, r)
@@ -86,57 +58,17 @@ const dashboardHTML = `<!DOCTYPE html>
 <html>
 <head>
   <meta charset="utf-8">
-  <title>D-LOCKSS Monitor</title>
+  <title>D-LOCKSS Node</title>
   <style>
     body { font-family: system-ui, sans-serif; max-width: 720px; margin: 2rem auto; padding: 0 1rem; }
     h1 { color: #333; }
-    .card { background: #f5f5f5; border-radius: 8px; padding: 1rem; margin: 1rem 0; }
-    .card h2 { margin-top: 0; font-size: 1rem; color: #666; }
-    table { width: 100%; border-collapse: collapse; }
-    th, td { text-align: left; padding: 0.25rem 0.5rem 0.25rem 0; }
-    th { color: #666; font-weight: 500; }
     a { color: #0066cc; }
-    #status { color: #666; font-size: 0.9rem; }
-    .links { margin-top: 1.5rem; }
   </style>
 </head>
 <body>
-  <h1>D-LOCKSS Monitor</h1>
-  <p id="status">Loading status…</p>
-  <div class="card">
-    <h2>Storage</h2>
-    <table><tbody>
-      <tr><th>Pinned files</th><td id="pinned">–</td></tr>
-      <tr><th>Known files</th><td id="known">–</td></tr>
-    </tbody></table>
-  </div>
-  <div class="card">
-    <h2>Shard &amp; replication</h2>
-    <table><tbody>
-      <tr><th>Current shard</th><td id="shard">–</td></tr>
-      <tr><th>Peers in shard</th><td id="peers">–</td></tr>
-      <tr><th>Queue depth</th><td id="queue">–</td></tr>
-      <tr><th>Active workers</th><td id="workers">–</td></tr>
-      <tr><th>Uptime</th><td id="uptime">–</td></tr>
-    </tbody></table>
-  </div>
-  <div class="links">
-    <a href="/status">JSON status</a> · <a href="/metrics">Prometheus metrics</a> · <a href="/health">Health</a>
-  </div>
-  <script>
-    fetch('/status').then(r => r.json()).then(d => {
-      document.getElementById('pinned').textContent = d.storage?.pinned_files ?? '–';
-      document.getElementById('known').textContent = d.storage?.known_files ?? '–';
-      document.getElementById('shard').textContent = d.current_shard || '(none)';
-      document.getElementById('peers').textContent = d.peers_in_shard ?? '–';
-      document.getElementById('queue').textContent = d.replication?.queue_depth ?? '–';
-      document.getElementById('workers').textContent = d.replication?.active_workers ?? '–';
-      var u = d.uptime_seconds; document.getElementById('uptime').textContent = u != null ? (Math.floor(u/60) + 'm ' + Math.floor(u%60) + 's') : '–';
-      document.getElementById('status').textContent = 'Live data from this server. If this is dlockss-monitor only (no node), values are zero until you scrape a node or use the node’s API.';
-    }).catch(function() {
-      document.getElementById('status').textContent = 'Could not load /status.';
-    });
-  </script>
+  <h1>D-LOCKSS Node</h1>
+  <p>This node is running. Use the <strong>D-LOCKSS Monitor</strong> for network-wide observability.</p>
+  <p><a href="/health">Health check</a></p>
 </body>
 </html>
 `
diff --git a/internal/common/types.go b/internal/common/types.go
index 880eb66..43d77a2 100644
--- a/internal/common/types.go
+++ b/internal/common/types.go
@@ -2,7 +2,6 @@ package common
 
 import (
 	"context"
-	"crypto/rand"
 	"time"
 
 	"github.com/ipfs/go-cid"
@@ -11,15 +10,8 @@ import (
 	"dlockss/internal/syncmap"
 )
 
-func NewNonce(n int) ([]byte, error) {
-	b := make([]byte, n)
-	_, err := rand.Read(b)
-	return b, err
-}
-
 // DHTProvider abstracts the DHT operations for testing.
 type DHTProvider interface {
-	FindProvidersAsync(ctx context.Context, key cid.Cid, count int) <-chan peer.AddrInfo
 	Provide(ctx context.Context, key cid.Cid, broadcast bool) error
 	FindPeer(ctx context.Context, id peer.ID) (peer.AddrInfo, error)
 }
@@ -35,7 +27,6 @@ func NewPinnedSet() *PinnedSet {
 
 // Add pins a key (always refreshes timestamp). Returns true if key was new.
 func (ps *PinnedSet) Add(key string) bool { return ps.m.Upsert(key, time.Now()) }
-func (ps *PinnedSet) Remove(key string)   { ps.m.Delete(key) }
 func (ps *PinnedSet) Has(key string) bool { return ps.m.Has(key) }
 func (ps *PinnedSet) Size() int           { return ps.m.Len() }
 func (ps *PinnedSet) Keys() []string      { return ps.m.Keys() }
@@ -59,9 +50,6 @@ func NewKnownFiles() *KnownFiles {
 	return &KnownFiles{m: syncmap.New[string, bool]()}
 }
 
-// Add returns true if the key was new.
-func (kf *KnownFiles) Add(key string) bool  { return kf.m.SetIfAbsent(key, true) }
-func (kf *KnownFiles) Remove(key string)    { kf.m.Delete(key) }
+func (kf *KnownFiles) Add(key string)       { kf.m.SetIfAbsent(key, true) }
 func (kf *KnownFiles) Has(key string) bool  { return kf.m.Has(key) }
-func (kf *KnownFiles) Size() int            { return kf.m.Len() }
 func (kf *KnownFiles) All() map[string]bool { return kf.m.Snapshot() }
diff --git a/internal/common/types_file_state.go b/internal/common/types_file_state.go
deleted file mode 100644
index 8f54b65..0000000
--- a/internal/common/types_file_state.go
+++ /dev/null
@@ -1,51 +0,0 @@
-package common
-
-import (
-	"time"
-
-	"dlockss/internal/syncmap"
-)
-
-// CheckingFiles tracks files currently being checked for replication.
-type CheckingFiles struct {
-	m *syncmap.Map[string, bool]
-}
-
-func NewCheckingFiles() *CheckingFiles {
-	return &CheckingFiles{m: syncmap.New[string, bool]()}
-}
-
-func (cf *CheckingFiles) TryLock(key string) bool { return cf.m.SetIfAbsent(key, true) }
-func (cf *CheckingFiles) Unlock(key string)       { cf.m.Delete(key) }
-func (cf *CheckingFiles) Size() int               { return cf.m.Len() }
-
-// LastCheckTime tracks when files were last checked for replication.
-type LastCheckTime = syncmap.Map[string, time.Time]
-
-func NewLastCheckTime() *LastCheckTime {
-	return syncmap.New[string, time.Time]()
-}
-
-// RecentlyRemoved tracks files that were recently removed (for cooldown).
-type RecentlyRemoved struct {
-	m     *syncmap.Map[string, time.Time]
-	count int
-}
-
-func NewRecentlyRemoved() *RecentlyRemoved {
-	return &RecentlyRemoved{m: syncmap.New[string, time.Time]()}
-}
-
-func (rr *RecentlyRemoved) WasRemoved(key string) (time.Time, bool) { return rr.m.Get(key) }
-func (rr *RecentlyRemoved) Remove(key string)                       { rr.m.Delete(key) }
-
-func (rr *RecentlyRemoved) Record(key string) {
-	rr.m.Set(key, time.Now())
-	rr.count++
-	const pruneEveryN = 64
-	const recentlyRemovedTTL = 10 * time.Minute
-	if rr.count%pruneEveryN == 0 {
-		cutoff := time.Now().Add(-recentlyRemovedTTL)
-		rr.m.Prune(func(_ string, t time.Time) bool { return t.Before(cutoff) })
-	}
-}
diff --git a/internal/common/types_replication.go b/internal/common/types_replication.go
deleted file mode 100644
index 1019dcb..0000000
--- a/internal/common/types_replication.go
+++ /dev/null
@@ -1,10 +0,0 @@
-package common
-
-import "dlockss/internal/syncmap"
-
-// FileReplicationLevels tracks replication counts for files.
-type FileReplicationLevels = syncmap.Map[string, int]
-
-func NewFileReplicationLevels() *FileReplicationLevels {
-	return syncmap.New[string, int]()
-}
diff --git a/internal/common/types_status.go b/internal/common/types_status.go
deleted file mode 100644
index c690c91..0000000
--- a/internal/common/types_status.go
+++ /dev/null
@@ -1,37 +0,0 @@
-package common
-
-// StorageSnapshot is a point-in-time view of storage state, used by internal
-// metrics and telemetry (not the public API).
-type StorageSnapshot struct {
-	PinnedCount  int
-	KnownCount   int
-	KnownCIDs    []string
-	BackoffCount int
-}
-
-// StatusResponse defines the JSON structure for /status and monitor node views.
-type StatusResponse struct {
-	PeerID        string            `json:"peer_id"`
-	Version       string            `json:"version"`
-	CurrentShard  string            `json:"current_shard"`
-	Role          string            `json:"role,omitempty"`
-	PeersInShard  int               `json:"peers_in_shard"`
-	Storage       StorageStatus     `json:"storage"`
-	Replication   ReplicationStatus `json:"replication"`
-	UptimeSeconds float64           `json:"uptime_seconds"`
-}
-
-type StorageStatus struct {
-	PinnedFiles   int      `json:"pinned_files"`
-	PinnedInShard int      `json:"pinned_in_shard,omitempty"`
-	KnownFiles    int      `json:"known_files"`
-	KnownCIDs     []string `json:"known_cids,omitempty"`
-}
-
-type ReplicationStatus struct {
-	QueueDepth              int     `json:"queue_depth"`
-	ActiveWorkers           int     `json:"active_workers"`
-	AvgReplicationLevel     float64 `json:"avg_replication_level"`
-	FilesAtTarget           int     `json:"files_at_target"`
-	ReplicationDistribution [11]int `json:"replication_distribution"`
-}
diff --git a/internal/common/types_trust.go b/internal/common/types_trust.go
index a3f861e..36709ac 100644
--- a/internal/common/types_trust.go
+++ b/internal/common/types_trust.go
@@ -18,8 +18,6 @@ func NewTrustedPeers() *TrustedPeers {
 	return &TrustedPeers{m: syncmap.New[peer.ID, bool]()}
 }
 
-func (tp *TrustedPeers) Add(pid peer.ID)               { tp.m.Set(pid, true) }
-func (tp *TrustedPeers) Remove(pid peer.ID)            { tp.m.Delete(pid) }
 func (tp *TrustedPeers) Has(pid peer.ID) bool          { return tp.m.Has(pid) }
 func (tp *TrustedPeers) SetAll(peers map[peer.ID]bool) { tp.m.ReplaceAll(peers) }
 func (tp *TrustedPeers) All() []peer.ID                { return tp.m.Keys() }
@@ -40,7 +38,7 @@ func NewRateLimiter(window time.Duration, maxMessages int) *RateLimiter {
 	}
 }
 
-func (rl *RateLimiter) GetOrCreate(peerID peer.ID) *peerRateLimit {
+func (rl *RateLimiter) getOrCreate(peerID peer.ID) *peerRateLimit {
 	rl.mu.Lock()
 	defer rl.mu.Unlock()
 
@@ -54,44 +52,8 @@ func (rl *RateLimiter) GetOrCreate(peerID peer.ID) *peerRateLimit {
 	return prl
 }
 
-func (rl *RateLimiter) Remove(peerID peer.ID) {
-	rl.mu.Lock()
-	defer rl.mu.Unlock()
-	delete(rl.peers, peerID)
-}
-
-func (rl *RateLimiter) Size() int {
-	rl.mu.RLock()
-	defer rl.mu.RUnlock()
-	return len(rl.peers)
-}
-
-func (rl *RateLimiter) Cleanup(cutoff time.Time) int {
-	rl.mu.Lock()
-	defer rl.mu.Unlock()
-
-	removed := 0
-	for peerID, peerLimit := range rl.peers {
-		peerLimit.mu.Lock()
-		hasRecent := false
-		for _, msgTime := range peerLimit.messages {
-			if msgTime.After(cutoff) {
-				hasRecent = true
-				break
-			}
-		}
-		peerLimit.mu.Unlock()
-
-		if !hasRecent {
-			delete(rl.peers, peerID)
-			removed++
-		}
-	}
-	return removed
-}
-
 func (rl *RateLimiter) Check(peerID peer.ID) bool {
-	prl := rl.GetOrCreate(peerID)
+	prl := rl.getOrCreate(peerID)
 
 	prl.mu.Lock()
 	defer prl.mu.Unlock()
diff --git a/internal/common/utils.go b/internal/common/utils.go
index 38ab318..f802c73 100644
--- a/internal/common/utils.go
+++ b/internal/common/utils.go
@@ -13,14 +13,6 @@ import (
 	"github.com/ipfs/go-cid"
 )
 
-func ValidateHash(hash string) bool {
-	if len(hash) != 64 {
-		return false
-	}
-	_, err := hex.DecodeString(hash)
-	return err == nil
-}
-
 func GetBinaryPrefix(s string, depth int) string {
 	h := sha256.Sum256([]byte(s))
 	return bytesToBinaryString(h[:], depth)
@@ -34,10 +26,6 @@ func GetHexBinaryPrefix(hexStr string, depth int) (string, error) {
 	return bytesToBinaryString(b, depth), nil
 }
 
-func KeyToCID(key string) (cid.Cid, error) {
-	return cid.Decode(key)
-}
-
 func KeyToStableHex(key string) string {
 	sum := sha256.Sum256([]byte(key))
 	return hex.EncodeToString(sum[:])
diff --git a/internal/common/utils_test.go b/internal/common/utils_test.go
index 2a5aa00..fb0e2a6 100644
--- a/internal/common/utils_test.go
+++ b/internal/common/utils_test.go
@@ -17,63 +17,6 @@ func mustGetHexBinaryPrefix(hexStr string, depth int) string {
 	return result
 }
 
-func TestValidateHash(t *testing.T) {
-	tests := []struct {
-		name  string
-		hash  string
-		valid bool
-	}{
-		{
-			name:  "valid 64-char hex",
-			hash:  "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855",
-			valid: true,
-		},
-		{
-			name:  "valid all zeros",
-			hash:  "0000000000000000000000000000000000000000000000000000000000000000",
-			valid: true,
-		},
-		{
-			name:  "valid all f",
-			hash:  "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
-			valid: true,
-		},
-		{
-			name:  "too short",
-			hash:  "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b85",
-			valid: false,
-		},
-		{
-			name:  "too long",
-			hash:  "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b8550",
-			valid: false,
-		},
-		{
-			name:  "non-hex chars",
-			hash:  "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b85g",
-			valid: false,
-		},
-		{
-			name:  "uppercase non-hex",
-			hash:  "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b85G",
-			valid: false,
-		},
-		{
-			name:  "empty",
-			hash:  "",
-			valid: false,
-		},
-	}
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got := ValidateHash(tt.hash)
-			if got != tt.valid {
-				t.Errorf("ValidateHash(%q) = %v, want %v", tt.hash, got, tt.valid)
-			}
-		})
-	}
-}
-
 func TestGetBinaryPrefix(t *testing.T) {
 	// Known SHA256 outputs for verification at various depths.
 	// SHA256("") = e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855
@@ -162,7 +105,7 @@ func TestKeyToStableHex(t *testing.T) {
 	if len(got1) != 64 {
 		t.Errorf("KeyToStableHex(%q) len = %d, want 64", key, len(got1))
 	}
-	if !ValidateHash(got1) {
+	if _, err := hex.DecodeString(got1); err != nil {
 		t.Errorf("KeyToStableHex(%q) = %q is not valid hex", key, got1)
 	}
 	// Different keys produce different output
diff --git a/internal/config/config.go b/internal/config/config.go
index 71b9fc0..a945828 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -97,80 +97,91 @@ func nodeNamePath(dataDir string) string {
 // DefaultTopicName is the archive topic when none is configured.
 const DefaultTopicName = "creative-commons"
 
-// Config holds all runtime configuration for a D-LOCKSS node.
-type Config struct {
-	DiscoveryServiceTag            string
-	PubsubTopicPrefix              string
-	TopicName                      string
-	IngestAllowlist                []string
-	FileWatchFolder                string
-	ClusterStorePath               string
+// ShardingConfig holds parameters that govern shard splitting, merging, and peer tracking.
+type ShardingConfig struct {
+	MaxPeersPerShard              int           // default 12; a shard at or above this peer count is a split candidate
+	MinPeersPerShard              int           // default 6; a split requires each child to be estimated at this many peers or more
+	MinPeersAcrossSiblings        int           // default 10
+	ShardPeerCheckInterval        time.Duration // default 2m; peer-count check period on non-root shards
+	ShardDiscoveryInterval        time.Duration // default 2m
+	ShardSplitRebroadcastInterval time.Duration // default 60s
+	SeenPeersWindow               time.Duration // default 350s
+	PruneStalePeersInterval       time.Duration // default 10m
+	ShardOverlapDuration          time.Duration // default 2m
+	ShardMoveCooldown             time.Duration // default 30s; minimum time since the last shard move before a split
+	MergeUpCooldown               time.Duration // default 2m
+	ProbeTimeoutMerge             time.Duration // default 6s
+	SiblingEmptyMergeAfter        time.Duration // default 5m
+}
+
+// ReplicationConfig holds parameters for content replication across peers.
+type ReplicationConfig struct {
 	MinReplication                 int
 	MaxReplication                 int
 	CheckInterval                  time.Duration
-	MaxPeersPerShard               int
-	MinPeersPerShard               int
-	MinPeersAcrossSiblings         int
-	ShardPeerCheckInterval         time.Duration
-	ShardDiscoveryInterval         time.Duration
-	ShardSplitRebroadcastInterval  time.Duration
-	BootstrapTimeout               time.Duration
-	SeenPeersWindow                time.Duration
-	PruneStalePeersInterval        time.Duration
 	MaxConcurrentReplicationChecks int
-	RateLimitWindow                time.Duration
-	MaxMessagesPerWindow           int
-	InitialBackoffDelay            time.Duration
-	MaxBackoffDelay                time.Duration
-	BackoffMultiplier              float64
-	MetricsReportInterval          time.Duration
-	ReplicationCheckCooldown       time.Duration
-	RemovedFileCooldown            time.Duration
-	MetricsExportPath              string
-	BadBitsPath                    string
-	ShardOverlapDuration           time.Duration
-	OrphanUnpinGracePeriod         time.Duration
-	OrphanHandoffGrace             time.Duration
-	OrphanUnpinMinHandoffCount     int
-	ReplicationVerificationDelay   time.Duration
-	DiskUsageHighWaterMark         float64
-	IPFSNodeAddress                string
-	APIPort                        int
-	TrustMode                      string
-	TrustStorePath                 string
-	SignatureMode                  string
-	SignatureMaxAge                time.Duration
-	UsePubsubForReplication        bool
-	MinShardPeersForPubsubOnly     int
-	ReplicationCacheTTL            time.Duration
 	AutoReplicationEnabled         bool
 	AutoReplicationTimeout         time.Duration
-	CRDTOpTimeout                  time.Duration
-	FileImportTimeout              time.Duration
-	DHTProvideTimeout              time.Duration
-	MaxConcurrentDHTProvides       int
-	FileProcessingDelay            time.Duration
-	FileStabilityDelay             time.Duration
-	MaxConcurrentFileProcessing    int
-	DHTQueryTimeout                time.Duration
-	ReshardDelay                   time.Duration
-	ReshardHandoffDelay            time.Duration
 	PinReannounceInterval          time.Duration
-	NonceSize                      int
-	MinNonceSize                   int
-	FutureSkewTolerance            time.Duration
-	TelemetryInterval              time.Duration
-	TelemetryIncludeCIDs           bool
-	HeartbeatInterval              time.Duration
-	VerboseLogging                 bool
-	MergeUpCooldown                time.Duration
-	ProbeTimeoutMerge              time.Duration
-	SiblingEmptyMergeAfter         time.Duration
-	ShardMoveCooldown              time.Duration
-	NodeName                       string
-	IdentityPath                   string
-	NodeNamePath                   string
-	IPFSConfigPath                 string
+}
+
+// FileConfig holds parameters for file ingestion, DHT provide operations, and reshard delays.
+type FileConfig struct {
+	FileImportTimeout           time.Duration // default 2m
+	DHTProvideTimeout           time.Duration // default 60s
+	MaxConcurrentDHTProvides    int           // default 8
+	FileProcessingDelay         time.Duration // default 100ms
+	FileStabilityDelay          time.Duration // default 3s
+	MaxConcurrentFileProcessing int           // default 5
+	ReshardDelay                time.Duration // default 5s
+	ReshardHandoffDelay         time.Duration // default 3s
+}
+
+// SecurityConfig holds trust, signing, and nonce parameters.
+type SecurityConfig struct {
+	TrustMode           string        // default "open"
+	TrustStorePath      string        // default "trusted_peers.json"
+	SignatureMode       string        // default "strict"
+	SignatureMaxAge     time.Duration // default 10m
+	NonceSize           int           // default 16
+	MinNonceSize        int           // default 8
+	FutureSkewTolerance time.Duration // default 30s
+}
+
+// OrphanConfig holds parameters for orphan file detection and cleanup.
+type OrphanConfig struct {
+	UnpinGracePeriod   time.Duration // default 6m
+	HandoffGrace       time.Duration // default 6m
+	UnpinMinHandoffCnt int           // default 2; NOTE(review): presumably a handoff count required before unpinning — confirm against the orphan pass
+}
+
+// Config holds all runtime configuration for a D-LOCKSS node.
+type Config struct {
+	DiscoveryServiceTag    string        // default "dlockss-prod"
+	PubsubTopicPrefix      string        // default DefaultPubsubVersion
+	TopicName              string        // default DefaultTopicName
+	IngestAllowlist        []string      // not set by DefaultConfig (nil)
+	FileWatchFolder        string        // default "./data"
+	ClusterStorePath       string        // default "cluster_store" joined onto the data dir's parent
+	BadBitsPath            string        // default "badBits.csv"
+	IPFSNodeAddress        string        // default "/ip4/127.0.0.1/tcp/5001"
+	APIPort                int           // default 5050
+	BootstrapTimeout       time.Duration // default 15s
+	HeartbeatInterval      time.Duration // default 10s
+	VerboseLogging         bool          // not set by DefaultConfig (false)
+	RateLimitWindow        time.Duration // default 1m
+	MaxMessagesPerWindow   int           // default 100
+	DiskUsageHighWaterMark float64       // default 90.0
+	NodeName               string        // not set by DefaultConfig ("")
+	IdentityPath           string        // default "dlockss.key" joined onto the data dir's parent
+	NodeNamePath           string        // default "node_name" joined onto the data dir's parent
+	IPFSConfigPath         string        // not set by DefaultConfig ("")
+
+	Sharding    ShardingConfig    // shard split/merge and peer-tracking settings
+	Replication ReplicationConfig // content replication settings
+	Files       FileConfig        // file ingestion, DHT provide, and reshard delay settings
+	Security    SecurityConfig    // trust, signing, and nonce settings
+	Orphan      OrphanConfig      // orphan detection and cleanup settings
+}
 
 // DefaultConfig returns a Config with all hardcoded defaults (no env reads).
@@ -178,78 +189,70 @@ type Config struct {
 func DefaultConfig() *Config {
 	dataDir := "./data"
 	return &Config{
-		DiscoveryServiceTag:            "dlockss-prod",
-		PubsubTopicPrefix:              DefaultPubsubVersion,
-		TopicName:                      DefaultTopicName,
-		IngestAllowlist:                nil,
-		FileWatchFolder:                dataDir,
-		ClusterStorePath:               filepath.Join(filepath.Dir(dataDir), "cluster_store"),
-		MinReplication:                 5,
-		MaxReplication:                 10,
-		CheckInterval:                  1 * time.Minute,
-		MaxPeersPerShard:               12,
-		MinPeersPerShard:               6,
-		MinPeersAcrossSiblings:         10,
-		ShardPeerCheckInterval:         2 * time.Minute,
-		ShardDiscoveryInterval:         2 * time.Minute,
-		ShardSplitRebroadcastInterval:  60 * time.Second,
-		BootstrapTimeout:               15 * time.Second,
-		SeenPeersWindow:                350 * time.Second,
-		PruneStalePeersInterval:        10 * time.Minute,
-		MaxConcurrentReplicationChecks: 5,
-		RateLimitWindow:                1 * time.Minute,
-		MaxMessagesPerWindow:           100,
-		InitialBackoffDelay:            5 * time.Second,
-		MaxBackoffDelay:                5 * time.Minute,
-		BackoffMultiplier:              2.0,
-		MetricsReportInterval:          5 * time.Second,
-		ReplicationCheckCooldown:       1 * time.Minute,
-		RemovedFileCooldown:            2 * time.Minute,
-		MetricsExportPath:              "",
-		BadBitsPath:                    "badBits.csv",
-		ShardOverlapDuration:           2 * time.Minute,
-		OrphanUnpinGracePeriod:         6 * time.Minute,
-		OrphanHandoffGrace:             6 * time.Minute,
-		OrphanUnpinMinHandoffCount:     2,
-		ReplicationVerificationDelay:   2 * time.Minute,
-		DiskUsageHighWaterMark:         90.0,
-		IPFSNodeAddress:                "/ip4/127.0.0.1/tcp/5001",
-		APIPort:                        5050,
-		TrustMode:                      "open",
-		TrustStorePath:                 "trusted_peers.json",
-		SignatureMode:                  "strict",
-		SignatureMaxAge:                10 * time.Minute,
-		UsePubsubForReplication:        true,
-		MinShardPeersForPubsubOnly:     5,
-		ReplicationCacheTTL:            5 * time.Minute,
-		AutoReplicationEnabled:         true,
-		AutoReplicationTimeout:         5 * time.Minute,
-		CRDTOpTimeout:                  10 * time.Minute,
-		FileImportTimeout:              2 * time.Minute,
-		DHTProvideTimeout:              60 * time.Second,
-		MaxConcurrentDHTProvides:       8,
-		FileProcessingDelay:            100 * time.Millisecond,
-		FileStabilityDelay:             3 * time.Second,
-		MaxConcurrentFileProcessing:    5,
-		DHTQueryTimeout:                2 * time.Minute,
-		ReshardDelay:                   5 * time.Second,
-		ReshardHandoffDelay:            3 * time.Second,
-		PinReannounceInterval:          2 * time.Minute,
-		NonceSize:                      16,
-		MinNonceSize:                   8,
-		FutureSkewTolerance:            30 * time.Second,
-		TelemetryInterval:              30 * time.Second,
-		TelemetryIncludeCIDs:           false,
-		HeartbeatInterval:              10 * time.Second,
-		VerboseLogging:                 false,
-		MergeUpCooldown:                2 * time.Minute,
-		ProbeTimeoutMerge:              6 * time.Second,
-		SiblingEmptyMergeAfter:         5 * time.Minute,
-		ShardMoveCooldown:              30 * time.Second,
-		NodeName:                       "",
-		IdentityPath:                   filepath.Join(filepath.Dir(dataDir), "dlockss.key"),
-		NodeNamePath:                   filepath.Join(filepath.Dir(dataDir), "node_name"),
-		IPFSConfigPath:                 "",
+		DiscoveryServiceTag:    "dlockss-prod",
+		PubsubTopicPrefix:      DefaultPubsubVersion,
+		TopicName:              DefaultTopicName,
+		FileWatchFolder:        dataDir,
+		ClusterStorePath:       filepath.Join(filepath.Dir(dataDir), "cluster_store"),
+		BadBitsPath:            "badBits.csv",
+		IPFSNodeAddress:        "/ip4/127.0.0.1/tcp/5001",
+		APIPort:                5050,
+		BootstrapTimeout:       15 * time.Second,
+		HeartbeatInterval:      10 * time.Second,
+		RateLimitWindow:        1 * time.Minute,
+		MaxMessagesPerWindow:   100,
+		DiskUsageHighWaterMark: 90.0,
+		IdentityPath:           filepath.Join(filepath.Dir(dataDir), "dlockss.key"),
+		NodeNamePath:           filepath.Join(filepath.Dir(dataDir), "node_name"),
+
+		Sharding: ShardingConfig{
+			MaxPeersPerShard:              12,
+			MinPeersPerShard:              6,
+			MinPeersAcrossSiblings:        10,
+			ShardPeerCheckInterval:        2 * time.Minute,
+			ShardDiscoveryInterval:        2 * time.Minute,
+			ShardSplitRebroadcastInterval: 60 * time.Second,
+			SeenPeersWindow:               350 * time.Second,
+			PruneStalePeersInterval:       10 * time.Minute,
+			ShardOverlapDuration:          2 * time.Minute,
+			ShardMoveCooldown:             30 * time.Second,
+			MergeUpCooldown:               2 * time.Minute,
+			ProbeTimeoutMerge:             6 * time.Second,
+			SiblingEmptyMergeAfter:        5 * time.Minute,
+		},
+		Replication: ReplicationConfig{
+			MinReplication:                 5,
+			MaxReplication:                 10,
+			CheckInterval:                  1 * time.Minute,
+			MaxConcurrentReplicationChecks: 5,
+			AutoReplicationEnabled:         true,
+			AutoReplicationTimeout:         5 * time.Minute,
+			PinReannounceInterval:          2 * time.Minute,
+		},
+		Files: FileConfig{
+			FileImportTimeout:           2 * time.Minute,
+			DHTProvideTimeout:           60 * time.Second,
+			MaxConcurrentDHTProvides:    8,
+			FileProcessingDelay:         100 * time.Millisecond,
+			FileStabilityDelay:          3 * time.Second,
+			MaxConcurrentFileProcessing: 5,
+			ReshardDelay:                5 * time.Second,
+			ReshardHandoffDelay:         3 * time.Second,
+		},
+		Security: SecurityConfig{
+			TrustMode:           "open",
+			TrustStorePath:      "trusted_peers.json",
+			SignatureMode:       "strict",
+			SignatureMaxAge:     10 * time.Minute,
+			NonceSize:           16,
+			MinNonceSize:        8,
+			FutureSkewTolerance: 30 * time.Second,
+		},
+		Orphan: OrphanConfig{
+			UnpinGracePeriod:   6 * time.Minute,
+			HandoffGrace:       6 * time.Minute,
+			UnpinMinHandoffCnt: 2,
+		},
 	}
 }
 
@@ -257,107 +260,105 @@ func DefaultConfig() *Config {
 // to hardcoded defaults for any variable that is not set.
 func LoadFromEnv() *Config {
 	dataDir := getEnvString("DLOCKSS_DATA_DIR", "./data")
-	return &Config{
-		DiscoveryServiceTag:            getEnvString("DLOCKSS_DISCOVERY_TAG", "dlockss-prod"),
-		PubsubTopicPrefix:              getEnvString("DLOCKSS_PUBSUB_TOPIC_PREFIX", DefaultPubsubVersion),
-		TopicName:                      getEnvString("DLOCKSS_TOPIC_NAME", DefaultTopicName),
-		IngestAllowlist:                getEnvStringSlice("DLOCKSS_INGEST_ALLOWLIST"),
-		FileWatchFolder:                dataDir,
-		ClusterStorePath:               clusterStorePath(dataDir),
-		MinReplication:                 getEnvInt("DLOCKSS_MIN_REPLICATION", 5),
-		MaxReplication:                 getEnvInt("DLOCKSS_MAX_REPLICATION", 10),
-		CheckInterval:                  getEnvDuration("DLOCKSS_CHECK_INTERVAL", 1*time.Minute),
-		MaxPeersPerShard:               getEnvInt("DLOCKSS_MAX_PEERS_PER_SHARD", 12),
-		MinPeersPerShard:               getEnvInt("DLOCKSS_MIN_PEERS_PER_SHARD", 6),
-		MinPeersAcrossSiblings:         getEnvInt("DLOCKSS_MIN_PEERS_ACROSS_SIBLINGS", 10),
-		ShardPeerCheckInterval:         getEnvDuration("DLOCKSS_SHARD_PEER_CHECK_INTERVAL", 2*time.Minute),
-		ShardDiscoveryInterval:         getEnvDuration("DLOCKSS_SHARD_DISCOVERY_INTERVAL", 2*time.Minute),
-		ShardSplitRebroadcastInterval:  getEnvDuration("DLOCKSS_SHARD_SPLIT_REBROADCAST_INTERVAL", 60*time.Second),
-		BootstrapTimeout:               getEnvDuration("DLOCKSS_BOOTSTRAP_TIMEOUT", 15*time.Second),
-		SeenPeersWindow:                getEnvDuration("DLOCKSS_SEEN_PEERS_WINDOW", 350*time.Second),
-		PruneStalePeersInterval:        getEnvDuration("DLOCKSS_PRUNE_STALE_PEERS_INTERVAL", 10*time.Minute),
-		MaxConcurrentReplicationChecks: getEnvInt("DLOCKSS_MAX_CONCURRENT_CHECKS", 5),
-		RateLimitWindow:                getEnvDuration("DLOCKSS_RATE_LIMIT_WINDOW", 1*time.Minute),
-		MaxMessagesPerWindow:           getEnvInt("DLOCKSS_MAX_MESSAGES_PER_WINDOW", 100),
-		InitialBackoffDelay:            getEnvDuration("DLOCKSS_INITIAL_BACKOFF", 5*time.Second),
-		MaxBackoffDelay:                getEnvDuration("DLOCKSS_MAX_BACKOFF", 5*time.Minute),
-		BackoffMultiplier:              getEnvFloat("DLOCKSS_BACKOFF_MULTIPLIER", 2.0),
-		MetricsReportInterval:          getEnvDuration("DLOCKSS_METRICS_INTERVAL", 5*time.Second),
-		ReplicationCheckCooldown:       getEnvDuration("DLOCKSS_REPLICATION_COOLDOWN", 1*time.Minute),
-		RemovedFileCooldown:            getEnvDuration("DLOCKSS_REMOVED_COOLDOWN", 2*time.Minute),
-		MetricsExportPath:              getEnvString("DLOCKSS_METRICS_EXPORT", ""),
-		BadBitsPath:                    getEnvString("DLOCKSS_BADBITS_PATH", "badBits.csv"),
-		ShardOverlapDuration:           getEnvDuration("DLOCKSS_SHARD_OVERLAP_DURATION", 2*time.Minute),
-		OrphanUnpinGracePeriod:         getEnvDuration("DLOCKSS_ORPHAN_UNPIN_GRACE", 6*time.Minute),
-		OrphanHandoffGrace:             getEnvDuration("DLOCKSS_ORPHAN_HANDOFF_GRACE", 6*time.Minute),
-		OrphanUnpinMinHandoffCount:     getEnvInt("DLOCKSS_ORPHAN_MIN_HANDOFF_COUNT", 2),
-		ReplicationVerificationDelay:   getEnvDuration("DLOCKSS_REPLICATION_VERIFICATION_DELAY", 2*time.Minute),
-		DiskUsageHighWaterMark:         getEnvFloat("DLOCKSS_DISK_USAGE_HIGH_WATER_MARK", 90.0),
-		IPFSNodeAddress:                getEnvString("DLOCKSS_IPFS_NODE", "/ip4/127.0.0.1/tcp/5001"),
-		APIPort:                        getEnvInt("DLOCKSS_API_PORT", 5050),
-		TrustMode:                      getEnvString("DLOCKSS_TRUST_MODE", "open"),
-		TrustStorePath:                 getEnvString("DLOCKSS_TRUST_STORE", "trusted_peers.json"),
-		SignatureMode:                  getEnvString("DLOCKSS_SIGNATURE_MODE", "strict"),
-		SignatureMaxAge:                getEnvDuration("DLOCKSS_SIGNATURE_MAX_AGE", 10*time.Minute),
-		UsePubsubForReplication:        getEnvBool("DLOCKSS_USE_PUBSUB_FOR_REPLICATION", true),
-		MinShardPeersForPubsubOnly:     getEnvInt("DLOCKSS_MIN_SHARD_PEERS_PUBSUB_ONLY", 5),
-		ReplicationCacheTTL:            getEnvDuration("DLOCKSS_REPLICATION_CACHE_TTL", 5*time.Minute),
-		AutoReplicationEnabled:         getEnvBool("DLOCKSS_AUTO_REPLICATION_ENABLED", true),
-		AutoReplicationTimeout:         getEnvDuration("DLOCKSS_AUTO_REPLICATION_TIMEOUT", 5*time.Minute),
-		CRDTOpTimeout:                  getEnvDuration("DLOCKSS_CRDT_OP_TIMEOUT", 10*time.Minute),
-		FileImportTimeout:              getEnvDuration("DLOCKSS_FILE_IMPORT_TIMEOUT", 2*time.Minute),
-		DHTProvideTimeout:              getEnvDuration("DLOCKSS_DHT_PROVIDE_TIMEOUT", 60*time.Second),
-		MaxConcurrentDHTProvides:       getEnvInt("DLOCKSS_MAX_CONCURRENT_DHT_PROVIDES", 8),
-		FileProcessingDelay:            getEnvDuration("DLOCKSS_FILE_PROCESSING_DELAY", 100*time.Millisecond),
-		FileStabilityDelay:             getEnvDuration("DLOCKSS_FILE_STABILITY_DELAY", 3*time.Second),
-		MaxConcurrentFileProcessing:    getEnvInt("DLOCKSS_MAX_CONCURRENT_FILE_PROCESSING", 5),
-		DHTQueryTimeout:                getEnvDuration("DLOCKSS_DHT_QUERY_TIMEOUT", 2*time.Minute),
-		ReshardDelay:                   getEnvDuration("DLOCKSS_RESHARD_DELAY", 5*time.Second),
-		ReshardHandoffDelay:            getEnvDuration("DLOCKSS_RESHARD_HANDOFF_DELAY", 3*time.Second),
-		PinReannounceInterval:          getEnvDuration("DLOCKSS_PIN_REANNOUNCE_INTERVAL", 2*time.Minute),
-		NonceSize:                      getEnvInt("DLOCKSS_NONCE_SIZE", 16),
-		MinNonceSize:                   getEnvInt("DLOCKSS_MIN_NONCE_SIZE", 8),
-		FutureSkewTolerance:            getEnvDuration("DLOCKSS_FUTURE_SKEW_TOLERANCE", 30*time.Second),
-		TelemetryInterval:              getEnvDuration("DLOCKSS_TELEMETRY_INTERVAL", 30*time.Second),
-		TelemetryIncludeCIDs:           getEnvBool("DLOCKSS_TELEMETRY_INCLUDE_CIDS", false),
-		HeartbeatInterval:              getEnvDuration("DLOCKSS_HEARTBEAT_INTERVAL", 10*time.Second),
-		VerboseLogging:                 getEnvBool("DLOCKSS_VERBOSE_LOGGING", false),
-		MergeUpCooldown:                getEnvDuration("DLOCKSS_MERGE_UP_COOLDOWN", 2*time.Minute),
-		ProbeTimeoutMerge:              getEnvDuration("DLOCKSS_PROBE_TIMEOUT_MERGE", 6*time.Second),
-		SiblingEmptyMergeAfter:         getEnvDuration("DLOCKSS_SIBLING_EMPTY_MERGE_AFTER", 5*time.Minute),
-		ShardMoveCooldown:              getEnvDuration("DLOCKSS_SHARD_MOVE_COOLDOWN", 30*time.Second),
-		NodeName:                       getEnvString("DLOCKSS_NODE_NAME", ""),
-		IdentityPath:                   identityPath(dataDir),
-		NodeNamePath:                   nodeNamePath(dataDir),
-		IPFSConfigPath:                 getEnvString("DLOCKSS_IPFS_CONFIG", ""),
-	}
+
+	cfg := DefaultConfig()
+
+	cfg.DiscoveryServiceTag = getEnvString("DLOCKSS_DISCOVERY_TAG", cfg.DiscoveryServiceTag)
+	cfg.PubsubTopicPrefix = getEnvString("DLOCKSS_PUBSUB_TOPIC_PREFIX", cfg.PubsubTopicPrefix)
+	cfg.TopicName = getEnvString("DLOCKSS_TOPIC_NAME", cfg.TopicName)
+	cfg.IngestAllowlist = getEnvStringSlice("DLOCKSS_INGEST_ALLOWLIST")
+	cfg.FileWatchFolder = dataDir
+	cfg.ClusterStorePath = clusterStorePath(dataDir)
+	cfg.BadBitsPath = getEnvString("DLOCKSS_BADBITS_PATH", cfg.BadBitsPath)
+	cfg.IPFSNodeAddress = getEnvString("DLOCKSS_IPFS_NODE", cfg.IPFSNodeAddress)
+	cfg.APIPort = getEnvInt("DLOCKSS_API_PORT", cfg.APIPort)
+	cfg.BootstrapTimeout = getEnvDuration("DLOCKSS_BOOTSTRAP_TIMEOUT", cfg.BootstrapTimeout)
+	cfg.HeartbeatInterval = getEnvDuration("DLOCKSS_HEARTBEAT_INTERVAL", cfg.HeartbeatInterval)
+	cfg.VerboseLogging = getEnvBool("DLOCKSS_VERBOSE_LOGGING", cfg.VerboseLogging)
+	cfg.RateLimitWindow = getEnvDuration("DLOCKSS_RATE_LIMIT_WINDOW", cfg.RateLimitWindow)
+	cfg.MaxMessagesPerWindow = getEnvInt("DLOCKSS_MAX_MESSAGES_PER_WINDOW", cfg.MaxMessagesPerWindow)
+	cfg.DiskUsageHighWaterMark = getEnvFloat("DLOCKSS_DISK_USAGE_HIGH_WATER_MARK", cfg.DiskUsageHighWaterMark)
+	cfg.NodeName = getEnvString("DLOCKSS_NODE_NAME", cfg.NodeName)
+	cfg.IdentityPath = identityPath(dataDir)
+	cfg.NodeNamePath = nodeNamePath(dataDir)
+	cfg.IPFSConfigPath = getEnvString("DLOCKSS_IPFS_CONFIG", cfg.IPFSConfigPath)
+
+	// Sharding
+	cfg.Sharding.MaxPeersPerShard = getEnvInt("DLOCKSS_MAX_PEERS_PER_SHARD", cfg.Sharding.MaxPeersPerShard)
+	cfg.Sharding.MinPeersPerShard = getEnvInt("DLOCKSS_MIN_PEERS_PER_SHARD", cfg.Sharding.MinPeersPerShard)
+	cfg.Sharding.MinPeersAcrossSiblings = getEnvInt("DLOCKSS_MIN_PEERS_ACROSS_SIBLINGS", cfg.Sharding.MinPeersAcrossSiblings)
+	cfg.Sharding.ShardPeerCheckInterval = getEnvDuration("DLOCKSS_SHARD_PEER_CHECK_INTERVAL", cfg.Sharding.ShardPeerCheckInterval)
+	cfg.Sharding.ShardDiscoveryInterval = getEnvDuration("DLOCKSS_SHARD_DISCOVERY_INTERVAL", cfg.Sharding.ShardDiscoveryInterval)
+	cfg.Sharding.ShardSplitRebroadcastInterval = getEnvDuration("DLOCKSS_SHARD_SPLIT_REBROADCAST_INTERVAL", cfg.Sharding.ShardSplitRebroadcastInterval)
+	cfg.Sharding.SeenPeersWindow = getEnvDuration("DLOCKSS_SEEN_PEERS_WINDOW", cfg.Sharding.SeenPeersWindow)
+	cfg.Sharding.PruneStalePeersInterval = getEnvDuration("DLOCKSS_PRUNE_STALE_PEERS_INTERVAL", cfg.Sharding.PruneStalePeersInterval)
+	cfg.Sharding.ShardOverlapDuration = getEnvDuration("DLOCKSS_SHARD_OVERLAP_DURATION", cfg.Sharding.ShardOverlapDuration)
+	cfg.Sharding.ShardMoveCooldown = getEnvDuration("DLOCKSS_SHARD_MOVE_COOLDOWN", cfg.Sharding.ShardMoveCooldown)
+	cfg.Sharding.MergeUpCooldown = getEnvDuration("DLOCKSS_MERGE_UP_COOLDOWN", cfg.Sharding.MergeUpCooldown)
+	cfg.Sharding.ProbeTimeoutMerge = getEnvDuration("DLOCKSS_PROBE_TIMEOUT_MERGE", cfg.Sharding.ProbeTimeoutMerge)
+	cfg.Sharding.SiblingEmptyMergeAfter = getEnvDuration("DLOCKSS_SIBLING_EMPTY_MERGE_AFTER", cfg.Sharding.SiblingEmptyMergeAfter)
+
+	// Replication
+	cfg.Replication.MinReplication = getEnvInt("DLOCKSS_MIN_REPLICATION", cfg.Replication.MinReplication)
+	cfg.Replication.MaxReplication = getEnvInt("DLOCKSS_MAX_REPLICATION", cfg.Replication.MaxReplication)
+	cfg.Replication.CheckInterval = getEnvDuration("DLOCKSS_CHECK_INTERVAL", cfg.Replication.CheckInterval)
+	cfg.Replication.MaxConcurrentReplicationChecks = getEnvInt("DLOCKSS_MAX_CONCURRENT_CHECKS", cfg.Replication.MaxConcurrentReplicationChecks)
+	cfg.Replication.AutoReplicationEnabled = getEnvBool("DLOCKSS_AUTO_REPLICATION_ENABLED", cfg.Replication.AutoReplicationEnabled)
+	cfg.Replication.AutoReplicationTimeout = getEnvDuration("DLOCKSS_AUTO_REPLICATION_TIMEOUT", cfg.Replication.AutoReplicationTimeout)
+	cfg.Replication.PinReannounceInterval = getEnvDuration("DLOCKSS_PIN_REANNOUNCE_INTERVAL", cfg.Replication.PinReannounceInterval)
+
+	// Files
+	cfg.Files.FileImportTimeout = getEnvDuration("DLOCKSS_FILE_IMPORT_TIMEOUT", cfg.Files.FileImportTimeout)
+	cfg.Files.DHTProvideTimeout = getEnvDuration("DLOCKSS_DHT_PROVIDE_TIMEOUT", cfg.Files.DHTProvideTimeout)
+	cfg.Files.MaxConcurrentDHTProvides = getEnvInt("DLOCKSS_MAX_CONCURRENT_DHT_PROVIDES", cfg.Files.MaxConcurrentDHTProvides)
+	cfg.Files.FileProcessingDelay = getEnvDuration("DLOCKSS_FILE_PROCESSING_DELAY", cfg.Files.FileProcessingDelay)
+	cfg.Files.FileStabilityDelay = getEnvDuration("DLOCKSS_FILE_STABILITY_DELAY", cfg.Files.FileStabilityDelay)
+	cfg.Files.MaxConcurrentFileProcessing = getEnvInt("DLOCKSS_MAX_CONCURRENT_FILE_PROCESSING", cfg.Files.MaxConcurrentFileProcessing)
+	cfg.Files.ReshardDelay = getEnvDuration("DLOCKSS_RESHARD_DELAY", cfg.Files.ReshardDelay)
+	cfg.Files.ReshardHandoffDelay = getEnvDuration("DLOCKSS_RESHARD_HANDOFF_DELAY", cfg.Files.ReshardHandoffDelay)
+
+	// Security
+	cfg.Security.TrustMode = getEnvString("DLOCKSS_TRUST_MODE", cfg.Security.TrustMode)
+	cfg.Security.TrustStorePath = getEnvString("DLOCKSS_TRUST_STORE", cfg.Security.TrustStorePath)
+	cfg.Security.SignatureMode = getEnvString("DLOCKSS_SIGNATURE_MODE", cfg.Security.SignatureMode)
+	cfg.Security.SignatureMaxAge = getEnvDuration("DLOCKSS_SIGNATURE_MAX_AGE", cfg.Security.SignatureMaxAge)
+	cfg.Security.NonceSize = getEnvInt("DLOCKSS_NONCE_SIZE", cfg.Security.NonceSize)
+	cfg.Security.MinNonceSize = getEnvInt("DLOCKSS_MIN_NONCE_SIZE", cfg.Security.MinNonceSize)
+	cfg.Security.FutureSkewTolerance = getEnvDuration("DLOCKSS_FUTURE_SKEW_TOLERANCE", cfg.Security.FutureSkewTolerance)
+
+	// Orphan
+	cfg.Orphan.UnpinGracePeriod = getEnvDuration("DLOCKSS_ORPHAN_UNPIN_GRACE", cfg.Orphan.UnpinGracePeriod)
+	cfg.Orphan.HandoffGrace = getEnvDuration("DLOCKSS_ORPHAN_HANDOFF_GRACE", cfg.Orphan.HandoffGrace)
+	cfg.Orphan.UnpinMinHandoffCnt = getEnvInt("DLOCKSS_ORPHAN_MIN_HANDOFF_COUNT", cfg.Orphan.UnpinMinHandoffCnt)
+
+	return cfg
 }
 
 // Validate checks and corrects invalid configuration values.
 func (c *Config) Validate() {
-	if c.SignatureMode != "off" && c.SignatureMode != "warn" && c.SignatureMode != "strict" {
-		slog.Warn("unknown signature mode, defaulting to strict", "mode", c.SignatureMode)
-		c.SignatureMode = "strict"
+	if c.Security.SignatureMode != "off" && c.Security.SignatureMode != "warn" && c.Security.SignatureMode != "strict" {
+		slog.Warn("unknown signature mode, defaulting to strict", "mode", c.Security.SignatureMode)
+		c.Security.SignatureMode = "strict"
 	}
-	if c.MaxConcurrentFileProcessing < 1 {
-		slog.Warn("invalid config value, using default", "key", "MaxConcurrentFileProcessing", "value", c.MaxConcurrentFileProcessing, "default", 5)
-		c.MaxConcurrentFileProcessing = 5
+	if c.Files.MaxConcurrentFileProcessing < 1 {
+		slog.Warn("invalid config value, using default", "key", "MaxConcurrentFileProcessing", "value", c.Files.MaxConcurrentFileProcessing, "default", 5)
+		c.Files.MaxConcurrentFileProcessing = 5
 	}
-	if c.NonceSize < 1 {
-		slog.Warn("invalid config value, using default", "key", "NonceSize", "value", c.NonceSize, "default", 16)
-		c.NonceSize = 16
+	if c.Security.NonceSize < 1 {
+		slog.Warn("invalid config value, using default", "key", "NonceSize", "value", c.Security.NonceSize, "default", 16)
+		c.Security.NonceSize = 16
 	}
-	if c.MinNonceSize < 1 {
-		slog.Warn("invalid config value, using default", "key", "MinNonceSize", "value", c.MinNonceSize, "default", 8)
-		c.MinNonceSize = 8
+	if c.Security.MinNonceSize < 1 {
+		slog.Warn("invalid config value, using default", "key", "MinNonceSize", "value", c.Security.MinNonceSize, "default", 8)
+		c.Security.MinNonceSize = 8
 	}
-	if c.MinReplication > c.MaxReplication {
-		slog.Warn("MinReplication > MaxReplication, swapping", "min", c.MinReplication, "max", c.MaxReplication)
-		c.MinReplication, c.MaxReplication = c.MaxReplication, c.MinReplication
+	if c.Replication.MinReplication > c.Replication.MaxReplication {
+		slog.Warn("MinReplication > MaxReplication, swapping", "min", c.Replication.MinReplication, "max", c.Replication.MaxReplication)
+		c.Replication.MinReplication, c.Replication.MaxReplication = c.Replication.MaxReplication, c.Replication.MinReplication
 	}
-	if c.MaxConcurrentReplicationChecks < 1 {
-		slog.Warn("invalid config value, using default", "key", "MaxConcurrentReplicationChecks", "value", c.MaxConcurrentReplicationChecks, "default", 5)
-		c.MaxConcurrentReplicationChecks = 5
+	if c.Replication.MaxConcurrentReplicationChecks < 1 {
+		slog.Warn("invalid config value, using default", "key", "MaxConcurrentReplicationChecks", "value", c.Replication.MaxConcurrentReplicationChecks, "default", 5)
+		c.Replication.MaxConcurrentReplicationChecks = 5
 	}
 	if c.DiskUsageHighWaterMark <= 0 || c.DiskUsageHighWaterMark > 100 {
 		slog.Warn("disk usage high water mark out of range, using default", "value", c.DiskUsageHighWaterMark, "default", 90.0)
@@ -414,72 +415,60 @@ func (c *Config) Log() {
 		"cluster_store", c.ClusterStorePath,
 		"identity", c.IdentityPath,
 		"badbits", c.BadBitsPath,
-		"trust_store", c.TrustStorePath,
-		"metrics_export", c.MetricsExportPath,
+		"trust_store", c.Security.TrustStorePath,
 	)
 	slog.Info("config: sharding",
-		"max_peers", c.MaxPeersPerShard,
-		"min_peers", c.MinPeersPerShard,
-		"min_across_siblings", c.MinPeersAcrossSiblings,
-		"peer_check_interval", c.ShardPeerCheckInterval,
-		"discovery_interval", c.ShardDiscoveryInterval,
-		"split_rebroadcast", c.ShardSplitRebroadcastInterval,
-		"seen_peers_window", c.SeenPeersWindow,
-		"prune_stale_interval", c.PruneStalePeersInterval,
-		"overlap_duration", c.ShardOverlapDuration,
-		"move_cooldown", c.ShardMoveCooldown,
-		"merge_up_cooldown", c.MergeUpCooldown,
-		"probe_timeout_merge", c.ProbeTimeoutMerge,
-		"sibling_empty_merge_after", c.SiblingEmptyMergeAfter,
+		"max_peers", c.Sharding.MaxPeersPerShard,
+		"min_peers", c.Sharding.MinPeersPerShard,
+		"min_across_siblings", c.Sharding.MinPeersAcrossSiblings,
+		"peer_check_interval", c.Sharding.ShardPeerCheckInterval,
+		"discovery_interval", c.Sharding.ShardDiscoveryInterval,
+		"split_rebroadcast", c.Sharding.ShardSplitRebroadcastInterval,
+		"seen_peers_window", c.Sharding.SeenPeersWindow,
+		"prune_stale_interval", c.Sharding.PruneStalePeersInterval,
+		"overlap_duration", c.Sharding.ShardOverlapDuration,
+		"move_cooldown", c.Sharding.ShardMoveCooldown,
+		"merge_up_cooldown", c.Sharding.MergeUpCooldown,
+		"probe_timeout_merge", c.Sharding.ProbeTimeoutMerge,
+		"sibling_empty_merge_after", c.Sharding.SiblingEmptyMergeAfter,
 	)
 	slog.Info("config: replication",
-		"min", c.MinReplication,
-		"max", c.MaxReplication,
-		"check_interval", c.CheckInterval,
-		"max_concurrent_checks", c.MaxConcurrentReplicationChecks,
-		"cooldown", c.ReplicationCheckCooldown,
-		"removed_cooldown", c.RemovedFileCooldown,
-		"verification_delay", c.ReplicationVerificationDelay,
-		"use_pubsub", c.UsePubsubForReplication,
-		"min_pubsub_peers", c.MinShardPeersForPubsubOnly,
-		"cache_ttl", c.ReplicationCacheTTL,
-		"auto_enabled", c.AutoReplicationEnabled,
-		"auto_timeout", c.AutoReplicationTimeout,
-		"crdt_op_timeout", c.CRDTOpTimeout,
-		"pin_reannounce", c.PinReannounceInterval,
+		"min", c.Replication.MinReplication,
+		"max", c.Replication.MaxReplication,
+		"check_interval", c.Replication.CheckInterval,
+		"max_concurrent_checks", c.Replication.MaxConcurrentReplicationChecks,
+		"auto_enabled", c.Replication.AutoReplicationEnabled,
+		"auto_timeout", c.Replication.AutoReplicationTimeout,
+		"pin_reannounce", c.Replication.PinReannounceInterval,
 	)
 	slog.Info("config: files",
-		"import_timeout", c.FileImportTimeout,
-		"dht_provide_timeout", c.DHTProvideTimeout,
-		"max_concurrent_dht_provides", c.MaxConcurrentDHTProvides,
-		"processing_delay", c.FileProcessingDelay,
-		"stability_delay", c.FileStabilityDelay,
-		"max_concurrent", c.MaxConcurrentFileProcessing,
-		"dht_query_timeout", c.DHTQueryTimeout,
-		"reshard_delay", c.ReshardDelay,
-		"reshard_handoff_delay", c.ReshardHandoffDelay,
+		"import_timeout", c.Files.FileImportTimeout,
+		"dht_provide_timeout", c.Files.DHTProvideTimeout,
+		"max_concurrent_dht_provides", c.Files.MaxConcurrentDHTProvides,
+		"processing_delay", c.Files.FileProcessingDelay,
+		"stability_delay", c.Files.FileStabilityDelay,
+		"max_concurrent", c.Files.MaxConcurrentFileProcessing,
+		"reshard_delay", c.Files.ReshardDelay,
+		"reshard_handoff_delay", c.Files.ReshardHandoffDelay,
 	)
 	slog.Info("config: orphan",
-		"unpin_grace", c.OrphanUnpinGracePeriod,
-		"handoff_grace", c.OrphanHandoffGrace,
-		"min_handoff_count", c.OrphanUnpinMinHandoffCount,
+		"unpin_grace", c.Orphan.UnpinGracePeriod,
+		"handoff_grace", c.Orphan.HandoffGrace,
+		"min_handoff_count", c.Orphan.UnpinMinHandoffCnt,
 	)
 	slog.Info("config: security",
-		"trust_mode", c.TrustMode,
-		"signature_mode", c.SignatureMode,
-		"signature_max_age", c.SignatureMaxAge,
-		"nonce_size", c.NonceSize,
-		"min_nonce_size", c.MinNonceSize,
-		"future_skew_tolerance", c.FutureSkewTolerance,
+		"trust_mode", c.Security.TrustMode,
+		"signature_mode", c.Security.SignatureMode,
+		"signature_max_age", c.Security.SignatureMaxAge,
+		"nonce_size", c.Security.NonceSize,
+		"min_nonce_size", c.Security.MinNonceSize,
+		"future_skew_tolerance", c.Security.FutureSkewTolerance,
 	)
 	heartbeat := "auto"
 	if c.HeartbeatInterval > 0 {
 		heartbeat = c.HeartbeatInterval.String()
 	}
-	slog.Info("config: telemetry",
-		"metrics_interval", c.MetricsReportInterval,
-		"telemetry_interval", c.TelemetryInterval,
-		"include_cids", c.TelemetryIncludeCIDs,
+	slog.Info("config: heartbeat",
 		"heartbeat_interval", heartbeat,
 		"verbose", c.VerboseLogging,
 	)
@@ -487,11 +476,6 @@ func (c *Config) Log() {
 		"window", c.RateLimitWindow,
 		"max_messages", c.MaxMessagesPerWindow,
 	)
-	slog.Info("config: backoff",
-		"initial", c.InitialBackoffDelay,
-		"max", c.MaxBackoffDelay,
-		"multiplier", c.BackoffMultiplier,
-	)
 	slog.Info("config: storage",
 		"disk_high_water_mark", c.DiskUsageHighWaterMark,
 	)
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
index 04b10ab..476d891 100644
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -12,7 +12,6 @@ func TestDefaultConfig(t *testing.T) {
 		t.Fatal("DefaultConfig returned nil")
 	}
 
-	// Key string fields
 	if cfg.DiscoveryServiceTag != "dlockss-prod" {
 		t.Errorf("DiscoveryServiceTag = %q, want dlockss-prod", cfg.DiscoveryServiceTag)
 	}
@@ -25,62 +24,48 @@ func TestDefaultConfig(t *testing.T) {
 	if cfg.BadBitsPath != "badBits.csv" {
 		t.Errorf("BadBitsPath = %q, want badBits.csv", cfg.BadBitsPath)
 	}
-	if cfg.TrustMode != "open" {
-		t.Errorf("TrustMode = %q, want open", cfg.TrustMode)
+	if cfg.Security.TrustMode != "open" {
+		t.Errorf("TrustMode = %q, want open", cfg.Security.TrustMode)
 	}
-	if cfg.SignatureMode != "strict" {
-		t.Errorf("SignatureMode = %q, want strict", cfg.SignatureMode)
+	if cfg.Security.SignatureMode != "strict" {
+		t.Errorf("SignatureMode = %q, want strict", cfg.Security.SignatureMode)
 	}
 	if cfg.IPFSNodeAddress != "/ip4/127.0.0.1/tcp/5001" {
 		t.Errorf("IPFSNodeAddress = %q, want /ip4/127.0.0.1/tcp/5001", cfg.IPFSNodeAddress)
 	}
-
-	// Key numeric fields
-	if cfg.MinReplication != 5 {
-		t.Errorf("MinReplication = %d, want 5", cfg.MinReplication)
+	if cfg.Replication.MinReplication != 5 {
+		t.Errorf("MinReplication = %d, want 5", cfg.Replication.MinReplication)
 	}
-	if cfg.MaxReplication != 10 {
-		t.Errorf("MaxReplication = %d, want 10", cfg.MaxReplication)
+	if cfg.Replication.MaxReplication != 10 {
+		t.Errorf("MaxReplication = %d, want 10", cfg.Replication.MaxReplication)
 	}
 	if cfg.APIPort != 5050 {
 		t.Errorf("APIPort = %d, want 5050", cfg.APIPort)
 	}
-	if cfg.MaxPeersPerShard != 12 {
-		t.Errorf("MaxPeersPerShard = %d, want 12", cfg.MaxPeersPerShard)
+	if cfg.Sharding.MaxPeersPerShard != 12 {
+		t.Errorf("MaxPeersPerShard = %d, want 12", cfg.Sharding.MaxPeersPerShard)
 	}
-	if cfg.MinPeersPerShard != 6 {
-		t.Errorf("MinPeersPerShard = %d, want 6", cfg.MinPeersPerShard)
+	if cfg.Sharding.MinPeersPerShard != 6 {
+		t.Errorf("MinPeersPerShard = %d, want 6", cfg.Sharding.MinPeersPerShard)
 	}
 	if cfg.DiskUsageHighWaterMark != 90.0 {
 		t.Errorf("DiskUsageHighWaterMark = %f, want 90.0", cfg.DiskUsageHighWaterMark)
 	}
-	if cfg.BackoffMultiplier != 2.0 {
-		t.Errorf("BackoffMultiplier = %f, want 2.0", cfg.BackoffMultiplier)
-	}
-
-	// Key duration fields (non-zero)
-	if cfg.CheckInterval != 1*time.Minute {
-		t.Errorf("CheckInterval = %v, want 1m", cfg.CheckInterval)
+	if cfg.Replication.CheckInterval != 1*time.Minute {
+		t.Errorf("CheckInterval = %v, want 1m", cfg.Replication.CheckInterval)
 	}
 	if cfg.BootstrapTimeout != 15*time.Second {
 		t.Errorf("BootstrapTimeout = %v, want 15s", cfg.BootstrapTimeout)
 	}
-	if cfg.SignatureMaxAge != 10*time.Minute {
-		t.Errorf("SignatureMaxAge = %v, want 10m", cfg.SignatureMaxAge)
-	}
-
-	// Key bool fields
-	if !cfg.UsePubsubForReplication {
-		t.Error("UsePubsubForReplication = false, want true")
+	if cfg.Security.SignatureMaxAge != 10*time.Minute {
+		t.Errorf("SignatureMaxAge = %v, want 10m", cfg.Security.SignatureMaxAge)
 	}
-	if !cfg.AutoReplicationEnabled {
+	if !cfg.Replication.AutoReplicationEnabled {
 		t.Error("AutoReplicationEnabled = false, want true")
 	}
 	if cfg.VerboseLogging {
 		t.Error("VerboseLogging = true, want false")
 	}
-
-	// Path fields derived from data dir
 	wantClusterStore := filepath.Join(filepath.Dir("./data"), "cluster_store")
 	if cfg.ClusterStorePath != wantClusterStore {
 		t.Errorf("ClusterStorePath = %q, want %q", cfg.ClusterStorePath, wantClusterStore)
@@ -92,18 +77,15 @@ func TestDefaultConfig(t *testing.T) {
 }
 
 func TestLoadFromEnv(t *testing.T) {
-	// Set specific env vars and verify they are picked up
 	t.Setenv("DLOCKSS_DATA_DIR", "/custom/data")
 	t.Setenv("DLOCKSS_DISCOVERY_TAG", "dlockss-test")
 	t.Setenv("DLOCKSS_MIN_REPLICATION", "3")
 	t.Setenv("DLOCKSS_MAX_REPLICATION", "7")
 	t.Setenv("DLOCKSS_API_PORT", "9090")
 	t.Setenv("DLOCKSS_SIGNATURE_MODE", "warn")
-	t.Setenv("DLOCKSS_USE_PUBSUB_FOR_REPLICATION", "false")
 	t.Setenv("DLOCKSS_VERBOSE_LOGGING", "true")
 	t.Setenv("DLOCKSS_CHECK_INTERVAL", "2m")
 	t.Setenv("DLOCKSS_DISK_USAGE_HIGH_WATER_MARK", "85.5")
-	t.Setenv("DLOCKSS_BACKOFF_MULTIPLIER", "3.5")
 	t.Setenv("DLOCKSS_NODE_NAME", "test-node-1")
 
 	cfg := LoadFromEnv()
@@ -114,33 +96,27 @@ func TestLoadFromEnv(t *testing.T) {
 	if cfg.DiscoveryServiceTag != "dlockss-test" {
 		t.Errorf("DiscoveryServiceTag = %q, want dlockss-test", cfg.DiscoveryServiceTag)
 	}
-	if cfg.MinReplication != 3 {
-		t.Errorf("MinReplication = %d, want 3", cfg.MinReplication)
+	if cfg.Replication.MinReplication != 3 {
+		t.Errorf("MinReplication = %d, want 3", cfg.Replication.MinReplication)
 	}
-	if cfg.MaxReplication != 7 {
-		t.Errorf("MaxReplication = %d, want 7", cfg.MaxReplication)
+	if cfg.Replication.MaxReplication != 7 {
+		t.Errorf("MaxReplication = %d, want 7", cfg.Replication.MaxReplication)
 	}
 	if cfg.APIPort != 9090 {
 		t.Errorf("APIPort = %d, want 9090", cfg.APIPort)
 	}
-	if cfg.SignatureMode != "warn" {
-		t.Errorf("SignatureMode = %q, want warn", cfg.SignatureMode)
-	}
-	if cfg.UsePubsubForReplication {
-		t.Errorf("UsePubsubForReplication = true, want false")
+	if cfg.Security.SignatureMode != "warn" {
+		t.Errorf("SignatureMode = %q, want warn", cfg.Security.SignatureMode)
 	}
 	if !cfg.VerboseLogging {
 		t.Errorf("VerboseLogging = false, want true")
 	}
-	if cfg.CheckInterval != 2*time.Minute {
-		t.Errorf("CheckInterval = %v, want 2m", cfg.CheckInterval)
+	if cfg.Replication.CheckInterval != 2*time.Minute {
+		t.Errorf("CheckInterval = %v, want 2m", cfg.Replication.CheckInterval)
 	}
 	if cfg.DiskUsageHighWaterMark != 85.5 {
 		t.Errorf("DiskUsageHighWaterMark = %f, want 85.5", cfg.DiskUsageHighWaterMark)
 	}
-	if cfg.BackoffMultiplier != 3.5 {
-		t.Errorf("BackoffMultiplier = %f, want 3.5", cfg.BackoffMultiplier)
-	}
 	if cfg.NodeName != "test-node-1" {
 		t.Errorf("NodeName = %q, want test-node-1", cfg.NodeName)
 	}
@@ -166,79 +142,79 @@ func TestLoadFromEnv_IdentityPathOverride(t *testing.T) {
 
 func TestValidate_InvalidSignatureMode(t *testing.T) {
 	cfg := DefaultConfig()
-	cfg.SignatureMode = "invalid-mode"
+	cfg.Security.SignatureMode = "invalid-mode"
 	cfg.Validate()
-	if cfg.SignatureMode != "strict" {
-		t.Errorf("SignatureMode after Validate = %q, want strict", cfg.SignatureMode)
+	if cfg.Security.SignatureMode != "strict" {
+		t.Errorf("SignatureMode after Validate = %q, want strict", cfg.Security.SignatureMode)
 	}
 }
 
 func TestValidate_ValidSignatureModes(t *testing.T) {
 	for _, mode := range []string{"off", "warn", "strict"} {
 		cfg := DefaultConfig()
-		cfg.SignatureMode = mode
+		cfg.Security.SignatureMode = mode
 		cfg.Validate()
-		if cfg.SignatureMode != mode {
-			t.Errorf("SignatureMode %q was changed to %q", mode, cfg.SignatureMode)
+		if cfg.Security.SignatureMode != mode {
+			t.Errorf("SignatureMode %q was changed to %q", mode, cfg.Security.SignatureMode)
 		}
 	}
 }
 
 func TestValidate_MaxConcurrentFileProcessing(t *testing.T) {
 	cfg := DefaultConfig()
-	cfg.MaxConcurrentFileProcessing = 0
+	cfg.Files.MaxConcurrentFileProcessing = 0
 	cfg.Validate()
-	if cfg.MaxConcurrentFileProcessing != 5 {
-		t.Errorf("MaxConcurrentFileProcessing = %d, want 5", cfg.MaxConcurrentFileProcessing)
+	if cfg.Files.MaxConcurrentFileProcessing != 5 {
+		t.Errorf("MaxConcurrentFileProcessing = %d, want 5", cfg.Files.MaxConcurrentFileProcessing)
 	}
 
-	cfg.MaxConcurrentFileProcessing = -1
+	cfg.Files.MaxConcurrentFileProcessing = -1
 	cfg.Validate()
-	if cfg.MaxConcurrentFileProcessing != 5 {
-		t.Errorf("MaxConcurrentFileProcessing (negative) = %d, want 5", cfg.MaxConcurrentFileProcessing)
+	if cfg.Files.MaxConcurrentFileProcessing != 5 {
+		t.Errorf("MaxConcurrentFileProcessing (negative) = %d, want 5", cfg.Files.MaxConcurrentFileProcessing)
 	}
 }
 
 func TestValidate_NonceSize(t *testing.T) {
 	cfg := DefaultConfig()
-	cfg.NonceSize = 0
+	cfg.Security.NonceSize = 0
 	cfg.Validate()
-	if cfg.NonceSize != 16 {
-		t.Errorf("NonceSize = %d, want 16", cfg.NonceSize)
+	if cfg.Security.NonceSize != 16 {
+		t.Errorf("NonceSize = %d, want 16", cfg.Security.NonceSize)
 	}
 
-	cfg.NonceSize = -5
+	cfg.Security.NonceSize = -5
 	cfg.Validate()
-	if cfg.NonceSize != 16 {
-		t.Errorf("NonceSize (negative) = %d, want 16", cfg.NonceSize)
+	if cfg.Security.NonceSize != 16 {
+		t.Errorf("NonceSize (negative) = %d, want 16", cfg.Security.NonceSize)
 	}
 }
 
 func TestValidate_MinNonceSize(t *testing.T) {
 	cfg := DefaultConfig()
-	cfg.MinNonceSize = 0
+	cfg.Security.MinNonceSize = 0
 	cfg.Validate()
-	if cfg.MinNonceSize != 8 {
-		t.Errorf("MinNonceSize = %d, want 8", cfg.MinNonceSize)
+	if cfg.Security.MinNonceSize != 8 {
+		t.Errorf("MinNonceSize = %d, want 8", cfg.Security.MinNonceSize)
 	}
 }
 
 func TestValidate_MinMaxReplicationSwap(t *testing.T) {
 	cfg := DefaultConfig()
-	cfg.MinReplication = 20
-	cfg.MaxReplication = 5
+	cfg.Replication.MinReplication = 20
+	cfg.Replication.MaxReplication = 5
 	cfg.Validate()
-	if cfg.MinReplication != 5 || cfg.MaxReplication != 20 {
-		t.Errorf("Min/MaxReplication not swapped: min=%d max=%d, want min=5 max=20", cfg.MinReplication, cfg.MaxReplication)
+	if cfg.Replication.MinReplication != 5 || cfg.Replication.MaxReplication != 20 {
+		t.Errorf("Min/MaxReplication not swapped: min=%d max=%d, want min=5 max=20", cfg.Replication.MinReplication, cfg.Replication.MaxReplication)
 	}
 }
 
 func TestValidate_MaxConcurrentReplicationChecks(t *testing.T) {
 	cfg := DefaultConfig()
-	cfg.MaxConcurrentReplicationChecks = 0
+	cfg.Replication.MaxConcurrentReplicationChecks = 0
 	cfg.Validate()
-	if cfg.MaxConcurrentReplicationChecks != 5 {
-		t.Errorf("MaxConcurrentReplicationChecks = %d, want 5", cfg.MaxConcurrentReplicationChecks)
+	if cfg.Replication.MaxConcurrentReplicationChecks != 5 {
+		t.Errorf("MaxConcurrentReplicationChecks = %d, want 5", cfg.Replication.MaxConcurrentReplicationChecks)
 	}
 }
 
@@ -282,7 +258,7 @@ func TestGetEnvInt_Invalid(t *testing.T) {
 }
 
 func TestGetEnvInt_Unset(t *testing.T) {
-	t.Setenv("TEST_INT_UNSET", "") // empty = unset for getEnv* semantics
+	t.Setenv("TEST_INT_UNSET", "") // empty = unset for getEnv* semantics
 	got := getEnvInt("TEST_INT_UNSET", 7)
 	if got != 7 {
 		t.Errorf("getEnvInt(unset) = %d, want 7", got)
@@ -339,9 +315,9 @@ func TestGetEnvFloat_Unset(t *testing.T) {
 
 func TestGetEnvBool_Valid(t *testing.T) {
 	for _, tc := range []struct {
-		env   string
-		def   bool
-		want  bool
+		env  string
+		def  bool
+		want bool
 	}{
 		{"true", false, true},
 		{"1", false, true},
diff --git a/internal/fileops/fileops.go b/internal/fileops/fileops.go
index 25ebf41..7532f55 100644
--- a/internal/fileops/fileops.go
+++ b/internal/fileops/fileops.go
@@ -2,7 +2,6 @@ package fileops
 
 import (
 	"context"
-	"fmt"
 	"log/slog"
 	"os"
 	"path/filepath"
@@ -14,7 +13,6 @@ import (
 	"github.com/libp2p/go-libp2p/core/peer"
 
 	"dlockss/internal/badbits"
-	"dlockss/internal/common"
 	"dlockss/internal/config"
 	"dlockss/pkg/ipfs"
 	"dlockss/pkg/schema"
@@ -28,7 +26,7 @@ const recentIngestTTL = 30 * time.Second
 // ShardIdentity provides the node's identity and shard membership queries.
 type ShardIdentity interface {
 	PeerID() peer.ID
-	GetShardInfo() (string, int)
+	GetShardInfo() string
 	AnnouncePinned(manifestCID string)
 	AmIResponsibleFor(key string) bool
 	IsLocalNodeIngestor() bool
@@ -120,7 +118,7 @@ func NewFileProcessor(cfg FileProcessorConfig) *FileProcessor {
 		storageMgr:     cfg.Storage,
 		privKey:        cfg.PrivKey,
 		signer:         cfg.Signer,
-		jobQueue:       make(chan string, cfg.Cfg.MaxConcurrentFileProcessing*100),
+		jobQueue:       make(chan string, cfg.Cfg.Files.MaxConcurrentFileProcessing*100),
 		ctx:            ctx,
 		cancel:         cancel,
 		recentIngests:  make(map[string]time.Time),
@@ -134,7 +132,7 @@ func NewFileProcessor(cfg FileProcessorConfig) *FileProcessor {
 }
 
 func (fp *FileProcessor) startWorkers() {
-	for i := 0; i < fp.cfg.MaxConcurrentFileProcessing; i++ {
+	for i := 0; i < fp.cfg.Files.MaxConcurrentFileProcessing; i++ {
 		go fp.workerLoop()
 	}
 }
@@ -247,34 +245,5 @@ func (fp *FileProcessor) EnqueueOrRetry(path string) bool {
 
 // SignProtocolMessage signs a message with the node's private key.
 func (fp *FileProcessor) SignProtocolMessage(msg schema.Signable) error {
-	if fp.signer != nil {
-		return fp.signer.SignProtocolMessage(msg)
-	}
-	if msg == nil {
-		return fmt.Errorf("message is nil")
-	}
-	nonceSize := fp.cfg.NonceSize
-	if nonceSize < 1 {
-		nonceSize = 16
-	}
-	nonce, err := common.NewNonce(nonceSize)
-	if err != nil {
-		return err
-	}
-	env := msg.GetEnvelope()
-	env.SenderID = fp.shardMgr.PeerID()
-	env.Timestamp = time.Now().Unix()
-	env.Nonce = nonce
-	env.Sig = nil
-
-	unsigned, err := msg.MarshalCBORForSigning()
-	if err != nil {
-		return err
-	}
-	sig, err := fp.privKey.Sign(unsigned)
-	if err != nil {
-		return err
-	}
-	env.Sig = sig
-	return nil
+	return fp.signer.SignProtocolMessage(msg)
 }
diff --git a/internal/fileops/fileops_process.go b/internal/fileops/fileops_process.go
index 02dc0ca..dcd56b9 100644
--- a/internal/fileops/fileops_process.go
+++ b/internal/fileops/fileops_process.go
@@ -60,7 +60,7 @@ func (fp *FileProcessor) processNewFile(path string) {
 		return
 	}
 
-	ctx, cancel := context.WithTimeout(fp.ctx, fp.cfg.FileImportTimeout)
+	ctx, cancel := context.WithTimeout(fp.ctx, fp.cfg.Files.FileImportTimeout)
 	defer cancel()
 
 	slog.Debug("importing file to IPFS", "path", path)
@@ -225,7 +225,7 @@ func (fp *FileProcessor) trackAndAnnounceFile(manifestCID cid.Cid, manifestCIDSt
 func (fp *FileProcessor) announceResponsibleFile(manifestCID cid.Cid, manifestCIDStr, payloadCIDStr string) {
 	slog.Info("responsible for file, announcing to shard", "payload", payloadCIDStr, "manifest", manifestCIDStr)
 
-	currentShard, _ := fp.shardMgr.GetShardInfo()
+	currentShard := fp.shardMgr.GetShardInfo()
 	im := schema.IngestMessage{
 		SignedEnvelope: schema.SignedEnvelope{Type: schema.MessageTypeIngest, ManifestCID: manifestCID},
 		ShardID:        currentShard,
@@ -247,12 +247,12 @@ func (fp *FileProcessor) announceResponsibleFile(manifestCID cid.Cid, manifestCI
 	// Announce both manifest and payload to the DHT so gateways can find providers.
 	// Each gets its own timeout so a slow manifest provide can't starve the payload.
 	go func() {
-		ctx1, cancel1 := context.WithTimeout(fp.ctx, fp.cfg.DHTProvideTimeout)
+		ctx1, cancel1 := context.WithTimeout(fp.ctx, fp.cfg.Files.DHTProvideTimeout)
 		defer cancel1()
 		fp.storageMgr.ProvideFile(ctx1, manifestCIDStr)
 	}()
 	go func() {
-		ctx2, cancel2 := context.WithTimeout(fp.ctx, fp.cfg.DHTProvideTimeout)
+		ctx2, cancel2 := context.WithTimeout(fp.ctx, fp.cfg.Files.DHTProvideTimeout)
 		defer cancel2()
 		fp.storageMgr.ProvideFile(ctx2, payloadCIDStr)
 	}()
@@ -261,7 +261,7 @@ func (fp *FileProcessor) announceResponsibleFile(manifestCID cid.Cid, manifestCI
 func (fp *FileProcessor) announceCustodialFile(manifestCID cid.Cid, manifestCIDStr, payloadCIDStr string) {
 	slog.Info("custodial mode, injecting into target shard", "payload", payloadCIDStr, "manifest", manifestCIDStr)
 
-	currentShard, _ := fp.shardMgr.GetShardInfo()
+	currentShard := fp.shardMgr.GetShardInfo()
 	targetDepth := len(currentShard)
 	if targetDepth == 0 {
 		targetDepth = 1
diff --git a/internal/fileops/fileops_test.go b/internal/fileops/fileops_test.go
index feecd7f..80ccea1 100644
--- a/internal/fileops/fileops_test.go
+++ b/internal/fileops/fileops_test.go
@@ -2,7 +2,6 @@ package fileops_test
 
 import (
 	"context"
-	"io"
 	"os"
 	"path/filepath"
 	"sync"
@@ -35,9 +34,6 @@ func (m *mockIPFS) ImportFile(ctx context.Context, path string) (cid.Cid, error)
 	}
 	return fakeCID("import-" + filepath.Base(path)), nil
 }
-func (m *mockIPFS) ImportReader(context.Context, io.Reader) (cid.Cid, error) {
-	return fakeCID("reader"), nil
-}
 func (m *mockIPFS) PutDagCBOR(ctx context.Context, block []byte) (cid.Cid, error) {
 	if m.putDagFn != nil {
 		return m.putDagFn(ctx, block)
@@ -48,11 +44,6 @@ func (m *mockIPFS) GetBlock(context.Context, cid.Cid) ([]byte, error) { return n
 func (m *mockIPFS) PinRecursive(context.Context, cid.Cid) error       { return nil }
 func (m *mockIPFS) UnpinRecursive(context.Context, cid.Cid) error     { return nil }
 func (m *mockIPFS) IsPinned(context.Context, cid.Cid) (bool, error)   { return false, nil }
-func (m *mockIPFS) GetFileSize(context.Context, cid.Cid) (uint64, error) {
-	return 0, nil
-}
-func (m *mockIPFS) GetPeerID(context.Context) (string, error)    { return "test-peer", nil }
-func (m *mockIPFS) SwarmConnect(context.Context, []string) error { return nil }
 
 // ---------------------------------------------------------------------------
 // Mock: ShardCoordinator (ShardIdentity + ShardPublisher + CustodialInjector)
@@ -61,7 +52,6 @@ func (m *mockIPFS) SwarmConnect(context.Context, []string) error { return nil }
 type mockShardCoordinator struct {
 	peerID      peer.ID
 	shardID     string
-	shardDepth  int
 	responsible bool
 
 	mu          sync.Mutex
@@ -71,7 +61,7 @@ type mockShardCoordinator struct {
 }
 
 func (m *mockShardCoordinator) PeerID() peer.ID               { return m.peerID }
-func (m *mockShardCoordinator) GetShardInfo() (string, int)   { return m.shardID, m.shardDepth }
+func (m *mockShardCoordinator) GetShardInfo() string          { return m.shardID }
 func (m *mockShardCoordinator) AmIResponsibleFor(string) bool { return m.responsible }
 func (m *mockShardCoordinator) IsLocalNodeIngestor() bool     { return true }
 
@@ -169,10 +159,10 @@ func testConfig(t *testing.T) *config.Config {
 	t.Helper()
 	cfg := config.DefaultConfig()
 	cfg.FileWatchFolder = t.TempDir()
-	cfg.FileStabilityDelay = 0
-	cfg.FileImportTimeout = 5 * time.Second
-	cfg.DHTProvideTimeout = 1 * time.Second
-	cfg.MaxConcurrentFileProcessing = 2
+	cfg.Files.FileStabilityDelay = 0
+	cfg.Files.FileImportTimeout = 5 * time.Second
+	cfg.Files.DHTProvideTimeout = 1 * time.Second
+	cfg.Files.MaxConcurrentFileProcessing = 2
 	return cfg
 }
 
@@ -202,7 +192,7 @@ func newTestProcessor(t *testing.T, cfg *config.Config, ipfsMock *mockIPFS, shar
 func TestNewFileProcessorAndStop(t *testing.T) {
 	cfg := testConfig(t)
 	ipfsMock := &mockIPFS{}
-	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", shardDepth: 1, responsible: true}
+	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", responsible: true}
 	storageMock := newMockStorage()
 	signerMock := &mockSigner{}
 
@@ -217,7 +207,7 @@ func TestTryEnqueue_AcceptsFiles(t *testing.T) {
 	cfg := testConfig(t)
 	// Large concurrency * 100 = buffer size; we just need a few slots.
 	ipfsMock := &mockIPFS{}
-	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", shardDepth: 1, responsible: true}
+	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", responsible: true}
 	storageMock := newMockStorage()
 	signerMock := &mockSigner{}
 
@@ -233,16 +223,16 @@ func TestTryEnqueue_AcceptsFiles(t *testing.T) {
 
 func TestTryEnqueue_ReturnsFalseWhenFull(t *testing.T) {
 	cfg := testConfig(t)
-	cfg.MaxConcurrentFileProcessing = 1 // queue size = 1 * 100 = 100
+	cfg.Files.MaxConcurrentFileProcessing = 1 // queue size = 1 * 100 = 100
 	ipfsMock := &mockIPFS{}
-	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", shardDepth: 1, responsible: true}
+	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", responsible: true}
 	storageMock := newMockStorage()
 	signerMock := &mockSigner{}
 
 	fp := newTestProcessor(t, cfg, ipfsMock, shardMock, storageMock, signerMock)
 	fp.Stop() // stop workers so nothing drains the channel
 
-	queueSize := cfg.MaxConcurrentFileProcessing * 100
+	queueSize := cfg.Files.MaxConcurrentFileProcessing * 100
 	for i := 0; i < queueSize; i++ {
 		if !fp.TryEnqueue("/file") {
 			t.Fatalf("TryEnqueue should succeed for item %d/%d", i, queueSize)
@@ -256,16 +246,16 @@ func TestTryEnqueue_ReturnsFalseWhenFull(t *testing.T) {
 
 func TestEnqueueOrRetry_FallsBackToRetryQueue(t *testing.T) {
 	cfg := testConfig(t)
-	cfg.MaxConcurrentFileProcessing = 1
+	cfg.Files.MaxConcurrentFileProcessing = 1
 	ipfsMock := &mockIPFS{}
-	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", shardDepth: 1, responsible: true}
+	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", responsible: true}
 	storageMock := newMockStorage()
 	signerMock := &mockSigner{}
 
 	fp := newTestProcessor(t, cfg, ipfsMock, shardMock, storageMock, signerMock)
 	fp.Stop()
 
-	queueSize := cfg.MaxConcurrentFileProcessing * 100
+	queueSize := cfg.Files.MaxConcurrentFileProcessing * 100
 	for i := 0; i < queueSize; i++ {
 		fp.TryEnqueue("/fill")
 	}
@@ -286,7 +276,7 @@ func TestProcessNewFile_SkipsFilesOutsideWatchDir(t *testing.T) {
 			return fakeCID("should-not-import"), nil
 		},
 	}
-	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", shardDepth: 1, responsible: true}
+	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", responsible: true}
 	storageMock := newMockStorage()
 	signerMock := &mockSigner{}
 
@@ -316,7 +306,7 @@ func TestProcessNewFile_SkipsTmpFiles(t *testing.T) {
 			return fakeCID("should-not-import"), nil
 		},
 	}
-	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", shardDepth: 1, responsible: true}
+	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", responsible: true}
 	storageMock := newMockStorage()
 	signerMock := &mockSigner{}
 
@@ -344,7 +334,7 @@ func TestProcessNewFile_SkipsPartFiles(t *testing.T) {
 			return fakeCID("should-not-import"), nil
 		},
 	}
-	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", shardDepth: 1, responsible: true}
+	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", responsible: true}
 	storageMock := newMockStorage()
 	signerMock := &mockSigner{}
 
@@ -372,7 +362,7 @@ func TestScanExistingFiles(t *testing.T) {
 			return fakeCID("scan"), nil
 		},
 	}
-	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", shardDepth: 1, responsible: true}
+	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", responsible: true}
 	storageMock := newMockStorage()
 	signerMock := &mockSigner{}
 
@@ -415,7 +405,7 @@ func TestScanExistingFiles_IncludesSubdirectories(t *testing.T) {
 			return fakeCID("sub"), nil
 		},
 	}
-	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", shardDepth: 1, responsible: true}
+	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", responsible: true}
 	storageMock := newMockStorage()
 	signerMock := &mockSigner{}
 
@@ -455,7 +445,7 @@ func TestShouldProcessFileEvent_Deduplication(t *testing.T) {
 			return fakeCID("dedup"), nil
 		},
 	}
-	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", shardDepth: 1, responsible: true}
+	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", responsible: true}
 	storageMock := newMockStorage()
 	signerMock := &mockSigner{}
 
@@ -504,7 +494,6 @@ func TestProcessNewFile_FullPipeline(t *testing.T) {
 	shardMock := &mockShardCoordinator{
 		peerID:      "test-peer-full",
 		shardID:     "0",
-		shardDepth:  1,
 		responsible: true,
 	}
 	storageMock := newMockStorage()
@@ -563,7 +552,7 @@ func TestProcessNewFile_CIDDedup(t *testing.T) {
 			return fakeCID("manifest-dedup"), nil
 		},
 	}
-	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", shardDepth: 1, responsible: true}
+	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", responsible: true}
 	storageMock := newMockStorage()
 	signerMock := &mockSigner{}
 
@@ -602,7 +591,7 @@ func TestProcessNewFile_CIDDedup(t *testing.T) {
 
 func TestProcessNewFile_NilIPFSClient(t *testing.T) {
 	cfg := testConfig(t)
-	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", shardDepth: 1, responsible: true}
+	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", responsible: true}
 	storageMock := newMockStorage()
 	signerMock := &mockSigner{}
 
@@ -640,7 +629,7 @@ func TestProcessNewFile_NilIPFSClient(t *testing.T) {
 func TestWatchFolder_ContextCancellation(t *testing.T) {
 	cfg := testConfig(t)
 	ipfsMock := &mockIPFS{}
-	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", shardDepth: 1, responsible: true}
+	shardMock := &mockShardCoordinator{peerID: "test-peer", shardID: "0", responsible: true}
 	storageMock := newMockStorage()
 	signerMock := &mockSigner{}
 
diff --git a/internal/fileops/fileops_watch.go b/internal/fileops/fileops_watch.go
index 99c9eec..e072ed9 100644
--- a/internal/fileops/fileops_watch.go
+++ b/internal/fileops/fileops_watch.go
@@ -65,7 +65,7 @@ func (fp *FileProcessor) shouldProcessFileEvent(path string) bool {
 // waits for the file size to be unchanged for that duration before enqueueing,
 // to avoid ingesting files still being written (e.g. downloads).
 func (fp *FileProcessor) enqueueWithStabilityCheck(path string) {
-	if fp.cfg.FileStabilityDelay <= 0 {
+	if fp.cfg.Files.FileStabilityDelay <= 0 {
 		_ = fp.EnqueueOrRetry(path)
 		return
 	}
@@ -81,7 +81,7 @@ func (fp *FileProcessor) enqueueWithStabilityCheck(path string) {
 		t.Stop()
 	}
 	fp.stabilityPath[path] = currentSize
-	fp.stabilityTimer[path] = time.AfterFunc(fp.cfg.FileStabilityDelay, func() {
+	fp.stabilityTimer[path] = time.AfterFunc(fp.cfg.Files.FileStabilityDelay, func() {
 		if fp.ctx.Err() != nil {
 			return
 		}
@@ -186,95 +186,94 @@ func (fp *FileProcessor) runWatcher(ctx context.Context) error {
 			if !ok {
 				return fmt.Errorf("events channel closed unexpectedly")
 			}
+			fp.handleWatcherEvent(event, watcher, watchedDirs, &watchedDirsMu)
+		case err, ok := <-watcher.Errors:
+			if !ok {
+				return fmt.Errorf("errors channel closed unexpectedly")
+			}
+			slog.Error("watcher error", "error", err)
+		}
+	}
+}
 
-			if event.Op&fsnotify.Create == fsnotify.Create {
-				info, err := os.Stat(event.Name)
-				if err == nil && info.IsDir() {
-					watchedDirsMu.RLock()
-					alreadyWatched := watchedDirs[event.Name]
-					watchedDirsMu.RUnlock()
-
-					if !alreadyWatched {
-						if err := watcher.Add(event.Name); err != nil {
-							slog.Error("failed to watch new directory", "path", event.Name, "error", err)
-						} else {
-							watchedDirsMu.Lock()
-							watchedDirs[event.Name] = true
-							watchedDirsMu.Unlock()
-							slog.Debug("added watch for new directory", "path", event.Name)
-						}
-					}
-					go func(dirPath string) {
-						time.Sleep(fp.cfg.FileProcessingDelay)
+func (fp *FileProcessor) handleWatcherEvent(event fsnotify.Event, watcher *fsnotify.Watcher, watchedDirs map[string]bool, mu *sync.RWMutex) {
+	if event.Op&fsnotify.Create == fsnotify.Create {
+		info, err := os.Stat(event.Name)
+		if err == nil && info.IsDir() {
+			fp.handleNewDirectory(event.Name, watcher, watchedDirs, mu)
+			return
+		}
+	}
 
-						fileCount := 0
-						dirCount := 0
+	if event.Op&fsnotify.Create == fsnotify.Create || event.Op&fsnotify.Write == fsnotify.Write {
+		path := event.Name
+		if !fp.validateFilePath(path) {
+			return
+		}
+		info, err := os.Stat(path)
+		if err != nil || info.IsDir() {
+			return
+		}
+		if fp.shouldProcessFileEvent(path) {
+			fp.enqueueWithStabilityCheck(path)
+		}
+	}
+}
 
-						err := filepath.Walk(dirPath, func(path string, info os.FileInfo, err error) error {
-							if err != nil {
-								slog.Warn("error accessing path during directory scan", "path", path, "error", err)
-								return nil
-							}
-							if info.IsDir() {
-								if path != dirPath {
-									watchedDirsMu.RLock()
-									alreadyWatched := watchedDirs[path]
-									watchedDirsMu.RUnlock()
+func (fp *FileProcessor) handleNewDirectory(dirPath string, watcher *fsnotify.Watcher, watchedDirs map[string]bool, mu *sync.RWMutex) {
+	mu.RLock()
+	alreadyWatched := watchedDirs[dirPath]
+	mu.RUnlock()
 
-									if !alreadyWatched {
-										if err := watcher.Add(path); err != nil {
-											slog.Error("failed to watch nested directory", "path", path, "error", err)
-										} else {
-											watchedDirsMu.Lock()
-											watchedDirs[path] = true
-											watchedDirsMu.Unlock()
-											slog.Debug("added watch for nested directory", "path", path)
-											dirCount++
-										}
-									}
-								}
-								return nil
-							}
-							if !fp.validateFilePath(path) {
-								slog.Debug("file filtered by validation", "path", path)
-								return nil
-							}
-							fileCount++
+	if !alreadyWatched {
+		if err := watcher.Add(dirPath); err != nil {
+			slog.Error("failed to watch new directory", "path", dirPath, "error", err)
+		} else {
+			mu.Lock()
+			watchedDirs[dirPath] = true
+			mu.Unlock()
+			slog.Debug("added watch for new directory", "path", dirPath)
+		}
+	}
 
-							fp.enqueueWithStabilityCheck(path)
-							return nil
-						})
-						if err != nil {
-							slog.Error("failed to scan new directory", "path", dirPath, "error", err)
+	go func() {
+		time.Sleep(fp.cfg.Files.FileProcessingDelay)
+		fileCount := 0
+		dirCount := 0
+		err := filepath.Walk(dirPath, func(path string, info os.FileInfo, err error) error {
+			if err != nil {
+				slog.Warn("error accessing path during directory scan", "path", path, "error", err)
+				return nil
+			}
+			if info.IsDir() {
+				if path != dirPath {
+					mu.RLock()
+					seen := watchedDirs[path]
+					mu.RUnlock()
+					if !seen {
+						if err := watcher.Add(path); err != nil {
+							slog.Error("failed to watch nested directory", "path", path, "error", err)
 						} else {
-							slog.Info("scanned directory", "path", dirPath, "files", fileCount, "nested_dirs", dirCount)
+							mu.Lock()
+							watchedDirs[path] = true
+							mu.Unlock()
+							dirCount++
 						}
-					}(event.Name)
-					continue
-				}
-			}
-
-			if event.Op&fsnotify.Create == fsnotify.Create || event.Op&fsnotify.Write == fsnotify.Write {
-				path := event.Name
-
-				if !fp.validateFilePath(path) {
-					continue
-				}
-
-				info, err := os.Stat(path)
-				if err != nil || info.IsDir() {
-					continue
-				}
-
-				if fp.shouldProcessFileEvent(path) {
-					fp.enqueueWithStabilityCheck(path)
+					}
 				}
+				return nil
 			}
-		case err, ok := <-watcher.Errors:
-			if !ok {
-				return fmt.Errorf("errors channel closed unexpectedly")
+			if !fp.validateFilePath(path) {
+				return nil
 			}
-			slog.Error("watcher error", "error", err)
+			fileCount++
+			fp.enqueueWithStabilityCheck(path)
+			return nil
+		})
+		if err != nil {
+			slog.Error("failed to scan new directory", "path", dirPath, "error", err)
+		} else {
+			slog.Info("scanned directory", "path", dirPath, "files", fileCount, "nested_dirs", dirCount)
 		}
-	}
+	}()
 }
diff --git a/internal/identity/identity.go b/internal/identity/identity.go
index afca7d9..32a50db 100644
--- a/internal/identity/identity.go
+++ b/internal/identity/identity.go
@@ -126,7 +126,7 @@ func loadOrCreate(cfg *config.Config) (crypto.PrivKey, error) {
 //  3. Interactive prompt on stdin
 func ResolveNodeName(cfg *config.Config) string {
 	if cfg.NodeName != "" {
-		if err := PersistNodeName(cfg, cfg.NodeName); err != nil {
+		if err := persistNodeName(cfg, cfg.NodeName); err != nil {
 			slog.Warn("failed to persist node name", "error", err)
 		}
 		return cfg.NodeName
@@ -142,7 +142,7 @@ func ResolveNodeName(cfg *config.Config) string {
 	scanner := bufio.NewScanner(os.Stdin)
 	if scanner.Scan() {
 		if name := strings.TrimSpace(scanner.Text()); name != "" {
-			if err := PersistNodeName(cfg, name); err != nil {
+			if err := persistNodeName(cfg, name); err != nil {
 				slog.Warn("failed to persist node name", "error", err)
 			}
 			return name
@@ -151,8 +151,7 @@ func ResolveNodeName(cfg *config.Config) string {
 	return ""
 }
 
-// PersistNodeName writes the node name to disk so it survives restarts.
-func PersistNodeName(cfg *config.Config, name string) error {
+func persistNodeName(cfg *config.Config, name string) error {
 	nameFile := cfg.NodeNamePath
 	ensureDir(nameFile)
 	if err := os.WriteFile(nameFile, []byte(name+"\n"), 0644); err != nil {
diff --git a/internal/managers/clusters/inspect_test.go b/internal/managers/clusters/inspect_test.go
deleted file mode 100644
index 6e830c3..0000000
--- a/internal/managers/clusters/inspect_test.go
+++ /dev/null
@@ -1,31 +0,0 @@
-package clusters
-
-import (
-	"testing"
-)
-
-func TestInspectCRDT(t *testing.T) {
-	// Setup dummy dependencies
-	// h, _ := libp2p.New()
-	// defer h.Close()
-	// dstore := dssync.MutexWrap(datastore.NewMapDatastore())
-
-	// cfg := &crdt.Config{
-	// 	ClusterName:         "test",
-	// 	PeersetMetric:       "ping",
-	// 	RebroadcastInterval: 1 * time.Minute,
-	// 	Batching:            crdt.BatchingConfig{MaxBatchSize: 1},
-	// }
-
-	// We can't easily instantiate it fully without mocking DHT/PubSub which causes it to hang or fail
-	// But we can inspect the type via reflection or just looking at what methods are available
-	// if we had the source. Since we don't, we can try to cast or check interfaces.
-
-	// Let's just print the methods of the *crdt.Consensus type if we can instantiate it.
-	// But New() blocks or fails.
-
-	// Instead, let's trust the web search and general knowledge:
-	// CRDT implementations usually don't have a public "OnUpdate" channel.
-
-	t.Log("Skipping runtime inspection")
-}
diff --git a/internal/managers/clusters/manager.go b/internal/managers/clusters/manager.go
index 3322bb8..6445751 100644
--- a/internal/managers/clusters/manager.go
+++ b/internal/managers/clusters/manager.go
@@ -23,12 +23,11 @@ import (
 	"github.com/libp2p/go-libp2p/core/host"
 	"github.com/libp2p/go-libp2p/core/peer"
 	"github.com/libp2p/go-libp2p/core/routing"
-	"github.com/multiformats/go-multiaddr"
 )
 
 // ClusterManagerInterface defines the interface for ClusterManager to allow mocking.
 type ClusterManagerInterface interface {
-	JoinShard(ctx context.Context, shardID string, bootstrapPeers []multiaddr.Multiaddr) error
+	JoinShard(ctx context.Context, shardID string) error
 	LeaveShard(shardID string) error
 	Pin(ctx context.Context, shardID string, c cid.Cid, replicationFactorMin, replicationFactorMax int) error
 	PinIfAbsent(ctx context.Context, shardID string, c cid.Cid, replicationFactorMin, replicationFactorMax int) error
@@ -60,11 +59,10 @@ type ClusterManager struct {
 	peerProvider ShardPeerProvider
 
 	mu       sync.RWMutex
-	clusters map[string]*EmbeddedCluster
+	clusters map[string]*embeddedCluster
 }
 
-// ConsensusClient defines the interface for interacting with the consensus component.
-type ConsensusClient interface {
+type consensusClient interface {
 	LogPin(ctx context.Context, pin api.Pin) error
 	LogUnpin(ctx context.Context, pin api.Pin) error
 	State(ctx context.Context) (state.ReadOnly, error)
@@ -72,16 +70,10 @@ type ConsensusClient interface {
 	Shutdown(ctx context.Context) error
 }
 
-// EmbeddedCluster represents a single shard's consensus state (CRDT).
-type EmbeddedCluster struct {
-	ShardID string
-	// Consensus holds the CRDT state for this shard
-	Consensus ConsensusClient
-	// PinTracker syncs consensus to IPFS
-	PinTracker *LocalPinTracker
-
-	ctx    context.Context
-	cancel context.CancelFunc
+type embeddedCluster struct {
+	consensus  consensusClient
+	pinTracker *localPinTracker
+	cancel     context.CancelFunc
 }
 
 // ClusterManagerConfig holds all dependencies for a ClusterManager.
@@ -110,7 +102,7 @@ func NewClusterManager(cfg ClusterManagerConfig) *ClusterManager {
 		trustedPeers: cfg.TrustedPeers,
 		onPinSynced:  cfg.OnPinSynced,
 		onPinRemoved: cfg.OnPinRemoved,
-		clusters:     make(map[string]*EmbeddedCluster),
+		clusters:     make(map[string]*embeddedCluster),
 	}
 }
 
@@ -122,8 +114,7 @@ func (cm *ClusterManager) SetShardPeerProvider(provider ShardPeerProvider) {
 }
 
 // JoinShard initializes a new embedded cluster for the given shard.
-// secret is the deterministically generated shared key for the cluster.
-func (cm *ClusterManager) JoinShard(ctx context.Context, shardID string, bootstrapPeers []multiaddr.Multiaddr) error {
+func (cm *ClusterManager) JoinShard(ctx context.Context, shardID string) error {
 	cm.mu.Lock()
 	defer cm.mu.Unlock()
 
@@ -136,7 +127,7 @@ func (cm *ClusterManager) JoinShard(ctx context.Context, shardID string, bootstr
 
 	// Configure CRDT
 	trustAll := true
-	if cm.cfg.TrustMode == "allowlist" {
+	if cm.cfg.Security.TrustMode == "allowlist" {
 		trustAll = false
 	}
 
@@ -194,7 +185,7 @@ func (cm *ClusterManager) JoinShard(ctx context.Context, shardID string, bootstr
 
 	subCtx, cancel := context.WithCancel(context.Background())
 
-	tracker := NewLocalPinTracker(cm.ipfsClient, shardID, cm.onPinSynced, cm.onPinRemoved, cm.badBits)
+	tracker := newLocalPinTracker(cm.ipfsClient, shardID, cm.onPinSynced, cm.onPinRemoved, cm.badBits)
 	tracker.Start(consensus)
 
 	go func() {
@@ -214,11 +205,9 @@ func (cm *ClusterManager) JoinShard(ctx context.Context, shardID string, bootstr
 		}
 	}()
 
-	cm.clusters[shardID] = &EmbeddedCluster{
-		ShardID:    shardID,
-		Consensus:  consensus,
-		PinTracker: tracker,
-		ctx:        subCtx,
+	cm.clusters[shardID] = &embeddedCluster{
+		consensus:  consensus,
+		pinTracker: tracker,
 		cancel:     cancel,
 	}
 
@@ -237,40 +226,19 @@ func (cm *ClusterManager) LeaveShard(shardID string) error {
 	cm.mu.Unlock()
 
 	slog.Info("shutting down embedded cluster", "shard", shardID)
-	if cluster.PinTracker != nil {
-		cluster.PinTracker.Stop()
+	if cluster.pinTracker != nil {
+		cluster.pinTracker.Stop()
 	}
 	shutCtx, shutCancel := context.WithTimeout(context.Background(), 15*time.Second)
 	defer shutCancel()
 	var shutdownErr error
-	if err := cluster.Consensus.Shutdown(shutCtx); err != nil {
+	if err := cluster.consensus.Shutdown(shutCtx); err != nil {
 		shutdownErr = fmt.Errorf("consensus shutdown for shard %s: %w", shardID, err)
 	}
 	cluster.cancel()
 	return shutdownErr
 }
 
-// Shutdown gracefully shuts down all embedded clusters.
-func (cm *ClusterManager) Shutdown() error {
-	cm.mu.Lock()
-	shards := make([]string, 0, len(cm.clusters))
-	for shardID := range cm.clusters {
-		shards = append(shards, shardID)
-	}
-	cm.mu.Unlock()
-
-	var firstErr error
-	for _, shardID := range shards {
-		if err := cm.LeaveShard(shardID); err != nil {
-			if firstErr == nil {
-				firstErr = err
-			}
-			slog.Error("failed to leave shard during shutdown", "shard", shardID, "error", err)
-		}
-	}
-	return firstErr
-}
-
 // SelectAllocations deterministically chooses n peers from sorted list for the given CID (same CID → same set on all nodes).
 // Exported for tests.
 func SelectAllocations(peers []peer.ID, c cid.Cid, n int) []peer.ID {
@@ -293,7 +261,7 @@ func SelectAllocations(peers []peer.ID, c cid.Cid, n int) []peer.ID {
 	return out
 }
 
-// Pin submits a pin to the shard's cluster. Use context with long timeout (e.g. CRDTOpTimeout).
+// Pin submits a pin to the shard's cluster.
 func (cm *ClusterManager) Pin(ctx context.Context, shardID string, c cid.Cid, replicationFactorMin, replicationFactorMax int) error {
 	cm.mu.RLock()
 	cluster, exists := cm.clusters[shardID]
@@ -308,15 +276,15 @@ func (cm *ClusterManager) Pin(ctx context.Context, shardID string, c cid.Cid, re
 	repMin := replicationFactorMin
 	repMax := replicationFactorMax
 	if repMin < 0 {
-		repMin = cm.cfg.MinReplication
+		repMin = cm.cfg.Replication.MinReplication
 	}
 	if repMax < 0 {
-		repMax = cm.cfg.MaxReplication
+		repMax = cm.cfg.Replication.MaxReplication
 	}
 
 	var allocations []peer.ID
 	if repMin > 0 || repMax > 0 {
-		peers, err := cluster.Consensus.Peers(ctx)
+		peers, err := cluster.consensus.Peers(ctx)
 		if err != nil {
 			slog.Warn("failed to get peers, using full replication", "shard", shardID, "error", err)
 		}
@@ -337,8 +305,8 @@ func (cm *ClusterManager) Pin(ctx context.Context, shardID string, c cid.Cid, re
 	} else {
 		// repMin=0 && repMax=0: full replication mode (used during migration).
 		// Store config defaults as metadata but leave Allocations empty so all nodes pin.
-		repMin = cm.cfg.MinReplication
-		repMax = cm.cfg.MaxReplication
+		repMin = cm.cfg.Replication.MinReplication
+		repMax = cm.cfg.Replication.MaxReplication
 	}
 
 	pin := api.Pin{
@@ -350,7 +318,7 @@ func (cm *ClusterManager) Pin(ctx context.Context, shardID string, c cid.Cid, re
 	pin.ReplicationFactorMin = repMin
 	pin.ReplicationFactorMax = repMax
 
-	if err := cluster.Consensus.LogPin(ctx, pin); err != nil {
+	if err := cluster.consensus.LogPin(ctx, pin); err != nil {
 		return fmt.Errorf("failed to log pin to CRDT: %w", err)
 	}
 
@@ -384,7 +352,7 @@ func (cm *ClusterManager) Unpin(ctx context.Context, shardID string, c cid.Cid)
 		Type: api.DataType,
 	}
 
-	if err := cluster.Consensus.LogUnpin(ctx, pin); err != nil {
+	if err := cluster.consensus.LogUnpin(ctx, pin); err != nil {
 		return fmt.Errorf("failed to log unpin to CRDT: %w", err)
 	}
 
@@ -402,13 +370,11 @@ func (cm *ClusterManager) GetAllocations(ctx context.Context, shardID string, c
 		return nil, fmt.Errorf("not a member of shard %s", shardID)
 	}
 
-	st, err := cluster.Consensus.State(ctx)
+	st, err := cluster.consensus.State(ctx)
 	if err != nil {
 		return nil, err
 	}
 
-	// List streams pins to a channel. Use a cancellable context so the List
-	// goroutine exits promptly when we find our CID and stop reading.
 	listCtx, listCancel := context.WithCancel(ctx)
 	defer listCancel()
 
@@ -426,7 +392,6 @@ func (cm *ClusterManager) GetAllocations(ctx context.Context, shardID string, c
 }
 
 // ListPins returns all pins in the shard's consensus state (CRDT).
-// Useful for migration, replication checks, and API/monitor.
 func (cm *ClusterManager) ListPins(ctx context.Context, shardID string) ([]api.Pin, error) {
 	cm.mu.RLock()
 	cluster, exists := cm.clusters[shardID]
@@ -436,12 +401,11 @@ func (cm *ClusterManager) ListPins(ctx context.Context, shardID string) ([]api.P
 		return nil, fmt.Errorf("not a member of shard %s", shardID)
 	}
 
-	st, err := cluster.Consensus.State(ctx)
+	st, err := cluster.consensus.State(ctx)
 	if err != nil {
 		return nil, err
 	}
 
-	// state.List closes out when done; do not close it here (double-close causes panic).
 	out := make(chan api.Pin)
 	go func() {
 		_ = st.List(ctx, out)
@@ -454,7 +418,6 @@ func (cm *ClusterManager) ListPins(ctx context.Context, shardID string) ([]api.P
 	return pins, nil
 }
 
-// GetPeerCount returns the number of peers in the shard's consensus cluster.
 func (cm *ClusterManager) GetPeerCount(ctx context.Context, shardID string) (int, error) {
 	cm.mu.RLock()
 	cluster, exists := cm.clusters[shardID]
@@ -464,7 +427,7 @@ func (cm *ClusterManager) GetPeerCount(ctx context.Context, shardID string) (int
 		return 0, fmt.Errorf("not a member of shard %s", shardID)
 	}
 
-	peers, err := cluster.Consensus.Peers(ctx)
+	peers, err := cluster.consensus.Peers(ctx)
 	if err != nil {
 		return 0, err
 	}
@@ -476,43 +439,8 @@ func (cm *ClusterManager) TriggerSync(shardID string) {
 	cm.mu.RLock()
 	cluster, exists := cm.clusters[shardID]
 	cm.mu.RUnlock()
-	if !exists || cluster.PinTracker == nil {
+	if !exists || cluster.pinTracker == nil {
 		return
 	}
-	cluster.PinTracker.TriggerSync()
-}
-
-// GetClusterMetrics returns cluster-style metrics per shard for telemetry.
-// Implements telemetry.ClusterInfoProvider.
-func (cm *ClusterManager) GetClusterMetrics(ctx context.Context) (pinsPerShard, peersPerShard, allocationsTotalPerShard map[string]int, err error) {
-	cm.mu.RLock()
-	shardIDs := make([]string, 0, len(cm.clusters))
-	for id := range cm.clusters {
-		shardIDs = append(shardIDs, id)
-	}
-	cm.mu.RUnlock()
-
-	pinsPerShard = make(map[string]int)
-	peersPerShard = make(map[string]int)
-	allocationsTotalPerShard = make(map[string]int)
-
-	for _, shardID := range shardIDs {
-		pins, err := cm.ListPins(ctx, shardID)
-		if err != nil {
-			return nil, nil, nil, err
-		}
-		pinsPerShard[shardID] = len(pins)
-		allocTotal := 0
-		for _, pin := range pins {
-			allocTotal += len(pin.Allocations)
-		}
-		allocationsTotalPerShard[shardID] = allocTotal
-
-		peerCount, err := cm.GetPeerCount(ctx, shardID)
-		if err != nil {
-			return nil, nil, nil, err
-		}
-		peersPerShard[shardID] = peerCount
-	}
-	return pinsPerShard, peersPerShard, allocationsTotalPerShard, nil
+	cluster.pinTracker.TriggerSync()
 }
diff --git a/internal/managers/clusters/manager_test.go b/internal/managers/clusters/manager_test.go
index 2770e33..64bd82b 100644
--- a/internal/managers/clusters/manager_test.go
+++ b/internal/managers/clusters/manager_test.go
@@ -55,7 +55,7 @@ func TestClusterManager_Lifecycle(t *testing.T) {
 
 	// 3. Test JoinShard (Primary Shard "1")
 	shard1 := "1"
-	err = cm.JoinShard(ctx, shard1, nil)
+	err = cm.JoinShard(ctx, shard1)
 	if err != nil {
 		t.Fatalf("JoinShard failed for %s: %v", shard1, err)
 	}
@@ -63,7 +63,7 @@ func TestClusterManager_Lifecycle(t *testing.T) {
 
 	// 4. Test JoinShard (Secondary Shard "10" - Dual Homing)
 	shard10 := "10"
-	err = cm.JoinShard(ctx, shard10, nil)
+	err = cm.JoinShard(ctx, shard10)
 	if err != nil {
 		t.Fatalf("JoinShard failed for %s: %v", shard10, err)
 	}
diff --git a/internal/managers/clusters/pintracker.go b/internal/managers/clusters/pintracker.go
index ff18cb0..e82288b 100644
--- a/internal/managers/clusters/pintracker.go
+++ b/internal/managers/clusters/pintracker.go
@@ -13,30 +13,18 @@ import (
 	"github.com/ipfs/go-cid"
 )
 
-// IPFSPinner defines the minimal IPFS operations needed by the pin tracker.
-type IPFSPinner interface {
+type ipfsPinner interface {
 	PinRecursive(ctx context.Context, c cid.Cid) error
 	IsPinned(ctx context.Context, c cid.Cid) (bool, error)
 	GetBlock(ctx context.Context, c cid.Cid) ([]byte, error)
 }
 
-// OnPinSynced is called when a pin is present locally (after sync or already pinned).
-// Used so the node can register the CID with storage and announce it (e.g. PINNED on pubsub),
-// allowing the monitor to count replication per file.
-type OnPinSynced func(cid string)
-
-// OnPinRemoved is called when we unpin a CID (no longer allocated). Used so storage/heartbeat count stays correct.
-type OnPinRemoved func(cid string)
-
-// LocalPinTracker monitors the CRDT state and syncs it to the local IPFS node.
-// It acts as a bridge between the Cluster Consensus and the actual IPFS Daemon.
-// Tracks which CIDs we pinned from this shard so we can unpin when no longer allocated.
-type LocalPinTracker struct {
-	ipfsClient   IPFSPinner
+type localPinTracker struct {
+	ipfsClient   ipfsPinner
 	badBits      *badbits.Filter
 	shardID      string
-	onPinSynced  OnPinSynced
-	onPinRemoved OnPinRemoved
+	onPinSynced  func(cid string)
+	onPinRemoved func(cid string)
 
 	// State
 	mu sync.RWMutex
@@ -55,7 +43,7 @@ type LocalPinTracker struct {
 // isLegacyManifest fetches the block for a CID and checks if it's a manifest
 // with a legacy timestamp field. Returns false if the block can't be fetched
 // or decoded (non-manifest CIDs, unavailable blocks).
-func (pt *LocalPinTracker) isLegacyManifest(c cid.Cid) bool {
+func (pt *localPinTracker) isLegacyManifest(c cid.Cid) bool {
 	ctx, cancel := context.WithTimeout(pt.ctx, 5*time.Second)
 	defer cancel()
 	data, err := pt.ipfsClient.GetBlock(ctx, c)
@@ -69,9 +57,9 @@ func (pt *LocalPinTracker) isLegacyManifest(c cid.Cid) bool {
 	return ro.HasLegacyTimestamp
 }
 
-func NewLocalPinTracker(ipfsClient IPFSPinner, shardID string, onPinSynced OnPinSynced, onPinRemoved OnPinRemoved, badBits *badbits.Filter) *LocalPinTracker {
+func newLocalPinTracker(ipfsClient ipfsPinner, shardID string, onPinSynced func(string), onPinRemoved func(string), badBits *badbits.Filter) *localPinTracker {
 	ctx, cancel := context.WithCancel(context.Background())
-	return &LocalPinTracker{
+	return &localPinTracker{
 		ipfsClient:   ipfsClient,
 		badBits:      badBits,
 		shardID:      shardID,
@@ -85,7 +73,7 @@ func NewLocalPinTracker(ipfsClient IPFSPinner, shardID string, onPinSynced OnPin
 }
 
 // TriggerSync forces an immediate sync check.
-func (pt *LocalPinTracker) TriggerSync() {
+func (pt *localPinTracker) TriggerSync() {
 	select {
 	case pt.trigger <- struct{}{}:
 	default:
@@ -95,15 +83,15 @@ func (pt *LocalPinTracker) TriggerSync() {
 
 // Start begins monitoring the consensus state and syncing pins.
 // consensusClient is the CRDT component to watch.
-func (pt *LocalPinTracker) Start(consensusClient ConsensusClient) {
-	go pt.syncLoop(consensusClient)
+func (pt *localPinTracker) Start(cc consensusClient) {
+	go pt.syncLoop(cc)
 }
 
-func (pt *LocalPinTracker) Stop() {
+func (pt *localPinTracker) Stop() {
 	pt.cancel()
 }
 
-func (pt *LocalPinTracker) syncLoop(consensus ConsensusClient) {
+func (pt *localPinTracker) syncLoop(consensus consensusClient) {
 	ticker := time.NewTicker(10 * time.Second) // Poll state every 10s so peers replicate sooner
 	defer ticker.Stop()
 
@@ -119,7 +107,7 @@ func (pt *LocalPinTracker) syncLoop(consensus ConsensusClient) {
 	}
 }
 
-func (pt *LocalPinTracker) syncState(consensus ConsensusClient) {
+func (pt *localPinTracker) syncState(consensus consensusClient) {
 	// 1. Get Global State
 	state, err := consensus.State(pt.ctx)
 	if err != nil {
diff --git a/internal/managers/clusters/pintracker_test.go b/internal/managers/clusters/pintracker_test.go
index 3c6a69a..dcfa515 100644
--- a/internal/managers/clusters/pintracker_test.go
+++ b/internal/managers/clusters/pintracker_test.go
@@ -53,7 +53,7 @@ func (m *mockConsensus) Peers(ctx context.Context) ([]peer.ID, error) {
 }
 func (m *mockConsensus) Shutdown(ctx context.Context) error { return nil }
 
-// mockIPFSForTracker implements IPFSPinner and records Pin calls.
+// mockIPFSForTracker implements ipfsPinner and records Pin calls.
 type mockIPFSForTracker struct {
 	mu       sync.Mutex
 	pinCalls []cid.Cid
@@ -75,7 +75,7 @@ func TestPinTracker_allocation_pin(t *testing.T) {
 	ourPeer := testutil.MustPeerID(t, "our")
 	c1, _ := cid.Decode("bafkreigh2akiscaildcqabsyg3dfr6chu3fgpregiymsck7e7aqa4s52zy")
 	ipfs := &mockIPFSForTracker{}
-	pt := NewLocalPinTracker(ipfs, "1", nil, nil, nil)
+	pt := newLocalPinTracker(ipfs, "1", nil, nil, nil)
 
 	state := &mockState{
 		pins: []api.Pin{
@@ -104,7 +104,7 @@ func TestPinTracker_allocation_skip(t *testing.T) {
 	otherPeer := testutil.MustPeerID(t, "other")
 	c1, _ := cid.Decode("bafkreigh2akiscaildcqabsyg3dfr6chu3fgpregiymsck7e7aqa4s52zy")
 	ipfs := &mockIPFSForTracker{}
-	pt := NewLocalPinTracker(ipfs, "1", nil, nil, nil)
+	pt := newLocalPinTracker(ipfs, "1", nil, nil, nil)
 
 	state := &mockState{
 		pins: []api.Pin{
@@ -126,7 +126,7 @@ func TestPinTracker_allocation_skip(t *testing.T) {
 func TestPinTracker_empty_allocations_full_replication(t *testing.T) {
 	c1, _ := cid.Decode("bafkreigh2akiscaildcqabsyg3dfr6chu3fgpregiymsck7e7aqa4s52zy")
 	ipfs := &mockIPFSForTracker{}
-	pt := NewLocalPinTracker(ipfs, "1", nil, nil, nil)
+	pt := newLocalPinTracker(ipfs, "1", nil, nil, nil)
 
 	// Empty Allocations means "pin everywhere" (full replication)
 	state := &mockState{
@@ -155,7 +155,7 @@ func TestPinTracker_tracking_released_when_removed_from_CRDT(t *testing.T) {
 	removed := make([]string, 0)
 	onRemoved := func(cidStr string) { removed = append(removed, cidStr) }
 	ipfs := &mockIPFSForTracker{}
-	pt := NewLocalPinTracker(ipfs, "1", nil, onRemoved, nil)
+	pt := newLocalPinTracker(ipfs, "1", nil, onRemoved, nil)
 
 	stateWithPin := &mockState{
 		pins: []api.Pin{
@@ -178,7 +178,7 @@ func TestPinTracker_tracking_released_when_removed_from_CRDT(t *testing.T) {
 	pt.syncState(consensusEmpty)
 
 	// PinTracker should NOT call UnpinRecursive (migration-safe);
-	// the IPFSPinner interface doesn't even include UnpinRecursive.
+	// the ipfsPinner interface doesn't even include UnpinRecursive.
 	// But onPinRemoved callback should have been called.
 	if len(removed) != 1 {
 		t.Errorf("expected onPinRemoved called once, got %d", len(removed))
diff --git a/internal/managers/shard/shard.go b/internal/managers/shard/shard.go
index 1070214..c52e143 100644
--- a/internal/managers/shard/shard.go
+++ b/internal/managers/shard/shard.go
@@ -9,6 +9,7 @@ import (
 	"sync/atomic"
 	"time"
 
+	"github.com/ipfs/go-cid"
 	pubsub "github.com/libp2p/go-libp2p-pubsub"
 	"github.com/libp2p/go-libp2p/core/host"
 	"github.com/libp2p/go-libp2p/core/peer"
@@ -16,7 +17,6 @@ import (
 	"dlockss/internal/common"
 	"dlockss/internal/config"
 	"dlockss/internal/managers/clusters"
-	"dlockss/internal/telemetry"
 	"dlockss/pkg/ipfs"
 	"dlockss/pkg/schema"
 )
@@ -96,7 +96,6 @@ type ShardManager struct {
 	ipfsClient  ipfs.IPFSClient
 	storageMgr  StorageProvider
 	clusterMgr  clusters.ClusterManagerInterface
-	metrics     *telemetry.MetricsManager
 	signer      MessageAuthenticator
 	rateLimiter *common.RateLimiter
 	nodeName    string
@@ -105,7 +104,7 @@ type ShardManager struct {
 	ingestAllowlist map[peer.ID]struct{}
 
 	// Peer tracking
-	peers *PeerTracker
+	peers *peerTracker
 
 	// Shard membership (protected by mu)
 	mu           sync.RWMutex
@@ -122,17 +121,16 @@ type ShardManager struct {
 	lastShardMove         time.Time // set on ANY shard transition (split, merge, discovery)
 
 	// Message handling (protected by mu)
-	msgCounter            int
 	lastMessageTime       time.Time
 	lastProbeResponseTime time.Time
 
 	// Replication state
-	reshardedFiles             *common.KnownFiles
-	orphanHandoffSent          map[string]map[string]*orphanHandoffInfo
-	replicationRequestMu       sync.Mutex
-	replicationRequestLastSent map[string]time.Time
-	autoReplicationSem         chan struct{}
-	reprovideInFlight          atomic.Bool
+	reshardedFiles    *common.KnownFiles
+	orphanHandoffSent map[string]map[string]*orphanHandoffInfo
+	reprovideInFlight atomic.Bool
+
+	// Replication: request sending and handling (delegated)
+	repl *replicationManager
 
 	// Lifecycle: split/merge/discovery (delegated)
 	lifecycle *lifecycleManager
@@ -151,7 +149,6 @@ type ShardManagerConfig struct {
 	PubSub      *pubsub.PubSub
 	IPFSClient  ipfs.IPFSClient
 	Storage     StorageProvider
-	Metrics     *telemetry.MetricsManager
 	Signer      MessageAuthenticator
 	RateLimiter *common.RateLimiter
 	Cluster     clusters.ClusterManagerInterface
@@ -170,31 +167,29 @@ func NewShardManager(cfg ShardManagerConfig) (*ShardManager, error) {
 		allowlist[pid] = struct{}{}
 	}
 	sm := &ShardManager{
-		ctx:                        cfg.Ctx,
-		cfg:                        cfg.Cfg,
-		h:                          cfg.Host,
-		ps:                         cfg.PubSub,
-		ipfsClient:                 cfg.IPFSClient,
-		storageMgr:                 cfg.Storage,
-		clusterMgr:                 cfg.Cluster,
-		metrics:                    cfg.Metrics,
-		signer:                     cfg.Signer,
-		rateLimiter:                cfg.RateLimiter,
-		nodeName:                   cfg.NodeName,
-		ingestAllowlist:            allowlist,
-		peers:                      NewPeerTracker(cfg.Host.ID()),
-		reshardedFiles:             common.NewKnownFiles(),
-		currentShard:               cfg.StartShard,
-		shardSubs:                  make(map[string]*shardSubscription),
-		probeTopicCache:            make(map[string]*pubsub.Topic),
-		observerOnlyShards:         make(map[string]struct{}),
-		orphanHandoffSent:          make(map[string]map[string]*orphanHandoffInfo),
-		replicationRequestLastSent: make(map[string]time.Time),
-		autoReplicationSem:         make(chan struct{}, cfg.Cfg.MaxConcurrentReplicationChecks),
+		ctx:                cfg.Ctx,
+		cfg:                cfg.Cfg,
+		h:                  cfg.Host,
+		ps:                 cfg.PubSub,
+		ipfsClient:         cfg.IPFSClient,
+		storageMgr:         cfg.Storage,
+		clusterMgr:         cfg.Cluster,
+		signer:             cfg.Signer,
+		rateLimiter:        cfg.RateLimiter,
+		nodeName:           cfg.NodeName,
+		ingestAllowlist:    allowlist,
+		peers:              newPeerTracker(cfg.Host.ID()),
+		reshardedFiles:     common.NewKnownFiles(),
+		currentShard:       cfg.StartShard,
+		shardSubs:          make(map[string]*shardSubscription),
+		probeTopicCache:    make(map[string]*pubsub.Topic),
+		observerOnlyShards: make(map[string]struct{}),
+		orphanHandoffSent:  make(map[string]map[string]*orphanHandoffInfo),
 	}
+	sm.repl = newReplicationManager(sm, cfg.Cfg.Replication.MaxConcurrentReplicationChecks)
 	sm.lifecycle = newLifecycleManager(func() context.Context { return sm.ctx }, cfg.Cfg, sm)
 
-	if err := sm.clusterMgr.JoinShard(cfg.Ctx, cfg.StartShard, nil); err != nil {
+	if err := sm.clusterMgr.JoinShard(cfg.Ctx, cfg.StartShard); err != nil {
 		return nil, fmt.Errorf("join cluster for start shard %s: %w", cfg.StartShard, err)
 	}
 
@@ -214,7 +209,7 @@ func (sm *ShardManager) Run() {
 	go sm.lifecycle.runSplitRebroadcast()
 	go sm.runHeartbeat()
 	go sm.runOrphanUnpinLoop()
-	go sm.runReplicationChecker()
+	go sm.repl.runChecker()
 	go sm.runReannouncePinsLoop()
 	go sm.runReshardedFilesSaveLoop()
 	go sm.runLegacyManifestCleanup()
@@ -281,15 +276,36 @@ func (sm *ShardManager) localPeerID() peer.ID {
 	return sm.h.ID()
 }
 
-func (sm *ShardManager) incrementShardSplits() {
-	sm.metrics.IncrementShardSplits()
+func (sm *ShardManager) pruneStaleSeenPeers() {
+	sm.peers.PruneStale(sm.cfg.Sharding.PruneStalePeersInterval)
 }
 
-func (sm *ShardManager) pruneStaleSeenPeers() {
-	sm.peers.PruneStale(sm.cfg.PruneStalePeersInterval)
+// --- replicationOps implementation ---
+
+func (sm *ShardManager) replicationContext() context.Context { return sm.ctx }
+func (sm *ShardManager) replicationConfig() *config.Config   { return sm.cfg }
+func (sm *ShardManager) getPinnedManifests() []string        { return sm.storageMgr.GetPinnedManifests() }
+func (sm *ShardManager) isPinned(key string) bool            { return sm.storageMgr.IsPinned(key) }
+func (sm *ShardManager) publishCBOR(data []byte, shardID string) {
+	sm.PublishToShardCBOR(data, shardID)
+}
+func (sm *ShardManager) replicationSigner() MessageAuthenticator { return sm.signer }
+func (sm *ShardManager) clusterTriggerSync(shardID string)       { sm.clusterMgr.TriggerSync(shardID) }
+func (sm *ShardManager) ipfsPinRecursive(ctx context.Context, c cid.Cid) error {
+	return sm.ipfsClient.PinRecursive(ctx, c)
+}
+func (sm *ShardManager) ensureCluster(ctx context.Context, shardID string) error {
+	return sm.EnsureClusterForShard(ctx, shardID)
+}
+func (sm *ShardManager) clusterPinIfAbsent(ctx context.Context, shardID string, c cid.Cid) error {
+	return sm.clusterMgr.PinIfAbsent(ctx, shardID, c, -1, -1)
+}
+func (sm *ShardManager) isLegacyManifest(cidStr string) bool {
+	ctx, cancel := context.WithTimeout(sm.ctx, 5*time.Second)
+	defer cancel()
+	return common.IsLegacyManifest(ctx, sm.ipfsClient, cidStr)
 }
 
-// moveToShard switches shard: join new, migrate pins, leave old. Used by split, discovery, merge.
 func (sm *ShardManager) moveToShard(fromShard, toShard string, isMergeUp bool) {
 	sm.mu.Lock()
 	if sm.currentShard != fromShard {
@@ -297,7 +313,6 @@ func (sm *ShardManager) moveToShard(fromShard, toShard string, isMergeUp bool) {
 		return
 	}
 	sm.currentShard = toShard
-	sm.msgCounter = 0
 	sm.reshardedFiles = common.NewKnownFiles()
 	sm.lastShardMove = time.Now()
 	if isMergeUp {
@@ -308,77 +323,84 @@ func (sm *ShardManager) moveToShard(fromShard, toShard string, isMergeUp bool) {
 	sm.mu.Unlock()
 	sm.lifecycle.onShardTransition()
 
-	// Immediately announce departure from the old shard so other peers stop
-	// counting us as ACTIVE.  The actual topic unsubscription happens later
-	// (after ShardOverlapDuration) to allow continued message reception for
-	// data migration, but other nodes need to drop us from their peer counts
-	// now — otherwise stale entries inflate getShardPeerCountForSplit() and
-	// can trigger premature splits.
+	sm.publishLeaveFromShard(fromShard)
+
+	if err := sm.JoinShard(toShard); err != nil {
+		slog.Error("failed to join shard topic", "shard", toShard, "error", err)
+	}
+	if err := sm.clusterMgr.JoinShard(sm.ctx, toShard); err != nil {
+		slog.Error("failed to join cluster for shard", "shard", toShard, "error", err)
+	}
+
+	go sm.schedulePinMigration(fromShard, toShard)
+	go sm.scheduleDelayedLeave(fromShard, toShard)
+	go sm.scheduleReshardPass(fromShard, toShard)
+}
+
+// publishLeaveFromShard announces departure immediately so peers drop us from
+// their active counts, even though the topic stays open for ShardOverlapDuration.
+func (sm *ShardManager) publishLeaveFromShard(fromShard string) {
 	sm.mu.RLock()
-	fromSub, fromSubExists := sm.shardSubs[fromShard]
+	fromSub, exists := sm.shardSubs[fromShard]
 	sm.mu.RUnlock()
-	if fromSubExists && fromSub.topic != nil && !fromSub.observerOnly {
+	if exists && fromSub.topic != nil && !fromSub.observerOnly {
 		leaveMsg := []byte(msgPrefixLeave + sm.h.ID().String())
 		_ = fromSub.topic.Publish(sm.ctx, leaveMsg)
 	}
+}
 
-	if err := sm.JoinShard(toShard); err != nil {
-		slog.Error("failed to join shard topic", "shard", toShard, "error", err)
-	}
-	if err := sm.clusterMgr.JoinShard(sm.ctx, toShard, nil); err != nil {
-		slog.Error("failed to join cluster for shard", "shard", toShard, "error", err)
+func (sm *ShardManager) schedulePinMigration(fromShard, toShard string) {
+	select {
+	case <-sm.ctx.Done():
+		return
+	case <-time.After(migratePinsFlushDelay):
 	}
-	go func() {
-		select {
-		case <-sm.ctx.Done():
-			return
-		case <-time.After(migratePinsFlushDelay):
-		}
-		sm.mu.RLock()
-		current := sm.currentShard
-		sm.mu.RUnlock()
-		if current != toShard {
-			if strings.HasPrefix(current, toShard) {
-				slog.Info("migration redirect", "from", fromShard, "to", current, "intermediate", toShard)
-				if err := sm.clusterMgr.MigratePins(sm.ctx, fromShard, current); err != nil {
-					slog.Error("migration failed", "from", fromShard, "to", current, "error", err)
-				}
+	sm.mu.RLock()
+	current := sm.currentShard
+	sm.mu.RUnlock()
+	if current != toShard {
+		if strings.HasPrefix(current, toShard) {
+			slog.Info("migration redirect", "from", fromShard, "to", current, "intermediate", toShard)
+			if err := sm.clusterMgr.MigratePins(sm.ctx, fromShard, current); err != nil {
+				slog.Error("migration failed", "from", fromShard, "to", current, "error", err)
 			}
-			return
 		}
-		if err := sm.clusterMgr.MigratePins(sm.ctx, fromShard, toShard); err != nil {
-			slog.Error("migration failed", "from", fromShard, "to", toShard, "error", err)
-		}
-	}()
-	go func() {
-		select {
-		case <-sm.ctx.Done():
-			return
-		case <-time.After(sm.cfg.ShardOverlapDuration):
-		}
-		sm.mu.RLock()
-		current := sm.currentShard
-		sm.mu.RUnlock()
-		if current == fromShard {
-			return // we moved back to fromShard, don't leave it
-		}
-		sm.LeaveShard(fromShard)
-		if err := sm.clusterMgr.LeaveShard(fromShard); err != nil {
-			slog.Error("failed to leave cluster", "shard", fromShard, "error", err)
-		}
-	}()
-	go func() {
-		select {
-		case <-sm.ctx.Done():
-			return
-		case <-time.After(sm.cfg.ReshardDelay):
-		}
-		sm.mu.RLock()
-		current := sm.currentShard
-		sm.mu.RUnlock()
-		if current != toShard {
-			return // another transition happened, skip stale reshard
-		}
-		sm.RunReshardPass(fromShard, toShard)
-	}()
+		return
+	}
+	if err := sm.clusterMgr.MigratePins(sm.ctx, fromShard, toShard); err != nil {
+		slog.Error("migration failed", "from", fromShard, "to", toShard, "error", err)
+	}
+}
+
+func (sm *ShardManager) scheduleDelayedLeave(fromShard, toShard string) {
+	select { // wait out ShardOverlapDuration before leaving fromShard, or stop on shutdown
+	case <-sm.ctx.Done():
+		return
+	case <-time.After(sm.cfg.Sharding.ShardOverlapDuration):
+	}
+	sm.mu.RLock()
+	current := sm.currentShard
+	sm.mu.RUnlock()
+	if current == fromShard {
+		return // we moved back to fromShard in the meantime; don't leave it
+	}
+	sm.LeaveShard(fromShard)
+	if err := sm.clusterMgr.LeaveShard(fromShard); err != nil {
+		slog.Error("failed to leave cluster", "shard", fromShard, "error", err)
+	}
+}
+
+func (sm *ShardManager) scheduleReshardPass(fromShard, toShard string) {
+	select { // wait out the configured ReshardDelay, or stop on shutdown
+	case <-sm.ctx.Done():
+		return
+	case <-time.After(sm.cfg.Files.ReshardDelay):
+	}
+	sm.mu.RLock()
+	current := sm.currentShard
+	sm.mu.RUnlock()
+	if current != toShard {
+		return // another transition happened; skip the stale reshard pass
+	}
+	sm.RunReshardPass(fromShard, toShard)
 }
diff --git a/internal/managers/shard/shard_behavior_test.go b/internal/managers/shard/shard_behavior_test.go
index 7599b5a..93beffa 100644
--- a/internal/managers/shard/shard_behavior_test.go
+++ b/internal/managers/shard/shard_behavior_test.go
@@ -13,7 +13,6 @@ import (
 
 	"dlockss/internal/config"
 	"dlockss/internal/managers/storage"
-	"dlockss/internal/telemetry"
 	"dlockss/internal/testutil"
 )
 
@@ -32,10 +31,9 @@ func newTestShardManager(t *testing.T, ctx context.Context, startShard string) *
 		t.Fatal(err)
 	}
 
-	metrics := telemetry.NewMetricsManager(config.DefaultConfig())
 	dht := &testutil.MockDHTProvider{}
 	cfg := config.DefaultConfig()
-	storageMgr := storage.NewStorageManager(cfg, dht, metrics, nil)
+	storageMgr := storage.NewStorageManager(cfg, dht, nil)
 	clusterMgr := &testutil.MockClusterManager{}
 
 	sm, err := NewShardManager(ShardManagerConfig{
@@ -45,7 +43,6 @@ func newTestShardManager(t *testing.T, ctx context.Context, startShard string) *
 		PubSub:     ps,
 		IPFSClient: &testutil.MockIPFSClient{},
 		Storage:    storageMgr,
-		Metrics:    metrics,
 		Cluster:    clusterMgr,
 		StartShard: startShard,
 	})
@@ -60,7 +57,7 @@ func populateFakeActivePeers(sm *ShardManager, shardID string, count int) []peer
 	var peers []peer.ID
 	for i := 0; i < count; i++ {
 		pid := peer.ID(fmt.Sprintf("fake-active-peer-%d", i))
-		sm.peers.RecordRole(shardID, pid, RoleActive)
+		sm.peers.RecordRole(shardID, pid, roleActive)
 		peers = append(peers, pid)
 	}
 	return peers
@@ -74,19 +71,19 @@ func TestCountActivePeers_OnlyCountsActive(t *testing.T) {
 	sm := newTestShardManager(t, ctx, "0")
 
 	shard := "0"
-	sm.peers.RecordRole(shard, "peer-active-1", RoleActive)
-	sm.peers.RecordRole(shard, "peer-active-2", RoleActive)
-	sm.peers.RecordRole(shard, "peer-passive-1", RolePassive)
-	sm.peers.RecordRole(shard, "peer-probe-1", RoleProbe)
+	sm.peers.RecordRole(shard, "peer-active-1", roleActive)
+	sm.peers.RecordRole(shard, "peer-active-2", roleActive)
+	sm.peers.RecordRole(shard, "peer-passive-1", rolePassive)
+	sm.peers.RecordRole(shard, "peer-probe-1", roleProbe)
 
 	// includeSelf=true: should count 2 active peers + self = 3
-	count := sm.peers.CountActive(shard, true, "0", sm.cfg.SeenPeersWindow)
+	count := sm.peers.CountActive(shard, true, "0", sm.cfg.Sharding.SeenPeersWindow)
 	if count != 3 {
 		t.Errorf("expected 3 (2 active + self), got %d", count)
 	}
 
 	// includeSelf=false: should count only 2 active peers
-	count = sm.peers.CountActive(shard, false, "0", sm.cfg.SeenPeersWindow)
+	count = sm.peers.CountActive(shard, false, "0", sm.cfg.Sharding.SeenPeersWindow)
 	if count != 2 {
 		t.Errorf("expected 2 active peers, got %d", count)
 	}
@@ -99,10 +96,9 @@ func TestCountActivePeers_ExcludesStaleEntries(t *testing.T) {
 
 	shard := "0"
 	now := time.Now()
-	sm.peers.RecordRole(shard, "peer-fresh", RoleActive)
-	// Inject a stale entry by writing directly (RecordRole always uses time.Now())
+	sm.peers.RecordRole(shard, "peer-fresh", roleActive)
 	sm.peers.mu.Lock()
-	sm.peers.roles[shard]["peer-stale"] = PeerRoleInfo{Role: RoleActive, LastSeen: now.Add(-10 * time.Minute)}
+	sm.peers.roles[shard]["peer-stale"] = peerRoleInfo{role: roleActive, lastSeen: now.Add(-10 * time.Minute)}
 	sm.peers.mu.Unlock()
 
 	// With a 5-minute window, only the fresh peer should count
@@ -125,17 +121,17 @@ func TestCountActivePeers_ExcludesSelf(t *testing.T) {
 
 	shard := "0"
 	selfID := sm.h.ID()
-	sm.peers.RecordRole(shard, selfID, RoleActive)
-	sm.peers.RecordRole(shard, "other-peer", RoleActive)
+	sm.peers.RecordRole(shard, selfID, roleActive)
+	sm.peers.RecordRole(shard, "other-peer", roleActive)
 
 	// Self should not be double-counted. With includeSelf=true, self is added once
 	// by the function, not counted from the map.
-	count := sm.peers.CountActive(shard, true, "0", sm.cfg.SeenPeersWindow)
+	count := sm.peers.CountActive(shard, true, "0", sm.cfg.Sharding.SeenPeersWindow)
 	if count != 2 {
 		t.Errorf("expected 2 (1 other + 1 self), got %d", count)
 	}
 
-	count = sm.peers.CountActive(shard, false, "0", sm.cfg.SeenPeersWindow)
+	count = sm.peers.CountActive(shard, false, "0", sm.cfg.Sharding.SeenPeersWindow)
 	if count != 1 {
 		t.Errorf("expected 1 (only other peer), got %d", count)
 	}
@@ -148,11 +144,11 @@ func TestMergeRefusal_HealthyShardEmptySibling(t *testing.T) {
 	defer cancel()
 	sm := newTestShardManager(t, ctx, "00")
 
-	sm.cfg.ProbeTimeoutMerge = 100 * time.Millisecond
-	sm.cfg.MergeUpCooldown = 50 * time.Millisecond
-	sm.cfg.SiblingEmptyMergeAfter = 50 * time.Millisecond
+	sm.cfg.Sharding.ProbeTimeoutMerge = 100 * time.Millisecond
+	sm.cfg.Sharding.MergeUpCooldown = 50 * time.Millisecond
+	sm.cfg.Sharding.SiblingEmptyMergeAfter = 50 * time.Millisecond
 
-	populateFakeActivePeers(sm, "00", sm.cfg.MinPeersPerShard+2)
+	populateFakeActivePeers(sm, "00", sm.cfg.Sharding.MinPeersPerShard+2)
 
 	// Set lastMoveToDeeperShard far enough in the past to pass both cooldown and siblingEmptyMergeAfter
 	sm.mu.Lock()
@@ -174,12 +170,12 @@ func TestMergeAllowed_UnderstaffedShardEmptySibling(t *testing.T) {
 	defer cancel()
 	sm := newTestShardManager(t, ctx, "00")
 
-	sm.cfg.ProbeTimeoutMerge = 100 * time.Millisecond
-	sm.cfg.MergeUpCooldown = 50 * time.Millisecond
-	sm.cfg.SiblingEmptyMergeAfter = 50 * time.Millisecond
+	sm.cfg.Sharding.ProbeTimeoutMerge = 100 * time.Millisecond
+	sm.cfg.Sharding.MergeUpCooldown = 50 * time.Millisecond
+	sm.cfg.Sharding.SiblingEmptyMergeAfter = 50 * time.Millisecond
 
-	if sm.cfg.MinPeersPerShard > 2 {
-		populateFakeActivePeers(sm, "00", sm.cfg.MinPeersPerShard-2)
+	if sm.cfg.Sharding.MinPeersPerShard > 2 {
+		populateFakeActivePeers(sm, "00", sm.cfg.Sharding.MinPeersPerShard-2)
 	}
 
 	// Set lastMoveToDeeperShard far enough in the past
@@ -202,8 +198,8 @@ func TestMergeRefusal_CooldownPreventsEarlyMerge(t *testing.T) {
 	defer cancel()
 	sm := newTestShardManager(t, ctx, "00")
 
-	sm.cfg.ProbeTimeoutMerge = 100 * time.Millisecond
-	sm.cfg.MergeUpCooldown = 10 * time.Minute
+	sm.cfg.Sharding.ProbeTimeoutMerge = 100 * time.Millisecond
+	sm.cfg.Sharding.MergeUpCooldown = 10 * time.Minute
 
 	// Set lastMoveToDeeperShard to very recently (within cooldown)
 	sm.mu.Lock()
@@ -255,10 +251,9 @@ func TestMoveToShard_PublishesLeave(t *testing.T) {
 	}
 
 	// Set up ShardManager on h1 starting in shard "0"
-	metrics := telemetry.NewMetricsManager(config.DefaultConfig())
 	dht := &testutil.MockDHTProvider{}
 	cfg1 := config.DefaultConfig()
-	storageMgr := storage.NewStorageManager(cfg1, dht, metrics, nil)
+	storageMgr := storage.NewStorageManager(cfg1, dht, nil)
 	clusterMgr := &testutil.MockClusterManager{}
 	sm, err := NewShardManager(ShardManagerConfig{
 		Cfg:        cfg1,
@@ -267,7 +262,6 @@ func TestMoveToShard_PublishesLeave(t *testing.T) {
 		PubSub:     ps1,
 		IPFSClient: &testutil.MockIPFSClient{},
 		Storage:    storageMgr,
-		Metrics:    metrics,
 		Cluster:    clusterMgr,
 		StartShard: "0",
 	})
@@ -355,10 +349,9 @@ func TestProcessMessage_ProbeTriggersHeartbeat(t *testing.T) {
 	}
 
 	// Set up ShardManager on h1 in shard "0"
-	metrics := telemetry.NewMetricsManager(config.DefaultConfig())
 	dht := &testutil.MockDHTProvider{}
 	cfg2 := config.DefaultConfig()
-	storageMgr := storage.NewStorageManager(cfg2, dht, metrics, nil)
+	storageMgr := storage.NewStorageManager(cfg2, dht, nil)
 	clusterMgr := &testutil.MockClusterManager{}
 	sm, err := NewShardManager(ShardManagerConfig{
 		Cfg:        cfg2,
@@ -367,7 +360,6 @@ func TestProcessMessage_ProbeTriggersHeartbeat(t *testing.T) {
 		PubSub:     ps1,
 		IPFSClient: &testutil.MockIPFSClient{},
 		Storage:    storageMgr,
-		Metrics:    metrics,
 		Cluster:    clusterMgr,
 		StartShard: "0",
 	})
diff --git a/internal/managers/shard/shard_discovery.go b/internal/managers/shard/shard_discovery.go
deleted file mode 100644
index 8e19a71..0000000
--- a/internal/managers/shard/shard_discovery.go
+++ /dev/null
@@ -1,9 +0,0 @@
-package shard
-
-import "time"
-
-const (
-	discoveryIntervalOnRoot       = 10 * time.Second
-	probeTimeoutDiscovery         = 12 * time.Second
-	discoveryIntervalWithChildren = 45 * time.Second
-)
diff --git a/internal/managers/shard/shard_legacy.go b/internal/managers/shard/shard_legacy.go
new file mode 100644
index 0000000..eedf1bb
--- /dev/null
+++ b/internal/managers/shard/shard_legacy.go
@@ -0,0 +1,88 @@
+package shard
+
+import (
+	"log/slog"
+	"time"
+
+	"github.com/ipfs/go-cid"
+)
+
+// legacyCleanupInterval is the period between legacy-manifest cleanup passes.
+const legacyCleanupInterval = 5 * time.Minute
+
+// runLegacyManifestCleanup runs one cleanup pass 30 seconds after start and
+// then one every legacyCleanupInterval, returning when sm.ctx is cancelled.
+func (sm *ShardManager) runLegacyManifestCleanup() {
+	select {
+	case <-sm.ctx.Done():
+		return
+	case <-time.After(30 * time.Second):
+	}
+	sm.cleanupLegacyManifests()
+
+	ticker := time.NewTicker(legacyCleanupInterval)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-sm.ctx.Done():
+			return
+		case <-ticker.C:
+			sm.cleanupLegacyManifests()
+		}
+	}
+}
+
+// cleanupLegacyManifests unpins every manifest reported by the storage manager
+// that is identified as a legacy manifest: from the current shard's cluster
+// (when a shard is set), from IPFS, and from the storage manager itself.
+// Unpin errors are logged and do not abort the pass.
+func (sm *ShardManager) cleanupLegacyManifests() {
+	manifests := sm.storageMgr.GetPinnedManifests()
+	if len(manifests) == 0 {
+		return
+	}
+
+	// Snapshot the shard once; the whole pass unpins against this value.
+	sm.mu.RLock()
+	currentShard := sm.currentShard
+	sm.mu.RUnlock()
+
+	removed := 0
+	for _, manifestCIDStr := range manifests {
+		select {
+		case <-sm.ctx.Done():
+			return
+		default:
+		}
+		// sm.isLegacyManifest bounds the legacy check with a 5s timeout, so one
+		// unreachable manifest cannot stall the rest of the pass.
+		if !sm.isLegacyManifest(manifestCIDStr) {
+			continue
+		}
+		manifestCID, err := cid.Decode(manifestCIDStr)
+		if err != nil {
+			continue
+		}
+		slog.Info("removing legacy manifest", "manifest", manifestCIDStr)
+		if currentShard != "" {
+			if err := sm.clusterMgr.Unpin(sm.ctx, currentShard, manifestCID); err != nil {
+				slog.Error("cluster unpin failed for legacy manifest", "manifest", manifestCIDStr, "error", err)
+			}
+		}
+		if err := sm.ipfsClient.UnpinRecursive(sm.ctx, manifestCID); err != nil {
+			slog.Error("IPFS unpin failed for legacy manifest", "manifest", manifestCIDStr, "error", err)
+		}
+		sm.storageMgr.UnpinFile(manifestCIDStr)
+		removed++
+
+		// Pace removals, but return promptly on shutdown instead of sleeping.
+		select {
+		case <-sm.ctx.Done():
+			return
+		case <-time.After(50 * time.Millisecond):
+		}
+	}
+	if removed > 0 {
+		slog.Info("legacy manifest cleanup complete", "removed", removed)
+	}
+}
diff --git a/internal/managers/shard/shard_lifecycle.go b/internal/managers/shard/shard_lifecycle.go
index 62872cf..8102870 100644
--- a/internal/managers/shard/shard_lifecycle.go
+++ b/internal/managers/shard/shard_lifecycle.go
@@ -14,7 +14,12 @@ import (
 	"github.com/libp2p/go-libp2p/core/peer"
 )
 
-const probeTimeoutForSplitChild = 6 * time.Second
+const (
+	probeTimeoutForSplitChild     = 6 * time.Second
+	discoveryIntervalOnRoot       = 10 * time.Second
+	probeTimeoutDiscovery         = 12 * time.Second
+	discoveryIntervalWithChildren = 45 * time.Second
+)
 
 // lifecycleOps is the narrow interface that lifecycleManager uses to query
 // shard state and execute transitions.  ShardManager implements it.
@@ -26,14 +31,12 @@ type lifecycleOps interface {
 	getLastMessageTime() time.Time
 	localPeerID() peer.ID
 
-	getShardPeerCount() int
-	getShardPeerCountForSplit() int
+	getShardPeerCount(useMeshFallback bool) int
 	probeShard(shardID string, timeout time.Duration) int
 
 	moveToShard(from, to string, isMergeUp bool)
 	announceSplit(parentShard, targetChild string)
 	rebroadcastSplitToAncestors()
-	incrementShardSplits()
 
 	pruneStaleSeenPeers()
 }
@@ -106,11 +109,11 @@ func (lm *lifecycleManager) checkAndSplitIfNeeded() {
 	currentShard := lm.ops.getCurrentShard()
 
 	lastShardMove := lm.ops.getLastShardMove()
-	if !lastShardMove.IsZero() && now.Sub(lastShardMove) < lm.cfg.ShardMoveCooldown {
+	if !lastShardMove.IsZero() && now.Sub(lastShardMove) < lm.cfg.Sharding.ShardMoveCooldown {
 		return
 	}
 
-	interval := lm.cfg.ShardPeerCheckInterval
+	interval := lm.cfg.Sharding.ShardPeerCheckInterval
 	if currentShard == "" {
 		interval = rootPeerCheckInterval
 	}
@@ -122,8 +125,8 @@ func (lm *lifecycleManager) checkAndSplitIfNeeded() {
 	lm.lastPeerCheck = now
 	lm.mu.Unlock()
 
-	peerCount := lm.ops.getShardPeerCountForSplit()
-	if peerCount < lm.cfg.MaxPeersPerShard {
+	peerCount := lm.ops.getShardPeerCount(false)
+	if peerCount < lm.cfg.Sharding.MaxPeersPerShard {
 		lm.mu.Lock()
 		lm.splitAboveThresholdCount = 0
 		lm.mu.Unlock()
@@ -135,13 +138,13 @@ func (lm *lifecycleManager) checkAndSplitIfNeeded() {
 	count := lm.splitAboveThresholdCount
 	lm.mu.Unlock()
 	if count < 2 {
-		slog.Debug("waiting for 2nd consecutive check before split", "shard", currentShard, "peers", peerCount, "max_peers", lm.cfg.MaxPeersPerShard)
+		slog.Debug("waiting for 2nd consecutive check before split", "shard", currentShard, "peers", peerCount, "max_peers", lm.cfg.Sharding.MaxPeersPerShard)
 		return
 	}
 
 	estimatedPerChild := peerCount / 2
-	if estimatedPerChild < lm.cfg.MinPeersPerShard {
-		slog.Debug("split would leave too few peers per child", "shard", currentShard, "peers", peerCount, "max_peers", lm.cfg.MaxPeersPerShard, "estimated_per_child", estimatedPerChild, "min_peers", lm.cfg.MinPeersPerShard)
+	if estimatedPerChild < lm.cfg.Sharding.MinPeersPerShard {
+		slog.Debug("split would leave too few peers per child", "shard", currentShard, "peers", peerCount, "max_peers", lm.cfg.Sharding.MaxPeersPerShard, "estimated_per_child", estimatedPerChild, "min_peers", lm.cfg.Sharding.MinPeersPerShard)
 		return
 	}
 
@@ -150,7 +153,7 @@ func (lm *lifecycleManager) checkAndSplitIfNeeded() {
 	childPeerCount := lm.ops.probeShard(targetChild, probeTimeoutForSplitChild)
 
 	canJoinExisting := childPeerCount >= 1
-	minParentToCreate := 2 * lm.cfg.MinPeersPerShard
+	minParentToCreate := 2 * lm.cfg.Sharding.MinPeersPerShard
 	minParentToCreateNew := minParentToCreate + 2
 	canCreateChild := childPeerCount == 0 && peerCount >= minParentToCreateNew
 	if !canJoinExisting && !canCreateChild {
@@ -159,7 +162,6 @@ func (lm *lifecycleManager) checkAndSplitIfNeeded() {
 	}
 
 	slog.Info("shard at limit, splitting", "shard", currentShard, "peers", peerCount, "child", targetChild, "child_peers", childPeerCount)
-	lm.ops.incrementShardSplits()
 	lm.ops.announceSplit(currentShard, targetChild)
 	lm.ops.moveToShard(currentShard, targetChild, false)
 }
@@ -173,43 +175,43 @@ func (lm *lifecycleManager) checkAndMergeUpIfAlone() {
 	}
 
 	lastAnyMove := lm.ops.getLastShardMove()
-	if !lastAnyMove.IsZero() && time.Since(lastAnyMove) < lm.cfg.ShardMoveCooldown {
+	if !lastAnyMove.IsZero() && time.Since(lastAnyMove) < lm.cfg.Sharding.ShardMoveCooldown {
 		return
 	}
 
 	lastMove := lm.ops.getLastMoveToDeeperShard()
 	parentShard := currentShard[:len(currentShard)-1]
-	if !lastMove.IsZero() && time.Since(lastMove) < lm.cfg.MergeUpCooldown {
-		slog.Debug("merge-up skipped, moved to deeper shard recently", "cooldown_elapsed", time.Since(lastMove).Round(time.Second), "cooldown", lm.cfg.MergeUpCooldown)
+	if !lastMove.IsZero() && time.Since(lastMove) < lm.cfg.Sharding.MergeUpCooldown {
+		slog.Debug("merge-up skipped, moved to deeper shard recently", "cooldown_elapsed", time.Since(lastMove).Round(time.Second), "cooldown", lm.cfg.Sharding.MergeUpCooldown)
 		return
 	}
 
-	currentPeerCount := lm.ops.getShardPeerCount()
-	parentPeerCount := lm.ops.probeShard(parentShard, lm.cfg.ProbeTimeoutMerge)
-	if parentPeerCount >= lm.cfg.MaxPeersPerShard {
+	currentPeerCount := lm.ops.getShardPeerCount(true)
+	parentPeerCount := lm.ops.probeShard(parentShard, lm.cfg.Sharding.ProbeTimeoutMerge)
+	if parentPeerCount >= lm.cfg.Sharding.MaxPeersPerShard {
 		return
 	}
 
 	siblingShard := getSiblingShard(currentShard)
-	siblingPeerCount := lm.ops.probeShard(siblingShard, lm.cfg.ProbeTimeoutMerge)
+	siblingPeerCount := lm.ops.probeShard(siblingShard, lm.cfg.Sharding.ProbeTimeoutMerge)
 	siblingsTotal := currentPeerCount + siblingPeerCount
 
 	if siblingPeerCount == 0 {
-		if lastMove.IsZero() || time.Since(lastMove) < lm.cfg.SiblingEmptyMergeAfter {
+		if lastMove.IsZero() || time.Since(lastMove) < lm.cfg.Sharding.SiblingEmptyMergeAfter {
 			slog.Debug("sibling empty, possible split in progress", "shard", currentShard, "peers", currentPeerCount, "sibling", siblingShard)
 			return
 		}
-		if currentPeerCount >= lm.cfg.MinPeersPerShard {
-			slog.Debug("sibling empty but we are healthy, not merging", "shard", currentShard, "peers", currentPeerCount, "min_peers", lm.cfg.MinPeersPerShard, "sibling", siblingShard)
+		if currentPeerCount >= lm.cfg.Sharding.MinPeersPerShard {
+			slog.Debug("sibling empty but we are healthy, not merging", "shard", currentShard, "peers", currentPeerCount, "min_peers", lm.cfg.Sharding.MinPeersPerShard, "sibling", siblingShard)
 			return
 		}
-		slog.Info("merging up, sibling empty too long", "shard", currentShard, "peers", currentPeerCount, "min_peers", lm.cfg.MinPeersPerShard, "sibling", siblingShard, "empty_after", lm.cfg.SiblingEmptyMergeAfter, "target", parentShard)
+		slog.Info("merging up, sibling empty too long", "shard", currentShard, "peers", currentPeerCount, "min_peers", lm.cfg.Sharding.MinPeersPerShard, "sibling", siblingShard, "empty_after", lm.cfg.Sharding.SiblingEmptyMergeAfter, "target", parentShard)
 		lm.ops.moveToShard(currentShard, parentShard, true)
 		return
 	}
 
-	if siblingsTotal >= lm.cfg.MinPeersAcrossSiblings {
-		slog.Debug("siblings have enough peers, not merging", "shard", currentShard, "peers", currentPeerCount, "sibling", siblingShard, "sibling_peers", siblingPeerCount, "total", siblingsTotal, "min_across_siblings", lm.cfg.MinPeersAcrossSiblings)
+	if siblingsTotal >= lm.cfg.Sharding.MinPeersAcrossSiblings {
+		slog.Debug("siblings have enough peers, not merging", "shard", currentShard, "peers", currentPeerCount, "sibling", siblingShard, "sibling_peers", siblingPeerCount, "total", siblingsTotal, "min_across_siblings", lm.cfg.Sharding.MinPeersAcrossSiblings)
 		return
 	}
 	if siblingPeerCount > 0 && currentPeerCount > siblingPeerCount {
@@ -217,7 +219,7 @@ func (lm *lifecycleManager) checkAndMergeUpIfAlone() {
 		return
 	}
 
-	slog.Info("siblings below threshold, merging up", "total", siblingsTotal, "min_across_siblings", lm.cfg.MinPeersAcrossSiblings, "shard", currentShard, "peers", currentPeerCount, "sibling", siblingShard, "sibling_peers", siblingPeerCount, "target", parentShard)
+	slog.Info("siblings below threshold, merging up", "total", siblingsTotal, "min_across_siblings", lm.cfg.Sharding.MinPeersAcrossSiblings, "shard", currentShard, "peers", currentPeerCount, "sibling", siblingShard, "sibling_peers", siblingPeerCount, "target", parentShard)
 
 	lm.ops.moveToShard(currentShard, parentShard, true)
 }
@@ -228,13 +230,13 @@ func (lm *lifecycleManager) discoverAndMoveToDeeperShard() {
 	currentShard := lm.ops.getCurrentShard()
 
 	lastAnyMove := lm.ops.getLastShardMove()
-	if !lastAnyMove.IsZero() && time.Since(lastAnyMove) < lm.cfg.ShardMoveCooldown {
+	if !lastAnyMove.IsZero() && time.Since(lastAnyMove) < lm.cfg.Sharding.ShardMoveCooldown {
 		return
 	}
 
 	lastMerge := lm.ops.getLastMergeUpTime()
-	if !lastMerge.IsZero() && time.Since(lastMerge) < lm.cfg.MergeUpCooldown {
-		slog.Debug("skipped discovery, merged recently", "shard", currentShard, "cooldown_elapsed", time.Since(lastMerge).Round(time.Second), "cooldown", lm.cfg.MergeUpCooldown)
+	if !lastMerge.IsZero() && time.Since(lastMerge) < lm.cfg.Sharding.MergeUpCooldown {
+		slog.Debug("skipped discovery, merged recently", "shard", currentShard, "cooldown_elapsed", time.Since(lastMerge).Round(time.Second), "cooldown", lm.cfg.Sharding.MergeUpCooldown)
 		return
 	}
 
@@ -266,13 +268,13 @@ func (lm *lifecycleManager) discoverAndMoveToDeeperShard() {
 	siblingPeerCount := lm.ops.probeShard(siblingShard, probeTimeoutDiscovery)
 	ourChildAfter := childPeerCount + 1
 	pairTotalAfter := ourChildAfter + siblingPeerCount
-	if pairTotalAfter < lm.cfg.MinPeersAcrossSiblings {
-		parentPeerCount := lm.ops.getShardPeerCount()
+	if pairTotalAfter < lm.cfg.Sharding.MinPeersAcrossSiblings {
+		parentPeerCount := lm.ops.getShardPeerCount(true)
 		projectedPairTotal := pairTotalAfter + (parentPeerCount - 1)
-		if projectedPairTotal >= lm.cfg.MinPeersAcrossSiblings {
-			slog.Info("pair total below threshold but projected allows join", "shard", currentShard, "pair_total", pairTotalAfter, "min_across_siblings", lm.cfg.MinPeersAcrossSiblings, "projected_total", projectedPairTotal, "parent_peers", parentPeerCount)
+		if projectedPairTotal >= lm.cfg.Sharding.MinPeersAcrossSiblings {
+			slog.Info("pair total below threshold but projected allows join", "shard", currentShard, "pair_total", pairTotalAfter, "min_across_siblings", lm.cfg.Sharding.MinPeersAcrossSiblings, "projected_total", projectedPairTotal, "parent_peers", parentPeerCount)
 		} else {
-			slog.Debug("pair total below threshold, not joining", "shard", currentShard, "child", targetChild, "child_after_join", ourChildAfter, "sibling", siblingShard, "sibling_peers", siblingPeerCount, "pair_total", pairTotalAfter, "projected_total", projectedPairTotal, "parent_peers", parentPeerCount, "min_across_siblings", lm.cfg.MinPeersAcrossSiblings)
+			slog.Debug("pair total below threshold, not joining", "shard", currentShard, "child", targetChild, "child_after_join", ourChildAfter, "sibling", siblingShard, "sibling_peers", siblingPeerCount, "pair_total", pairTotalAfter, "projected_total", projectedPairTotal, "parent_peers", parentPeerCount, "min_across_siblings", lm.cfg.Sharding.MinPeersAcrossSiblings)
 			return
 		}
 	}
@@ -302,7 +304,7 @@ func (lm *lifecycleManager) runShardDiscovery() {
 	for {
 		currentShard := lm.ops.getCurrentShard()
 
-		interval := lm.cfg.ShardDiscoveryInterval
+		interval := lm.cfg.Sharding.ShardDiscoveryInterval
 		if currentShard == "" {
 			interval = discoveryIntervalOnRoot
 		} else if lm.hasKnownChildren(currentShard) {
@@ -323,8 +325,8 @@ func (lm *lifecycleManager) runShardDiscovery() {
 			lt := lm.ops.getLastMessageTime()
 			return lt.IsZero() || time.Since(lt) > 1*time.Minute
 		}()
-		peerCount := lm.ops.getShardPeerCountForSplit()
-		fewPeersInShard := peerCount <= lm.cfg.MaxPeersPerShard
+		peerCount := lm.ops.getShardPeerCount(false)
+		fewPeersInShard := peerCount <= lm.cfg.Sharding.MaxPeersPerShard
 		onRoot := currentShard == ""
 		hasChildren := lm.hasKnownChildren(currentShard)
 		if !hasChildren && !isIdle && !fewPeersInShard && !onRoot {
@@ -338,12 +340,12 @@ func (lm *lifecycleManager) runShardDiscovery() {
 }
 
 func (lm *lifecycleManager) runSplitRebroadcast() {
-	jitterRange := lm.cfg.ShardSplitRebroadcastInterval / 2
+	jitterRange := lm.cfg.Sharding.ShardSplitRebroadcastInterval / 2
 	if jitterRange < time.Second {
 		jitterRange = time.Second
 	}
 	for {
-		delay := lm.cfg.ShardSplitRebroadcastInterval + time.Duration(rand.Int63n(int64(jitterRange)))
+		delay := lm.cfg.Sharding.ShardSplitRebroadcastInterval + time.Duration(rand.Int63n(int64(jitterRange)))
 		t := time.NewTimer(delay)
 		select {
 		case <-lm.ctx().Done():
@@ -354,12 +356,3 @@ func (lm *lifecycleManager) runSplitRebroadcast() {
 		}
 	}
 }
-
-// splitShard is a test helper that forces a split to the target child.
-func (lm *lifecycleManager) splitShard() {
-	currentShard := lm.ops.getCurrentShard()
-	nextDepth := len(currentShard) + 1
-	targetChild := common.GetBinaryPrefix(lm.ops.localPeerID().String(), nextDepth)
-	lm.ops.incrementShardSplits()
-	lm.ops.moveToShard(currentShard, targetChild, false)
-}
diff --git a/internal/managers/shard/shard_loops.go b/internal/managers/shard/shard_loops.go
index bdddf50..73105f0 100644
--- a/internal/managers/shard/shard_loops.go
+++ b/internal/managers/shard/shard_loops.go
@@ -5,160 +5,27 @@ import (
 	"context"
 	"fmt"
 	"log/slog"
-	"sync"
-	"sync/atomic"
 	"time"
 
+	"dlockss/internal/common"
+	"dlockss/pkg/schema"
+
 	"github.com/ipfs/go-cid"
 	"github.com/ipld/go-ipld-prime/codec/dagcbor"
 	"github.com/ipld/go-ipld-prime/node/basicnode"
 	pubsub "github.com/libp2p/go-libp2p-pubsub"
 	"github.com/libp2p/go-libp2p/core/peer"
-
-	"dlockss/pkg/schema"
 )
 
 const probeResponseCooldown = 5 * time.Second
 
-// pruneReplicationRequestCooldown removes stale entries from the cooldown map.
-func (sm *ShardManager) pruneReplicationRequestCooldown() {
-	sm.replicationRequestMu.Lock()
-	defer sm.replicationRequestMu.Unlock()
-	cutoff := time.Now().Add(-2 * replicationRequestCooldownDuration)
-	for cidStr, lastSent := range sm.replicationRequestLastSent {
-		if lastSent.Before(cutoff) {
-			delete(sm.replicationRequestLastSent, cidStr)
-		}
-	}
-}
-
-// runReplicationChecker sends ReplicationRequest for pinned files below target replication.
-func (sm *ShardManager) runReplicationChecker() {
-	if sm.cfg.CheckInterval <= 0 {
-		return
-	}
-	ticker := time.NewTicker(rootReplicationCheckInterval)
-	defer ticker.Stop()
-
-	var lastReplicationCheck time.Time
-	for {
-		select {
-		case <-sm.ctx.Done():
-			return
-		case <-ticker.C:
-			sm.mu.RLock()
-			currentShard := sm.currentShard
-			sm.mu.RUnlock()
-
-			interval := sm.cfg.CheckInterval
-			if currentShard == "" {
-				interval = rootReplicationCheckInterval
-			}
-			if time.Since(lastReplicationCheck) < interval {
-				continue
-			}
-			lastReplicationCheck = time.Now()
-
-			manifests := sm.storageMgr.GetPinnedManifests()
-			if len(manifests) == 0 {
-				continue
-			}
-
-			sm.pruneReplicationRequestCooldown()
-
-			maxConc := sm.cfg.MaxConcurrentReplicationChecks
-			if maxConc < 1 {
-				maxConc = 1
-			}
-			sem := make(chan struct{}, maxConc)
-			var wg sync.WaitGroup
-			var sentThisCycle int32
-			for _, manifestCIDStr := range manifests {
-				select {
-				case <-sm.ctx.Done():
-					wg.Wait()
-					return
-				case sem <- struct{}{}:
-				}
-				if atomic.LoadInt32(&sentThisCycle) >= maxReplicationRequestsPerCycle {
-					<-sem
-					continue
-				}
-				wg.Add(1)
-				go func(manifestCIDStr string) {
-					defer wg.Done()
-					defer func() { <-sem }()
-					c, err := cid.Decode(manifestCIDStr)
-					if err != nil {
-						return
-					}
-					allocations, err := sm.clusterMgr.GetAllocations(sm.ctx, currentShard, c)
-					if err != nil {
-						_ = sm.clusterMgr.Pin(sm.ctx, currentShard, c, -1, -1)
-						allocations = nil
-					}
-					peerCount := sm.getShardPeerCount()
-					targetRep := sm.cfg.MaxReplication
-					if peerCount > 0 && targetRep > peerCount {
-						targetRep = peerCount
-					}
-					currentPeers := sm.GetPeersForShard(currentShard)
-					currentSet := make(map[peer.ID]struct{}, len(currentPeers)+1)
-					currentSet[sm.h.ID()] = struct{}{}
-					for _, p := range currentPeers {
-						currentSet[p] = struct{}{}
-					}
-					activeAllocations := 0
-					for _, a := range allocations {
-						if _, ok := currentSet[a]; ok {
-							activeAllocations++
-						}
-					}
-					if activeAllocations >= targetRep {
-						return
-					}
-					if atomic.LoadInt32(&sentThisCycle) >= maxReplicationRequestsPerCycle {
-						return
-					}
-					sm.replicationRequestMu.Lock()
-					lastSent := sm.replicationRequestLastSent[manifestCIDStr]
-					if time.Since(lastSent) < replicationRequestCooldownDuration {
-						sm.replicationRequestMu.Unlock()
-						return
-					}
-					sm.replicationRequestLastSent[manifestCIDStr] = time.Now()
-					sm.replicationRequestMu.Unlock()
-					if sm.signer == nil {
-						return
-					}
-					rr := &schema.ReplicationRequest{
-						SignedEnvelope: schema.SignedEnvelope{Type: schema.MessageTypeReplicationRequest, ManifestCID: c},
-					}
-					if err := sm.signer.SignProtocolMessage(rr); err != nil {
-						slog.Error("failed to sign ReplicationRequest", "manifest", manifestCIDStr, "error", err)
-						return
-					}
-					b, err := rr.MarshalCBOR()
-					if err != nil {
-						return
-					}
-					sm.PublishToShardCBOR(b, currentShard)
-					atomic.AddInt32(&sentThisCycle, 1)
-					slog.Debug("ReplicationRequest sent", "manifest", manifestCIDStr, "shard", currentShard, "active_alloc", activeAllocations, "total_alloc", len(allocations), "target", targetRep, "peers", peerCount)
-				}(manifestCIDStr)
-			}
-			wg.Wait()
-		}
-	}
-}
-
 // runHeartbeat periodically sends heartbeat messages to the current shard topic.
 func (sm *ShardManager) runHeartbeat() {
 	var heartbeatInterval time.Duration
 	if sm.cfg.HeartbeatInterval > 0 {
 		heartbeatInterval = sm.cfg.HeartbeatInterval
 	} else {
-		heartbeatInterval = sm.cfg.ShardPeerCheckInterval / 3
+		heartbeatInterval = sm.cfg.Sharding.ShardPeerCheckInterval / 3
 		if heartbeatInterval < 10*time.Second {
 			heartbeatInterval = 10 * time.Second
 		}
@@ -236,7 +103,7 @@ func (sm *ShardManager) reprovideNextPinnedFile() {
 		}
 		pinCancel()
 
-		pctx, pcancel := context.WithTimeout(sm.ctx, sm.cfg.DHTProvideTimeout)
+		pctx, pcancel := context.WithTimeout(sm.ctx, sm.cfg.Files.DHTProvideTimeout)
 		defer pcancel()
 		sm.storageMgr.ProvideFile(pctx, manifestCIDStr)
 
@@ -258,7 +125,7 @@ func (sm *ShardManager) reprovideNextPinnedFile() {
 		if err := sm.ipfsClient.PinRecursive(sm.ctx, payloadCID); err != nil {
 			slog.Debug("reprovide pin payload failed", "payload", payloadCID, "error", err)
 		}
-		pctx2, pcancel2 := context.WithTimeout(sm.ctx, sm.cfg.DHTProvideTimeout)
+		pctx2, pcancel2 := context.WithTimeout(sm.ctx, sm.cfg.Files.DHTProvideTimeout)
 		defer pcancel2()
 		sm.storageMgr.ProvideFile(pctx2, payloadCID.String())
 	}()
@@ -275,7 +142,6 @@ func (sm *ShardManager) announcePinnedFilesBatch(topic *pubsub.Topic, batchSize
 	}
 }
 
-// processMessage decodes CBOR and dispatches to Ingest or ReplicationRequest handler.
 func (sm *ShardManager) processMessage(msg *pubsub.Message, shardID string) {
 	if msg.GetFrom() == sm.h.ID() {
 		return
@@ -288,72 +154,52 @@ func (sm *ShardManager) processMessage(msg *pubsub.Message, shardID string) {
 	sm.lastMessageTime = now
 	sm.mu.Unlock()
 
-	if len(msg.Data) > 0 {
-		if msg.Data[0] == '{' {
-			return
-		}
-		if bytes.HasPrefix(msg.Data, []byte(msgPrefixHeartbeat)) {
-			sm.peers.RecordRole(shardID, from, parseHeartbeatRole(msg.Data))
-			return
-		}
-		if bytes.HasPrefix(msg.Data, []byte(msgPrefixPinned)) {
-			key := string(msg.Data[len(msgPrefixPinned):])
-			sm.storageMgr.AddKnownFile(key)
-			return
-		}
-		if bytes.HasPrefix(msg.Data, []byte(msgPrefixJoin)) {
-			sm.peers.RecordRole(shardID, from, parseJoinRole(msg.Data))
-			return
-		}
-		if bytes.HasPrefix(msg.Data, []byte(msgPrefixLeave)) {
-			sm.peers.RemoveRole(shardID, from)
-			return
-		}
-		if bytes.HasPrefix(msg.Data, []byte(msgPrefixProbe)) {
-			sm.peers.RecordRole(shardID, from, RoleProbe)
-
-			// Rate-limit heartbeat responses to PROBEs to avoid "heartbeat storms".
-			sm.mu.Lock()
-			probeRateLimited := !sm.lastProbeResponseTime.IsZero() && now.Sub(sm.lastProbeResponseTime) < probeResponseCooldown
-			if !probeRateLimited {
-				sm.lastProbeResponseTime = now
-			}
-			sm.mu.Unlock()
-
-			if probeRateLimited {
-				return
-			}
-
-			sm.mu.RLock()
-			cs := sm.currentShard
-			probeSub, probeSubExists := sm.shardSubs[shardID]
-			sm.mu.RUnlock()
-			if shardID == cs && probeSubExists && probeSub.topic != nil && !probeSub.observerOnly {
-				pinnedCount := 0
-				if sm.storageMgr != nil {
-					pinnedCount = sm.storageMgr.GetPinnedCount()
-				}
-				role := sm.getOurRole()
-				hb := []byte(fmt.Sprintf("HEARTBEAT:%s:%d:%s:%s", sm.h.ID().String(), pinnedCount, role, sm.nodeName))
-				_ = probeSub.topic.Publish(sm.ctx, hb)
-			}
-			return
-		}
-		if bytes.HasPrefix(msg.Data, []byte(msgPrefixSplit)) {
-			sm.lifecycle.recordSplitAnnouncement(string(msg.Data[len(msgPrefixSplit):]))
-			return
-		}
+	if sm.processTextProtocol(msg, shardID, from, now) {
+		return
 	}
 
-	if sm.rateLimiter != nil && !sm.rateLimiter.Check(msg.GetFrom()) {
-		sm.metrics.IncrementMessagesDropped()
+	if sm.rateLimiter != nil && !sm.rateLimiter.Check(from) {
 		return
 	}
+	sm.processCBORMessage(msg, shardID)
+}
 
-	sm.mu.Lock()
-	sm.msgCounter++
-	sm.mu.Unlock()
+// processTextProtocol dispatches the prefix-tagged text messages (heartbeat,
+// pinned, join, leave, probe, split) to their handlers. It reports true when
+// msg was consumed here — which includes payloads starting with '{', dropped
+// without processing — and false for an empty payload or an unknown prefix,
+// leaving the message for the caller to handle.
+func (sm *ShardManager) processTextProtocol(msg *pubsub.Message, shardID string, from peer.ID, now time.Time) bool {
+	payload := msg.Data
+	if len(payload) == 0 {
+		return false
+	}
+
+	// Cases are tested top to bottom; the first matching prefix wins.
+	switch {
+	case payload[0] == '{':
+		// Consumed without any handler running.
+	case bytes.HasPrefix(payload, []byte(msgPrefixHeartbeat)):
+		sm.peers.RecordRole(shardID, from, parseHeartbeatRole(payload))
+	case bytes.HasPrefix(payload, []byte(msgPrefixPinned)):
+		// The remainder after the prefix is recorded as a known file key.
+		sm.storageMgr.AddKnownFile(string(payload[len(msgPrefixPinned):]))
+	case bytes.HasPrefix(payload, []byte(msgPrefixJoin)):
+		sm.peers.RecordRole(shardID, from, parseJoinRole(payload))
+	case bytes.HasPrefix(payload, []byte(msgPrefixLeave)):
+		sm.peers.RemoveRole(shardID, from)
+	case bytes.HasPrefix(payload, []byte(msgPrefixProbe)):
+		sm.handleProbeMessage(shardID, from, now)
+	case bytes.HasPrefix(payload, []byte(msgPrefixSplit)):
+		// The remainder after the prefix is handed to the lifecycle manager.
+		sm.lifecycle.recordSplitAnnouncement(string(payload[len(msgPrefixSplit):]))
+	default:
+		return false
+	}
+	return true
+}
 
+func (sm *ShardManager) processCBORMessage(msg *pubsub.Message, shardID string) {
 	msgType, err := decodeCBORMessageType(msg.Data)
 	if err != nil {
 		slog.Error("failed to decode message type", "from", msg.GetFrom().String(), "shard", shardID, "error", err)
@@ -374,7 +220,36 @@ func (sm *ShardManager) processMessage(msg *pubsub.Message, shardID string) {
 			slog.Error("failed to unmarshal ReplicationRequest", "from", msg.GetFrom().String(), "shard", shardID, "error", err)
 			return
 		}
-		sm.handleReplicationRequest(msg, &rr, shardID)
+		sm.repl.handleRequest(msg, &rr, shardID)
+	}
+}
+
+// handleProbeMessage records the sender as a probing peer and, at most once per
+// probeResponseCooldown (shared across all senders), replies with a HEARTBEAT
+// when shardID is our current shard and our subscription is not observer-only.
+func (sm *ShardManager) handleProbeMessage(shardID string, from peer.ID, now time.Time) {
+	sm.peers.RecordRole(shardID, from, roleProbe)
+
+	sm.mu.Lock()
+	if last := sm.lastProbeResponseTime; !last.IsZero() && now.Sub(last) < probeResponseCooldown {
+		sm.mu.Unlock()
+		return
+	}
+	sm.lastProbeResponseTime = now // stamped before we know whether a reply is actually sent
+	sm.mu.Unlock()
+
+	sm.mu.RLock()
+	current := sm.currentShard
+	sub, subscribed := sm.shardSubs[shardID]
+	sm.mu.RUnlock()
+	if shardID == current && subscribed && sub.topic != nil && !sub.observerOnly {
+		pinned := 0
+		if sm.storageMgr != nil {
+			pinned = sm.storageMgr.GetPinnedCount()
+		}
+		role := sm.getOurRole()
+		reply := []byte(fmt.Sprintf("HEARTBEAT:%s:%d:%s:%s", sm.h.ID().String(), pinned, role, sm.nodeName))
+		_ = sub.topic.Publish(sm.ctx, reply) // publish error is explicitly discarded
 	}
 }
 
@@ -394,3 +269,43 @@ func decodeCBORMessageType(data []byte) (schema.MessageType, error) {
 	}
 	return schema.MessageType(ti), nil
 }
+
+// runReannouncePinsLoop re-announces, every Replication.PinReannounceInterval,
+// each pinned non-legacy manifest whose payload this node is responsible for.
+// It runs until sm.ctx is cancelled and is a no-op when the interval is <= 0.
+func (sm *ShardManager) runReannouncePinsLoop() {
+	interval := sm.cfg.Replication.PinReannounceInterval
+	if interval <= 0 {
+		return
+	}
+	const pause = 40 * time.Millisecond // spacing between consecutive announcements
+	tick := time.NewTicker(interval)
+	defer tick.Stop()
+	for {
+		select {
+		case <-sm.ctx.Done():
+			return
+		case <-tick.C:
+		}
+		sent := 0
+		for _, key := range sm.storageMgr.GetPinnedManifests() {
+			if common.IsLegacyManifest(sm.ctx, sm.ipfsClient, key) {
+				continue
+			}
+			payload, _ := common.GetPayloadCIDForShardAssignment(sm.ctx, sm.ipfsClient, key) // NOTE(review): error discarded — confirm payload is usable on failure
+			if !sm.AmIResponsibleFor(payload) {
+				continue
+			}
+			sm.AnnouncePinned(key)
+			sent++
+			select {
+			case <-sm.ctx.Done():
+				return
+			case <-time.After(pause):
+			}
+		}
+		if sent > 0 {
+			slog.Debug("re-announced pins on current shard", "announced", sent, "interval", interval)
+		}
+	}
+}
diff --git a/internal/managers/shard/shard_maintenance.go b/internal/managers/shard/shard_maintenance.go
deleted file mode 100644
index fe38315..0000000
--- a/internal/managers/shard/shard_maintenance.go
+++ /dev/null
@@ -1,264 +0,0 @@
-package shard
-
-import (
-	"log/slog"
-	"time"
-
-	"dlockss/internal/common"
-	"dlockss/pkg/schema"
-)
-
-const orphanUnpinInterval = 2 * time.Minute
-
-// RunOrphanUnpinPass unpins files that belong to active child shards (we are still in parent).
-func (sm *ShardManager) RunOrphanUnpinPass() {
-	sm.pruneOrphanHandoffSent()
-
-	sm.mu.RLock()
-	currentShard := sm.currentShard
-	sm.mu.RUnlock()
-
-	files := sm.storageMgr.GetAllKnownFiles()
-	if len(files) == 0 {
-		return
-	}
-
-	child0, child1 := childShards(currentShard)
-	probeTimeout := 4 * time.Second
-	n0 := sm.probeShard(child0, probeTimeout)
-	n1 := sm.probeShard(child1, probeTimeout)
-	if n0 < 1 && n1 < 1 {
-		return
-	}
-	activeChildren := make(map[string]struct{})
-	if n0 >= 1 {
-		activeChildren[child0] = struct{}{}
-	}
-	if n1 >= 1 {
-		activeChildren[child1] = struct{}{}
-	}
-
-	depth := len(currentShard) + 1
-	unpinned := 0
-	for key := range files {
-		if !sm.storageMgr.IsPinned(key) {
-			continue
-		}
-		if pinTime := sm.storageMgr.GetPinTime(key); !pinTime.IsZero() && time.Since(pinTime) < sm.cfg.OrphanUnpinGracePeriod {
-			continue
-		}
-		payloadCIDStr, _ := common.GetPayloadCIDForShardAssignment(sm.ctx, sm.ipfsClient, key)
-		stableHex := common.KeyToStableHex(payloadCIDStr)
-		targetChild, err := common.GetHexBinaryPrefix(stableHex, depth)
-		if err != nil {
-			continue
-		}
-		if _, active := activeChildren[targetChild]; !active {
-			continue
-		}
-		manifestCID, err := common.KeyToCID(key)
-		if err != nil {
-			continue
-		}
-		sm.mu.Lock()
-		var info *orphanHandoffInfo
-		if sm.orphanHandoffSent[key] != nil {
-			info = sm.orphanHandoffSent[key][targetChild]
-		}
-		sm.mu.Unlock()
-		if info != nil && time.Since(info.lastSent) < sm.cfg.OrphanHandoffGrace {
-			continue
-		}
-		minCount := sm.cfg.OrphanUnpinMinHandoffCount
-		if minCount < 1 {
-			minCount = 1
-		}
-		if info != nil && info.count >= minCount && time.Since(info.lastSent) >= sm.cfg.OrphanHandoffGrace {
-			// Proceed to unpin
-		} else if info == nil || info.count < minCount {
-			if sm.signer != nil {
-				rr := &schema.ReplicationRequest{
-					SignedEnvelope: schema.SignedEnvelope{Type: schema.MessageTypeReplicationRequest, ManifestCID: manifestCID},
-				}
-				if err := sm.signer.SignProtocolMessage(rr); err == nil {
-					if b, err := rr.MarshalCBOR(); err == nil && sm.JoinShardAsObserver(targetChild) {
-						sm.PublishToShardCBOR(b, targetChild)
-						sm.LeaveShardAsObserver(targetChild)
-						sm.mu.Lock()
-						if sm.orphanHandoffSent[key] == nil {
-							sm.orphanHandoffSent[key] = make(map[string]*orphanHandoffInfo)
-						}
-						if sm.orphanHandoffSent[key][targetChild] == nil {
-							sm.orphanHandoffSent[key][targetChild] = &orphanHandoffInfo{}
-						}
-						ho := sm.orphanHandoffSent[key][targetChild]
-						ho.lastSent = time.Now()
-						ho.count++
-						sm.mu.Unlock()
-						slog.Info("orphan handoff: sent ReplicationRequest to child", "child", targetChild, "manifest", key, "count", ho.count)
-						time.Sleep(10 * time.Millisecond)
-						continue
-					}
-				}
-			}
-			continue
-		}
-
-		slog.Info("orphan unpin", "manifest", key, "child", targetChild)
-		if err := sm.clusterMgr.Unpin(sm.ctx, currentShard, manifestCID); err != nil {
-			slog.Error("orphan unpin: cluster unpin failed", "manifest", key, "error", err)
-		}
-		if err := sm.ipfsClient.UnpinRecursive(sm.ctx, manifestCID); err != nil {
-			slog.Error("orphan unpin: IPFS unpin failed", "manifest", key, "error", err)
-		}
-		sm.storageMgr.UnpinFile(key)
-		sm.mu.Lock()
-		if sm.orphanHandoffSent[key] != nil {
-			delete(sm.orphanHandoffSent[key], targetChild)
-			if len(sm.orphanHandoffSent[key]) == 0 {
-				delete(sm.orphanHandoffSent, key)
-			}
-		}
-		sm.mu.Unlock()
-		unpinned++
-		time.Sleep(10 * time.Millisecond)
-	}
-	if unpinned > 0 {
-		slog.Info("orphan unpin pass complete", "unpinned", unpinned)
-	}
-}
-
-func (sm *ShardManager) pruneOrphanHandoffSent() {
-	sm.mu.Lock()
-	defer sm.mu.Unlock()
-	cutoff := time.Now().Add(-2 * sm.cfg.OrphanHandoffGrace)
-	for key, children := range sm.orphanHandoffSent {
-		for child, info := range children {
-			if info.lastSent.Before(cutoff) {
-				delete(children, child)
-			}
-		}
-		if len(children) == 0 {
-			delete(sm.orphanHandoffSent, key)
-		}
-	}
-}
-
-func (sm *ShardManager) runOrphanUnpinLoop() {
-	ticker := time.NewTicker(orphanUnpinInterval)
-	defer ticker.Stop()
-	for {
-		select {
-		case <-sm.ctx.Done():
-			return
-		case <-ticker.C:
-			sm.RunOrphanUnpinPass()
-		}
-	}
-}
-
-const legacyCleanupInterval = 5 * time.Minute
-
-// runLegacyManifestCleanup periodically scans pinned manifests and unpins any
-// that contain a legacy timestamp field (non-deterministic CIDs from the old format).
-func (sm *ShardManager) runLegacyManifestCleanup() {
-	select {
-	case <-sm.ctx.Done():
-		return
-	case <-time.After(30 * time.Second):
-	}
-	sm.cleanupLegacyManifests()
-
-	ticker := time.NewTicker(legacyCleanupInterval)
-	defer ticker.Stop()
-	for {
-		select {
-		case <-sm.ctx.Done():
-			return
-		case <-ticker.C:
-			sm.cleanupLegacyManifests()
-		}
-	}
-}
-
-func (sm *ShardManager) cleanupLegacyManifests() {
-	manifests := sm.storageMgr.GetPinnedManifests()
-	if len(manifests) == 0 {
-		return
-	}
-
-	sm.mu.RLock()
-	currentShard := sm.currentShard
-	sm.mu.RUnlock()
-
-	removed := 0
-	for _, manifestCIDStr := range manifests {
-		select {
-		case <-sm.ctx.Done():
-			return
-		default:
-		}
-		if !common.IsLegacyManifest(sm.ctx, sm.ipfsClient, manifestCIDStr) {
-			continue
-		}
-		manifestCID, err := common.KeyToCID(manifestCIDStr)
-		if err != nil {
-			continue
-		}
-		slog.Info("removing legacy manifest", "manifest", manifestCIDStr)
-		if currentShard != "" {
-			if err := sm.clusterMgr.Unpin(sm.ctx, currentShard, manifestCID); err != nil {
-				slog.Error("cluster unpin failed for legacy manifest", "manifest", manifestCIDStr, "error", err)
-			}
-		}
-		if err := sm.ipfsClient.UnpinRecursive(sm.ctx, manifestCID); err != nil {
-			slog.Error("IPFS unpin failed for legacy manifest", "manifest", manifestCIDStr, "error", err)
-		}
-		sm.storageMgr.UnpinFile(manifestCIDStr)
-		removed++
-		time.Sleep(50 * time.Millisecond)
-	}
-	if removed > 0 {
-		slog.Info("legacy manifest cleanup complete", "removed", removed)
-	}
-}
-
-func (sm *ShardManager) runReannouncePinsLoop() {
-	if sm.cfg.PinReannounceInterval <= 0 {
-		return
-	}
-	ticker := time.NewTicker(sm.cfg.PinReannounceInterval)
-	defer ticker.Stop()
-	const delayBetweenPins = 40 * time.Millisecond
-	for {
-		select {
-		case <-sm.ctx.Done():
-			return
-		case <-ticker.C:
-			manifests := sm.storageMgr.GetPinnedManifests()
-			if len(manifests) == 0 {
-				continue
-			}
-			announced := 0
-			for _, manifestCIDStr := range manifests {
-				if common.IsLegacyManifest(sm.ctx, sm.ipfsClient, manifestCIDStr) {
-					continue
-				}
-				payloadCIDStr, _ := common.GetPayloadCIDForShardAssignment(sm.ctx, sm.ipfsClient, manifestCIDStr)
-				if !sm.AmIResponsibleFor(payloadCIDStr) {
-					continue
-				}
-				sm.AnnouncePinned(manifestCIDStr)
-				announced++
-				select {
-				case <-sm.ctx.Done():
-					return
-				case <-time.After(delayBetweenPins):
-				}
-			}
-			if announced > 0 {
-				slog.Debug("re-announced pins on current shard", "announced", announced, "interval", sm.cfg.PinReannounceInterval)
-			}
-		}
-	}
-}
diff --git a/internal/managers/shard/shard_maintenance_test.go b/internal/managers/shard/shard_maintenance_test.go
index 5cf05f2..f7a369b 100644
--- a/internal/managers/shard/shard_maintenance_test.go
+++ b/internal/managers/shard/shard_maintenance_test.go
@@ -17,11 +17,11 @@ func TestPruneOrphanHandoffSent(t *testing.T) {
 
 	cfg := config.DefaultConfig()
 	sm.cfg = cfg
-	cfg.OrphanHandoffGrace = 100 * time.Millisecond
+	cfg.Orphan.HandoffGrace = 100 * time.Millisecond
 
 	now := time.Now()
-	old := now.Add(-5 * cfg.OrphanHandoffGrace)
-	recent := now.Add(-cfg.OrphanHandoffGrace / 2)
+	old := now.Add(-5 * cfg.Orphan.HandoffGrace)
+	recent := now.Add(-cfg.Orphan.HandoffGrace / 2)
 
 	sm.orphanHandoffSent = map[string]map[string]*orphanHandoffInfo{
 		"old-manifest": {
@@ -131,7 +131,7 @@ func TestReshardedFilesMarking(t *testing.T) {
 	if !kf.Has("test") {
 		t.Error("should have key after add")
 	}
-	if kf.Size() != 1 {
-		t.Errorf("expected size 1, got %d", kf.Size())
+	if len(kf.All()) != 1 {
+		t.Errorf("expected size 1, got %d", len(kf.All()))
 	}
 }
diff --git a/internal/managers/shard/shard_msg_handlers.go b/internal/managers/shard/shard_msg_handlers.go
index ba4bef2..9f263b2 100644
--- a/internal/managers/shard/shard_msg_handlers.go
+++ b/internal/managers/shard/shard_msg_handlers.go
@@ -13,65 +13,6 @@ import (
 	"dlockss/pkg/schema"
 )
 
-// handleReplicationRequest verifies, then fetches and pins if not already pinned.
-func (sm *ShardManager) handleReplicationRequest(msg *pubsub.Message, rr *schema.ReplicationRequest, shardID string) {
-	if sm.signer == nil {
-		return
-	}
-	logPrefix := fmt.Sprintf("ReplicationRequest (Shard %s)", shardID)
-	if sm.signer.ShouldDropMessage(msg.GetFrom(), rr.SenderID, rr.Timestamp, rr.Nonce, rr.Sig, rr.MarshalCBORForSigning, logPrefix) {
-		slog.Warn("ReplicationRequest rejected", "manifest", rr.ManifestCID.String(), "from", msg.GetFrom().String(), "shard", shardID)
-		return
-	}
-	manifestCIDStr := rr.ManifestCID.String()
-	c := rr.ManifestCID
-
-	checkCtx, checkCancel := context.WithTimeout(sm.ctx, 5*time.Second)
-	legacy := common.IsLegacyManifest(checkCtx, sm.ipfsClient, manifestCIDStr)
-	checkCancel()
-	if legacy {
-		slog.Info("ignoring legacy manifest in ReplicationRequest", "manifest", manifestCIDStr)
-		return
-	}
-
-	if sm.storageMgr.IsPinned(manifestCIDStr) {
-		if err := sm.EnsureClusterForShard(sm.ctx, shardID); err != nil {
-			slog.Error("ReplicationRequest: failed to ensure cluster for shard", "shard", shardID, "error", err)
-			return
-		}
-		sm.clusterMgr.TriggerSync(shardID)
-		return
-	}
-	if !sm.cfg.AutoReplicationEnabled {
-		return
-	}
-	select {
-	case sm.autoReplicationSem <- struct{}{}:
-	default:
-		slog.Debug("auto-replication skipped, concurrency limit reached", "manifest", manifestCIDStr)
-		return
-	}
-	go func() {
-		defer func() { <-sm.autoReplicationSem }()
-		fetchCtx, cancelFetch := context.WithTimeout(sm.ctx, sm.cfg.AutoReplicationTimeout)
-		if err := sm.ipfsClient.PinRecursive(fetchCtx, c); err != nil {
-			cancelFetch()
-			slog.Error("auto-replication: failed to fetch/pin", "manifest", manifestCIDStr, "error", err)
-			return
-		}
-		cancelFetch()
-		if err := sm.EnsureClusterForShard(sm.ctx, shardID); err != nil {
-			slog.Error("auto-replication: failed to ensure cluster for shard", "shard", shardID, "error", err)
-			return
-		}
-		if err := sm.clusterMgr.PinIfAbsent(sm.ctx, shardID, c, -1, -1); err != nil {
-			slog.Error("auto-replication: failed to write CRDT pin", "manifest", manifestCIDStr, "error", err)
-		}
-		sm.clusterMgr.TriggerSync(shardID)
-		slog.Info("auto-replication: fetched and pinned", "manifest", manifestCIDStr, "shard", shardID)
-	}()
-}
-
 // isAuthorizedIngestor returns true if the peer is allowed to publish ingest
 // messages. When the allowlist is empty the topic is open to all.
 func (sm *ShardManager) isAuthorizedIngestor(senderID peer.ID) bool {
@@ -103,7 +44,6 @@ func (sm *ShardManager) handleIngestMessage(msg *pubsub.Message, im *schema.Inge
 		return
 	}
 	key := im.ManifestCID.String()
-	sm.metrics.IncrementMessagesReceived()
 
 	checkCtx, checkCancel := context.WithTimeout(sm.ctx, 5*time.Second)
 	legacy := common.IsLegacyManifest(checkCtx, sm.ipfsClient, key)
diff --git a/internal/managers/shard/shard_orphan.go b/internal/managers/shard/shard_orphan.go
new file mode 100644
index 0000000..afad7e4
--- /dev/null
+++ b/internal/managers/shard/shard_orphan.go
@@ -0,0 +1,238 @@
+package shard
+
+import (
+	"log/slog"
+	"time"
+
+	"github.com/ipfs/go-cid"
+
+	"dlockss/internal/common"
+	"dlockss/pkg/schema"
+)
+
+// orphanUnpinInterval is how often runOrphanUnpinLoop starts an orphan unpin
+// pass.
+const orphanUnpinInterval = 2 * time.Minute
+
+// RunOrphanUnpinPass walks the locally known files and, for each pinned
+// manifest whose payload maps to a child shard whose probe reported at least
+// one peer, either sends a handoff ReplicationRequest to that child or, once
+// enough handoffs have been sent, unpins the manifest locally.
+//
+// The pass does nothing when there are no known files or when neither child
+// shard is active, and it stops early when the manager context is cancelled.
+func (sm *ShardManager) RunOrphanUnpinPass() {
+	// pacing is the pause inserted after each processed candidate.
+	const pacing = 10 * time.Millisecond
+
+	sm.pruneOrphanHandoffSent()
+
+	sm.mu.RLock()
+	currentShard := sm.currentShard
+	sm.mu.RUnlock()
+
+	files := sm.storageMgr.GetAllKnownFiles()
+	if len(files) == 0 {
+		return
+	}
+
+	activeChildren := sm.collectActiveChildShards(currentShard)
+	if len(activeChildren) == 0 {
+		return
+	}
+
+	// Target prefixes are one position longer than the current shard ID.
+	depth := len(currentShard) + 1
+	unpinned := 0
+scan:
+	for key := range files {
+		if !sm.storageMgr.IsPinned(key) {
+			continue
+		}
+		// Skip manifests pinned more recently than Orphan.UnpinGracePeriod. A zero
+		// pin time does not exempt the manifest.
+		if pinTime := sm.storageMgr.GetPinTime(key); !pinTime.IsZero() && time.Since(pinTime) < sm.cfg.Orphan.UnpinGracePeriod {
+			continue
+		}
+		// NOTE(review): the lookup error is discarded and the returned string is
+		// hashed as-is; confirm the helper's value on failure is acceptable here.
+		payloadCIDStr, _ := common.GetPayloadCIDForShardAssignment(sm.ctx, sm.ipfsClient, key)
+		stableHex := common.KeyToStableHex(payloadCIDStr)
+		targetChild, err := common.GetHexBinaryPrefix(stableHex, depth)
+		if err != nil {
+			continue
+		}
+		if _, active := activeChildren[targetChild]; !active {
+			continue
+		}
+		manifestCID, err := cid.Decode(key)
+		if err != nil {
+			continue
+		}
+		if sm.orphanHandoffOrUnpin(key, targetChild, currentShard, manifestCID) {
+			unpinned++
+		}
+		// Pace the pass, but stop promptly on shutdown instead of sleeping through
+		// every remaining file.
+		select {
+		case <-sm.ctx.Done():
+			break scan
+		case <-time.After(pacing):
+		}
+	}
+	if unpinned > 0 {
+		slog.Info("orphan unpin pass complete", "unpinned", unpinned)
+	}
+}
+
+// collectActiveChildShards probes both children of currentShard and returns
+// the set of children whose probe reported at least one peer.
+func (sm *ShardManager) collectActiveChildShards(currentShard string) map[string]struct{} {
+	const probeTimeout = 4 * time.Second
+
+	child0, child1 := childShards(currentShard)
+	active := make(map[string]struct{}, 2)
+	for _, child := range [...]string{child0, child1} {
+		if sm.probeShard(child, probeTimeout) >= 1 {
+			active[child] = struct{}{}
+		}
+	}
+	return active
+}
+
+// orphanHandoffOrUnpin advances the handoff bookkeeping for one manifest that
+// maps to targetChild and reports whether the manifest was unpinned locally.
+//
+//   - A handoff sent less than Orphan.HandoffGrace ago: nothing is done.
+//   - Fewer than Orphan.UnpinMinHandoffCnt (at least 1) handoffs sent so far:
+//     another handoff is sent.
+//   - Otherwise the manifest is unpinned from the cluster, from IPFS and from
+//     the storage manager, and its handoff record is removed.
+//
+// Cluster and IPFS unpin failures are logged and do not stop the local unpin.
+func (sm *ShardManager) orphanHandoffOrUnpin(key, targetChild, currentShard string, manifestCID cid.Cid) bool {
+	// Copy the record while holding the lock: sendOrphanHandoff updates it in
+	// place under sm.mu, so reading through the pointer afterwards would race.
+	var (
+		snap orphanHandoffInfo
+		sent bool
+	)
+	sm.mu.RLock()
+	if info := sm.orphanHandoffSent[key][targetChild]; info != nil {
+		snap, sent = *info, true
+	}
+	sm.mu.RUnlock()
+
+	if sent && time.Since(snap.lastSent) < sm.cfg.Orphan.HandoffGrace {
+		return false
+	}
+
+	minCount := sm.cfg.Orphan.UnpinMinHandoffCnt
+	if minCount < 1 {
+		minCount = 1
+	}
+	// The grace period has already elapsed at this point, so only the handoff
+	// count decides between another handoff and the unpin.
+	if !sent || snap.count < minCount {
+		sm.sendOrphanHandoff(key, targetChild, manifestCID)
+		return false
+	}
+
+	slog.Info("orphan unpin", "manifest", key, "child", targetChild)
+	if err := sm.clusterMgr.Unpin(sm.ctx, currentShard, manifestCID); err != nil {
+		slog.Error("orphan unpin: cluster unpin failed", "manifest", key, "error", err)
+	}
+	if err := sm.ipfsClient.UnpinRecursive(sm.ctx, manifestCID); err != nil {
+		slog.Error("orphan unpin: IPFS unpin failed", "manifest", key, "error", err)
+	}
+	sm.storageMgr.UnpinFile(key)
+
+	sm.mu.Lock()
+	if perChild := sm.orphanHandoffSent[key]; perChild != nil {
+		delete(perChild, targetChild)
+		if len(perChild) == 0 {
+			delete(sm.orphanHandoffSent, key)
+		}
+	}
+	sm.mu.Unlock()
+	return true
+}
+
+// sendOrphanHandoff publishes a signed ReplicationRequest for manifestCID to
+// targetChild, joining that shard as an observer around the publish, and then
+// records the send time and bumps the per-child handoff counter. Nothing is
+// sent or recorded when no signer is configured, when signing or encoding
+// fails, or when JoinShardAsObserver returns false.
+func (sm *ShardManager) sendOrphanHandoff(key, targetChild string, manifestCID cid.Cid) {
+	if sm.signer == nil {
+		return
+	}
+	rr := &schema.ReplicationRequest{
+		SignedEnvelope: schema.SignedEnvelope{Type: schema.MessageTypeReplicationRequest, ManifestCID: manifestCID},
+	}
+	if err := sm.signer.SignProtocolMessage(rr); err != nil {
+		slog.Error("orphan handoff: failed to sign ReplicationRequest", "manifest", key, "error", err)
+		return
+	}
+	b, err := rr.MarshalCBOR()
+	if err != nil {
+		slog.Error("orphan handoff: failed to encode ReplicationRequest", "manifest", key, "error", err)
+		return
+	}
+	if !sm.JoinShardAsObserver(targetChild) {
+		return
+	}
+	sm.PublishToShardCBOR(b, targetChild)
+	sm.LeaveShardAsObserver(targetChild)
+
+	sm.mu.Lock()
+	perChild := sm.orphanHandoffSent[key]
+	if perChild == nil {
+		perChild = make(map[string]*orphanHandoffInfo)
+		sm.orphanHandoffSent[key] = perChild
+	}
+	ho := perChild[targetChild]
+	if ho == nil {
+		ho = &orphanHandoffInfo{}
+		perChild[targetChild] = ho
+	}
+	ho.lastSent = time.Now()
+	ho.count++
+	count := ho.count // copied under the lock; ho must not be read after Unlock
+	sm.mu.Unlock()
+
+	slog.Info("orphan handoff: sent ReplicationRequest to child", "child", targetChild, "manifest", key, "count", count)
+}
+
+// pruneOrphanHandoffSent drops handoff records whose last send is older than
+// twice Orphan.HandoffGrace and removes manifests left with no records.
+func (sm *ShardManager) pruneOrphanHandoffSent() {
+	sm.mu.Lock()
+	defer sm.mu.Unlock()
+	cutoff := time.Now().Add(-2 * sm.cfg.Orphan.HandoffGrace)
+	for key, children := range sm.orphanHandoffSent {
+		for child, info := range children {
+			if info.lastSent.Before(cutoff) {
+				delete(children, child)
+			}
+		}
+		if len(children) == 0 {
+			delete(sm.orphanHandoffSent, key)
+		}
+	}
+}
+
+// runOrphanUnpinLoop runs RunOrphanUnpinPass every orphanUnpinInterval until
+// the manager context is cancelled.
+func (sm *ShardManager) runOrphanUnpinLoop() {
+	ticker := time.NewTicker(orphanUnpinInterval)
+	defer ticker.Stop()
+	for {
+		select {
+		case <-sm.ctx.Done():
+			return
+		case <-ticker.C:
+			sm.RunOrphanUnpinPass()
+		}
+	}
+}
diff --git a/internal/managers/shard/shard_peers.go b/internal/managers/shard/shard_peers.go
index 73fbcd0..b467fa0 100644
--- a/internal/managers/shard/shard_peers.go
+++ b/internal/managers/shard/shard_peers.go
@@ -4,51 +4,30 @@ import (
 	"context"
 	"time"
 
-	"github.com/libp2p/go-libp2p/core/host"
 	"github.com/libp2p/go-libp2p/core/peer"
 )
 
-func (sm *ShardManager) getShardPeerCount() int {
+// getShardPeerCount returns the number of active peers in the current shard.
+// When useMeshFallback is true, falls back to the mesh peer list if no
+// role-based counts are available. Split decisions should pass false to avoid
+// counting non-ACTIVE subscribers (e.g. the monitor).
+func (sm *ShardManager) getShardPeerCount(useMeshFallback bool) int {
 	sm.mu.RLock()
 	currentShard := sm.currentShard
 	sub, exists := sm.shardSubs[currentShard]
 	sm.mu.RUnlock()
 
 	if exists && sub.topic != nil {
-		activeCount := sm.peers.CountActive(currentShard, true, currentShard, sm.cfg.SeenPeersWindow)
+		activeCount := sm.peers.CountActive(currentShard, true, currentShard, sm.cfg.Sharding.SeenPeersWindow)
 		if activeCount > 0 {
 			return activeCount
 		}
-		meshPeers := sub.topic.ListPeers()
-		return len(meshPeers) + 1
-	}
-
-	if sm.clusterMgr != nil {
-		count, err := sm.clusterMgr.GetPeerCount(sm.ctx, currentShard)
-		if err == nil {
-			return count
-		}
-	}
-	return 0
-}
-
-// getShardPeerCountForSplit returns ACTIVE peer count for split decisions.
-// Uses only role-based counts (HEARTBEAT/JOIN); avoids mesh fallback because the mesh
-// can include the monitor and other non-ACTIVE subscribers, which would overcount
-// and trigger premature splits (e.g. 9 real nodes + monitor = 10, split when we shouldn't).
-func (sm *ShardManager) getShardPeerCountForSplit() int {
-	sm.mu.RLock()
-	currentShard := sm.currentShard
-	sub, exists := sm.shardSubs[currentShard]
-	sm.mu.RUnlock()
-
-	if exists && sub.topic != nil {
-		activeCount := sm.peers.CountActive(currentShard, true, currentShard, sm.cfg.SeenPeersWindow)
-		if activeCount > 0 {
-			return activeCount
+		if useMeshFallback {
+			return len(sub.topic.ListPeers()) + 1
 		}
 		return 0
 	}
+
 	if sm.clusterMgr != nil {
 		count, err := sm.clusterMgr.GetPeerCount(sm.ctx, currentShard)
 		if err == nil {
@@ -58,15 +37,11 @@ func (sm *ShardManager) getShardPeerCountForSplit() int {
 	return 0
 }
 
-func (sm *ShardManager) GetShardInfo() (string, int) {
+func (sm *ShardManager) GetShardInfo() string {
 	sm.mu.RLock()
 	currentShard := sm.currentShard
 	sm.mu.RUnlock()
-	return currentShard, sm.getShardPeerCount()
-}
-
-func (sm *ShardManager) GetHost() host.Host {
-	return sm.h
+	return currentShard
 }
 
 // PeerID returns the local peer's ID.
@@ -74,18 +49,6 @@ func (sm *ShardManager) PeerID() peer.ID {
 	return sm.h.ID()
 }
 
-func (sm *ShardManager) GetShardPeers() []peer.ID {
-	sm.mu.RLock()
-	currentShard := sm.currentShard
-	sub, exists := sm.shardSubs[currentShard]
-	sm.mu.RUnlock()
-
-	if !exists || sub.topic == nil {
-		return nil
-	}
-	return sub.topic.ListPeers()
-}
-
 func (sm *ShardManager) GetPeersForShard(shardID string) []peer.ID {
 	sm.mu.RLock()
 	sub, exists := sm.shardSubs[shardID]
@@ -96,10 +59,9 @@ func (sm *ShardManager) GetPeersForShard(shardID string) []peer.ID {
 	}
 
 	if sm.peers.HasRoles(shardID) {
-		return sm.peers.GetActiveForShard(shardID, sm.cfg.SeenPeersWindow)
+		return sm.peers.GetActiveForShard(shardID, sm.cfg.Sharding.SeenPeersWindow)
 	}
 
-	// Fallback: no role data, use mesh+seen (may include PASSIVE/PROBE)
 	meshPeers := sub.topic.ListPeers()
 	seen := make(map[peer.ID]struct{}, len(meshPeers))
 	for _, p := range meshPeers {
@@ -107,7 +69,7 @@ func (sm *ShardManager) GetPeersForShard(shardID string) []peer.ID {
 			seen[p] = struct{}{}
 		}
 	}
-	for p := range sm.peers.GetSeenPeers(shardID, sm.cfg.SeenPeersWindow) {
+	for p := range sm.peers.GetSeenPeers(shardID, sm.cfg.Sharding.SeenPeersWindow) {
 		seen[p] = struct{}{}
 	}
 	all := make([]peer.ID, 0, len(seen))
@@ -117,28 +79,6 @@ func (sm *ShardManager) GetPeersForShard(shardID string) []peer.ID {
 	return all
 }
 
-func (sm *ShardManager) GetShardPeerCount(shardID string) int {
-	sm.mu.RLock()
-	currentShard := sm.currentShard
-	sub, exists := sm.shardSubs[shardID]
-	sm.mu.RUnlock()
-
-	if !exists || sub.topic == nil {
-		return 0
-	}
-	includeSelf := (shardID == currentShard)
-	activeCount := sm.peers.CountActive(shardID, includeSelf, currentShard, sm.cfg.SeenPeersWindow)
-	if activeCount > 0 {
-		return activeCount
-	}
-	meshPeers := sub.topic.ListPeers()
-	n := len(meshPeers)
-	if includeSelf {
-		n++
-	}
-	return n
-}
-
 func getSiblingShard(shardID string) string {
 	if shardID == "" {
 		return ""
@@ -149,40 +89,13 @@ func getSiblingShard(shardID string) string {
 	return parent + string(byte(otherBit))
 }
 
-func (sm *ShardManager) generateDeeperShards(currentShard string, maxDepth int) []string {
-	if maxDepth <= 0 {
-		return nil
-	}
-
-	var shards []string
-	queue := []string{currentShard}
-	maxShardLength := len(currentShard) + maxDepth
-
-	for len(queue) > 0 {
-		shard := queue[0]
-		queue = queue[1:]
-
-		child0 := shard + "0"
-		child1 := shard + "1"
-
-		if len(child0) <= maxShardLength {
-			shards = append(shards, child0, child1)
-			if len(child0) < maxShardLength {
-				queue = append(queue, child0, child1)
-			}
-		}
-	}
-
-	return shards
-}
-
 func (sm *ShardManager) probeShard(shardID string, probeTimeout time.Duration) int {
 	sm.mu.RLock()
 	sub, alreadyJoined := sm.shardSubs[shardID]
 	sm.mu.RUnlock()
 
 	if alreadyJoined && sub.topic != nil {
-		return sm.getProbePeerCount(shardID, sm.cfg.SeenPeersWindow)
+		return sm.getProbePeerCount(shardID, sm.cfg.Sharding.SeenPeersWindow)
 	}
 	return sm.probeShardSilently(shardID, probeTimeout)
 }
@@ -235,7 +148,7 @@ func (sm *ShardManager) probeShardSilently(shardID string, probeTimeout time.Dur
 		sm.processTextProtocolForProbe(msg, shardID)
 	}
 
-	activeCount := sm.peers.CountActive(shardID, false, "", sm.cfg.SeenPeersWindow)
+	activeCount := sm.peers.CountActive(shardID, false, "", sm.cfg.Sharding.SeenPeersWindow)
 	if activeCount > 0 {
 		sm.mu.Lock()
 		if old := sm.probeTopicCache[shardID]; old != nil && old != t {
diff --git a/internal/managers/shard/shard_publish.go b/internal/managers/shard/shard_publish.go
index 2299502..db81b4d 100644
--- a/internal/managers/shard/shard_publish.go
+++ b/internal/managers/shard/shard_publish.go
@@ -25,22 +25,6 @@ func (sm *ShardManager) AnnouncePinned(manifestCID string) {
 	_ = sub.topic.Publish(sm.ctx, msg)
 }
 
-func (sm *ShardManager) PublishToShard(shardID, msg string) {
-	sm.mu.RLock()
-	sub, exists := sm.shardSubs[shardID]
-	sm.mu.RUnlock()
-
-	if !exists {
-		return
-	}
-	if sub.topic != nil {
-		_ = sub.topic.Publish(sm.ctx, []byte(msg))
-	} else {
-		topicName := sm.shardTopicName(shardID)
-		_ = sm.ps.Publish(topicName, []byte(msg))
-	}
-}
-
 func (sm *ShardManager) PublishToShardCBOR(data []byte, shardID string) {
 	sm.mu.RLock()
 	sub, exists := sm.shardSubs[shardID]
@@ -127,7 +111,7 @@ func (sm *ShardManager) PinToCluster(ctx context.Context, c cid.Cid) error {
 }
 
 func (sm *ShardManager) EnsureClusterForShard(ctx context.Context, shardID string) error {
-	return sm.clusterMgr.JoinShard(ctx, shardID, nil)
+	return sm.clusterMgr.JoinShard(ctx, shardID)
 }
 
 func (sm *ShardManager) PinToShard(ctx context.Context, shardID string, c cid.Cid) error {
diff --git a/internal/managers/shard/shard_replication.go b/internal/managers/shard/shard_replication.go
new file mode 100644
index 0000000..3cf41ed
--- /dev/null
+++ b/internal/managers/shard/shard_replication.go
@@ -0,0 +1,307 @@
+package shard
+
+import (
+	"context"
+	"fmt"
+	"log/slog"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"github.com/ipfs/go-cid"
+	pubsub "github.com/libp2p/go-libp2p-pubsub"
+
+	"dlockss/internal/config"
+	"dlockss/pkg/schema"
+)
+
+// replicationOps is the narrow interface the replicationManager uses to
+// interact with the rest of the shard package.
+type replicationOps interface {
+	replicationContext() context.Context
+	replicationConfig() *config.Config
+	getCurrentShard() string
+	getPinnedManifests() []string
+	isPinned(key string) bool
+	isLegacyManifest(cidStr string) bool
+	publishCBOR(data []byte, shardID string)
+	ensureCluster(ctx context.Context, shardID string) error
+	clusterPinIfAbsent(ctx context.Context, shardID string, c cid.Cid) error
+	clusterTriggerSync(shardID string)
+	ipfsPinRecursive(ctx context.Context, c cid.Cid) error
+	replicationSigner() MessageAuthenticator
+}
+
+// replicationManager periodically publishes ReplicationRequests for locally
+// pinned manifests (runChecker) and services ReplicationRequests received from
+// peers (handleRequest).
+type replicationManager struct {
+	ops replicationOps
+
+	// mu guards cooldown.
+	mu sync.Mutex
+	// cooldown maps a manifest CID string to the time its last request was
+	// claimed for sending.
+	cooldown map[string]time.Time
+	// sem bounds the number of concurrent auto-replication fetches started by
+	// handleRequest.
+	sem chan struct{}
+}
+
+// newReplicationManager returns a replicationManager that runs at most
+// maxConcurrent auto-replication fetches at a time; values below 1 are raised
+// to 1.
+func newReplicationManager(ops replicationOps, maxConcurrent int) *replicationManager {
+	if maxConcurrent < 1 {
+		maxConcurrent = 1
+	}
+	return &replicationManager{
+		ops:      ops,
+		cooldown: make(map[string]time.Time),
+		sem:      make(chan struct{}, maxConcurrent),
+	}
+}
+
+// pruneCooldown drops cooldown entries older than twice
+// replicationRequestCooldownDuration.
+func (rm *replicationManager) pruneCooldown() {
+	rm.mu.Lock()
+	defer rm.mu.Unlock()
+	cutoff := time.Now().Add(-2 * replicationRequestCooldownDuration)
+	for cidStr, lastSent := range rm.cooldown {
+		if lastSent.Before(cutoff) {
+			delete(rm.cooldown, cidStr)
+		}
+	}
+}
+
+// claimCooldown reports whether a request for manifestCIDStr may be sent now
+// and, if so, starts its cooldown in the same critical section so the check
+// and the claim cannot interleave with another worker.
+func (rm *replicationManager) claimCooldown(manifestCIDStr string) bool {
+	rm.mu.Lock()
+	defer rm.mu.Unlock()
+	if time.Since(rm.cooldown[manifestCIDStr]) < replicationRequestCooldownDuration {
+		return false
+	}
+	rm.cooldown[manifestCIDStr] = time.Now()
+	return true
+}
+
+// releaseCooldown undoes claimCooldown for a request that was never published.
+// Deleting the entry is enough: a claim is only granted when the previous entry
+// was absent or already expired.
+func (rm *replicationManager) releaseCooldown(manifestCIDStr string) {
+	rm.mu.Lock()
+	delete(rm.cooldown, manifestCIDStr)
+	rm.mu.Unlock()
+}
+
+// runChecker publishes ReplicationRequests for the pinned manifests until the
+// replication context is cancelled. It returns immediately when
+// Replication.CheckInterval is not positive.
+//
+// The ticker fires every rootReplicationCheckInterval; a check runs only when
+// Replication.CheckInterval has elapsed since the previous one, or
+// rootReplicationCheckInterval while the current shard ID is empty.
+func (rm *replicationManager) runChecker() {
+	cfg := rm.ops.replicationConfig()
+	if cfg.Replication.CheckInterval <= 0 {
+		return
+	}
+	ctx := rm.ops.replicationContext()
+	ticker := time.NewTicker(rootReplicationCheckInterval)
+	defer ticker.Stop()
+
+	var lastCheck time.Time
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			currentShard := rm.ops.getCurrentShard()
+
+			interval := cfg.Replication.CheckInterval
+			if currentShard == "" {
+				interval = rootReplicationCheckInterval
+			}
+			if time.Since(lastCheck) < interval {
+				continue
+			}
+			lastCheck = time.Now()
+
+			manifests := rm.ops.getPinnedManifests()
+			if len(manifests) == 0 {
+				continue
+			}
+
+			rm.pruneCooldown()
+			rm.sendReplicationRequests(ctx, cfg, currentShard, manifests)
+		}
+	}
+}
+
+// sendReplicationRequests publishes a signed ReplicationRequest on
+// currentShard for every manifest that is not on cooldown. It runs at most
+// Replication.MaxConcurrentReplicationChecks workers (minimum 1), publishes at
+// most maxReplicationRequestsPerCycle requests, and waits for all workers
+// before returning. Cancelling ctx stops the scan of manifests.
+func (rm *replicationManager) sendReplicationRequests(ctx context.Context, cfg *config.Config, currentShard string, manifests []string) {
+	// Without a signer nothing can be published, so do not start workers and do
+	// not burn any cooldown entries.
+	signer := rm.ops.replicationSigner()
+	if signer == nil {
+		return
+	}
+
+	maxConc := cfg.Replication.MaxConcurrentReplicationChecks
+	if maxConc < 1 {
+		maxConc = 1
+	}
+	sem := make(chan struct{}, maxConc)
+	var wg sync.WaitGroup
+	var sentThisCycle int32
+
+	for _, manifestCIDStr := range manifests {
+		select {
+		case <-ctx.Done():
+			wg.Wait()
+			return
+		case sem <- struct{}{}:
+		}
+		if atomic.LoadInt32(&sentThisCycle) >= maxReplicationRequestsPerCycle {
+			<-sem
+			continue
+		}
+		wg.Add(1)
+		go func(manifestCIDStr string) {
+			defer wg.Done()
+			defer func() { <-sem }()
+			rm.sendOne(signer, currentShard, manifestCIDStr, &sentThisCycle)
+		}(manifestCIDStr)
+	}
+	wg.Wait()
+}
+
+// sendOne signs and publishes a ReplicationRequest for manifestCIDStr on
+// shardID and counts it in *sent. It does nothing when the CID does not decode,
+// when the per-cycle cap is reached, or when the manifest is on cooldown. If
+// the request is claimed but never published, the cooldown claim is released
+// so the manifest is retried on the next cycle.
+func (rm *replicationManager) sendOne(signer MessageAuthenticator, shardID, manifestCIDStr string, sent *int32) {
+	c, err := cid.Decode(manifestCIDStr)
+	if err != nil {
+		return
+	}
+	// Cheap early exit; the binding check is the reservation below.
+	if atomic.LoadInt32(sent) >= maxReplicationRequestsPerCycle {
+		return
+	}
+	if !rm.claimCooldown(manifestCIDStr) {
+		return
+	}
+	published := false
+	defer func() {
+		if !published {
+			rm.releaseCooldown(manifestCIDStr)
+		}
+	}()
+
+	rr := &schema.ReplicationRequest{
+		SignedEnvelope: schema.SignedEnvelope{Type: schema.MessageTypeReplicationRequest, ManifestCID: c},
+	}
+	if err := signer.SignProtocolMessage(rr); err != nil {
+		slog.Error("failed to sign ReplicationRequest", "manifest", manifestCIDStr, "error", err)
+		return
+	}
+	b, err := rr.MarshalCBOR()
+	if err != nil {
+		slog.Error("failed to encode ReplicationRequest", "manifest", manifestCIDStr, "error", err)
+		return
+	}
+	// Reserve a slot atomically so concurrent workers cannot overshoot the cap.
+	if atomic.AddInt32(sent, 1) > maxReplicationRequestsPerCycle {
+		atomic.AddInt32(sent, -1)
+		return
+	}
+	rm.ops.publishCBOR(b, shardID)
+	published = true
+	slog.Debug("ReplicationRequest sent", "manifest", manifestCIDStr, "shard", shardID)
+}
+
+// handleRequest processes a ReplicationRequest received on shardID.
+//
+// Messages are ignored when no signer is configured, when the signer rejects
+// them, or when the manifest is a legacy manifest. If the manifest is already
+// pinned, the shard's cluster is ensured and a sync is triggered. Otherwise,
+// when Replication.AutoReplicationEnabled is set and a slot is free in rm.sem,
+// the manifest is fetched and pinned on a background goroutine.
+func (rm *replicationManager) handleRequest(msg *pubsub.Message, rr *schema.ReplicationRequest, shardID string) {
+	signer := rm.ops.replicationSigner()
+	if signer == nil {
+		return
+	}
+	ctx := rm.ops.replicationContext()
+	cfg := rm.ops.replicationConfig()
+
+	logPrefix := fmt.Sprintf("ReplicationRequest (Shard %s)", shardID)
+	if signer.ShouldDropMessage(msg.GetFrom(), rr.SenderID, rr.Timestamp, rr.Nonce, rr.Sig, rr.MarshalCBORForSigning, logPrefix) {
+		slog.Warn("ReplicationRequest rejected", "manifest", rr.ManifestCID.String(), "from", msg.GetFrom().String(), "shard", shardID)
+		return
+	}
+	manifestCIDStr := rr.ManifestCID.String()
+	c := rr.ManifestCID
+
+	if rm.ops.isLegacyManifest(manifestCIDStr) {
+		slog.Info("ignoring legacy manifest in ReplicationRequest", "manifest", manifestCIDStr)
+		return
+	}
+
+	if rm.ops.isPinned(manifestCIDStr) {
+		if err := rm.ops.ensureCluster(ctx, shardID); err != nil {
+			slog.Error("ReplicationRequest: failed to ensure cluster for shard", "shard", shardID, "error", err)
+			return
+		}
+		rm.ops.clusterTriggerSync(shardID)
+		return
+	}
+	if !cfg.Replication.AutoReplicationEnabled {
+		return
+	}
+	// Non-blocking acquire: when every slot is busy the request is dropped rather
+	// than queued.
+	select {
+	case rm.sem <- struct{}{}:
+	default:
+		slog.Debug("auto-replication skipped, concurrency limit reached", "manifest", manifestCIDStr)
+		return
+	}
+	go func() {
+		defer func() { <-rm.sem }()
+		rm.fetchAndPin(ctx, cfg.Replication.AutoReplicationTimeout, shardID, c)
+	}()
+}
+
+// fetchAndPin pins c recursively in IPFS within timeout, ensures the cluster
+// for shardID, records the pin in the cluster if absent, and triggers a sync.
+// A failed cluster pin is logged but does not prevent the sync.
+func (rm *replicationManager) fetchAndPin(ctx context.Context, timeout time.Duration, shardID string, c cid.Cid) {
+	manifestCIDStr := c.String()
+
+	fetchCtx, cancelFetch := context.WithTimeout(ctx, timeout)
+	err := rm.ops.ipfsPinRecursive(fetchCtx, c)
+	cancelFetch() // the timeout applies to the IPFS fetch only
+	if err != nil {
+		slog.Error("auto-replication: failed to fetch/pin", "manifest", manifestCIDStr, "error", err)
+		return
+	}
+	if err := rm.ops.ensureCluster(ctx, shardID); err != nil {
+		slog.Error("auto-replication: failed to ensure cluster for shard", "shard", shardID, "error", err)
+		return
+	}
+	if err := rm.ops.clusterPinIfAbsent(ctx, shardID, c); err != nil {
+		slog.Error("auto-replication: failed to write CRDT pin", "manifest", manifestCIDStr, "error", err)
+	}
+	rm.ops.clusterTriggerSync(shardID)
+	slog.Info("auto-replication: fetched and pinned", "manifest", manifestCIDStr, "shard", shardID)
+}
diff --git a/internal/managers/shard/shard_reshard.go b/internal/managers/shard/shard_reshard.go
index 2963ba4..52434ae 100644
--- a/internal/managers/shard/shard_reshard.go
+++ b/internal/managers/shard/shard_reshard.go
@@ -4,6 +4,8 @@ import (
 	"log/slog"
 	"time"
 
+	"github.com/ipfs/go-cid"
+
 	"dlockss/internal/common"
 	"dlockss/pkg/schema"
 )
@@ -49,7 +51,7 @@ func (sm *ShardManager) RunReshardPass(oldShard, newShard string) {
 			continue
 		}
 
-		manifestCID, err := common.KeyToCID(key)
+		manifestCID, err := cid.Decode(key)
 		if err != nil {
 			continue
 		}
@@ -77,7 +79,7 @@ func (sm *ShardManager) RunReshardPass(oldShard, newShard string) {
 							slog.Info("reshard: ReplicationRequest sent before unpinning", "target_shard", targetNew, "manifest", key)
 							select {
 							case <-sm.ctx.Done():
-							case <-time.After(sm.cfg.ReshardHandoffDelay):
+							case <-time.After(sm.cfg.Files.ReshardHandoffDelay):
 							}
 						}
 					}
diff --git a/internal/managers/shard/shard_reshard_test.go b/internal/managers/shard/shard_reshard_test.go
index 396b072..45fa4fd 100644
--- a/internal/managers/shard/shard_reshard_test.go
+++ b/internal/managers/shard/shard_reshard_test.go
@@ -9,7 +9,6 @@ import (
 	"github.com/ipfs/go-cid"
 	pubsub "github.com/libp2p/go-libp2p-pubsub"
 	"github.com/libp2p/go-libp2p/core/peer"
-	"github.com/multiformats/go-multiaddr"
 
 	"dlockss/internal/common"
 	"dlockss/internal/config"
@@ -93,7 +92,7 @@ type recordingCluster struct {
 	synced   []string
 }
 
-func (c *recordingCluster) JoinShard(_ context.Context, shardID string, _ []multiaddr.Multiaddr) error {
+func (c *recordingCluster) JoinShard(_ context.Context, shardID string) error {
 	c.mu.Lock()
 	defer c.mu.Unlock()
 	c.joinedSh = append(c.joinedSh, shardID)
@@ -132,21 +131,21 @@ func (c *recordingCluster) TriggerSync(shardID string) {
 // buildTestSM creates a minimal ShardManager for testing without requiring pubsub.
 func buildTestSM(ctx context.Context, storage *recordingStorage, cluster *recordingCluster) *ShardManager {
 	cfg := config.DefaultConfig()
-	cfg.ReshardHandoffDelay = 0
-	return &ShardManager{
-		ctx:                        ctx,
-		cfg:                        cfg,
-		ipfsClient:                 &testutil.MockIPFSClient{},
-		storageMgr:                 storage,
-		clusterMgr:                 cluster,
-		reshardedFiles:             common.NewKnownFiles(),
-		shardSubs:                  make(map[string]*shardSubscription),
-		probeTopicCache:            make(map[string]*pubsub.Topic),
-		observerOnlyShards:         make(map[string]struct{}),
-		orphanHandoffSent:          make(map[string]map[string]*orphanHandoffInfo),
-		replicationRequestLastSent: make(map[string]time.Time),
-		autoReplicationSem:         make(chan struct{}, 1),
+	cfg.Files.ReshardHandoffDelay = 0
+	sm := &ShardManager{
+		ctx:                ctx,
+		cfg:                cfg,
+		ipfsClient:         &testutil.MockIPFSClient{},
+		storageMgr:         storage,
+		clusterMgr:         cluster,
+		reshardedFiles:     common.NewKnownFiles(),
+		shardSubs:          make(map[string]*shardSubscription),
+		probeTopicCache:    make(map[string]*pubsub.Topic),
+		observerOnlyShards: make(map[string]struct{}),
+		orphanHandoffSent:  make(map[string]map[string]*orphanHandoffInfo),
 	}
+	sm.repl = newReplicationManager(sm, 1)
+	return sm
 }
 
 // --- RunReshardPass tests ---
diff --git a/internal/managers/shard/shard_roles.go b/internal/managers/shard/shard_roles.go
index 54bd225..7a2a432 100644
--- a/internal/managers/shard/shard_roles.go
+++ b/internal/managers/shard/shard_roles.go
@@ -10,41 +10,36 @@ import (
 	"github.com/libp2p/go-libp2p/core/peer"
 )
 
-// PeerRole indicates whether a peer is actively contributing to replication.
-type PeerRole string
+type peerRole string
 
 const (
-	RoleActive     PeerRole = "ACTIVE"     // Ingestor node with storage capacity
-	RolePassive    PeerRole = "PASSIVE"    // At storage limit, cannot pin; not counted for replication
-	RoleProbe      PeerRole = "PROBE"      // Transient viewer, not counted
-	RoleReplicator PeerRole = "REPLICATOR" // Can replicate but not ingest new files
+	roleActive     peerRole = "ACTIVE"
+	rolePassive    peerRole = "PASSIVE"
+	roleProbe      peerRole = "PROBE"
+	roleReplicator peerRole = "REPLICATOR"
 )
 
-// PeerRoleInfo holds a peer's role and last-seen time.
-type PeerRoleInfo struct {
-	Role     PeerRole
-	LastSeen time.Time
+type peerRoleInfo struct {
+	role     peerRole
+	lastSeen time.Time
 }
 
-// PeerTracker tracks which peers are present in each shard and their roles.
-// Thread-safe with its own mutex, independent of ShardManager.mu.
-type PeerTracker struct {
+type peerTracker struct {
 	mu     sync.RWMutex
 	selfID peer.ID
 	seen   map[string]map[peer.ID]time.Time    // shard → peer → lastSeen
-	roles  map[string]map[peer.ID]PeerRoleInfo // shard → peer → role
+	roles  map[string]map[peer.ID]peerRoleInfo // shard → peer → role
 }
 
-func NewPeerTracker(selfID peer.ID) *PeerTracker {
-	return &PeerTracker{
+func newPeerTracker(selfID peer.ID) *peerTracker {
+	return &peerTracker{
 		selfID: selfID,
 		seen:   make(map[string]map[peer.ID]time.Time),
-		roles:  make(map[string]map[peer.ID]PeerRoleInfo),
+		roles:  make(map[string]map[peer.ID]peerRoleInfo),
 	}
 }
 
-// RecordSeen marks a peer as seen in a shard.
-func (pt *PeerTracker) RecordSeen(shardID string, peerID peer.ID) {
+func (pt *peerTracker) RecordSeen(shardID string, peerID peer.ID) {
 	pt.mu.Lock()
 	if pt.seen[shardID] == nil {
 		pt.seen[shardID] = make(map[peer.ID]time.Time)
@@ -53,18 +48,16 @@ func (pt *PeerTracker) RecordSeen(shardID string, peerID peer.ID) {
 	pt.mu.Unlock()
 }
 
-// RecordRole records a peer's role in a shard.
-func (pt *PeerTracker) RecordRole(shardID string, peerID peer.ID, role PeerRole) {
+func (pt *peerTracker) RecordRole(shardID string, peerID peer.ID, role peerRole) {
 	pt.mu.Lock()
 	if pt.roles[shardID] == nil {
-		pt.roles[shardID] = make(map[peer.ID]PeerRoleInfo)
+		pt.roles[shardID] = make(map[peer.ID]peerRoleInfo)
 	}
-	pt.roles[shardID][peerID] = PeerRoleInfo{Role: role, LastSeen: time.Now()}
+	pt.roles[shardID][peerID] = peerRoleInfo{role: role, lastSeen: time.Now()}
 	pt.mu.Unlock()
 }
 
-// RemoveRole removes a peer's role entry (e.g. on LEAVE).
-func (pt *PeerTracker) RemoveRole(shardID string, peerID peer.ID) {
+func (pt *peerTracker) RemoveRole(shardID string, peerID peer.ID) {
 	pt.mu.Lock()
 	if pt.roles[shardID] != nil {
 		delete(pt.roles[shardID], peerID)
@@ -72,9 +65,7 @@ func (pt *PeerTracker) RemoveRole(shardID string, peerID peer.ID) {
 	pt.mu.Unlock()
 }
 
-// CountActive returns the number of ACTIVE or REPLICATOR peers in the given shard.
-// When includeSelf is true and shardID matches currentShard, adds 1 for self.
-func (pt *PeerTracker) CountActive(shardID string, includeSelf bool, currentShard string, activeWindow time.Duration) int {
+func (pt *peerTracker) CountActive(shardID string, includeSelf bool, currentShard string, activeWindow time.Duration) int {
 	pt.mu.RLock()
 	roles, ok := pt.roles[shardID]
 	if !ok {
@@ -87,7 +78,7 @@ func (pt *PeerTracker) CountActive(shardID string, includeSelf bool, currentShar
 	cutoff := time.Now().Add(-activeWindow)
 	n := 0
 	for pid, info := range roles {
-		if (info.Role != RoleActive && info.Role != RoleReplicator) || info.LastSeen.Before(cutoff) || pid == pt.selfID {
+		if (info.role != roleActive && info.role != roleReplicator) || info.lastSeen.Before(cutoff) || pid == pt.selfID {
 			continue
 		}
 		n++
@@ -100,8 +91,7 @@ func (pt *PeerTracker) CountActive(shardID string, includeSelf bool, currentShar
 	return n
 }
 
-// GetActiveForShard returns ACTIVE or REPLICATOR peer IDs for the given shard (excluding self).
-func (pt *PeerTracker) GetActiveForShard(shardID string, activeWindow time.Duration) []peer.ID {
+func (pt *peerTracker) GetActiveForShard(shardID string, activeWindow time.Duration) []peer.ID {
 	pt.mu.RLock()
 	defer pt.mu.RUnlock()
 	roles, ok := pt.roles[shardID]
@@ -111,15 +101,14 @@ func (pt *PeerTracker) GetActiveForShard(shardID string, activeWindow time.Durat
 	cutoff := time.Now().Add(-activeWindow)
 	var active []peer.ID
 	for p, info := range roles {
-		if (info.Role == RoleActive || info.Role == RoleReplicator) && info.LastSeen.After(cutoff) && p != pt.selfID {
+		if (info.role == roleActive || info.role == roleReplicator) && info.lastSeen.After(cutoff) && p != pt.selfID {
 			active = append(active, p)
 		}
 	}
 	return active
 }
 
-// GetSeenPeers returns all peers seen in a shard within the cutoff window (excluding self).
-func (pt *PeerTracker) GetSeenPeers(shardID string, activeWindow time.Duration) map[peer.ID]struct{} {
+func (pt *peerTracker) GetSeenPeers(shardID string, activeWindow time.Duration) map[peer.ID]struct{} {
 	pt.mu.RLock()
 	defer pt.mu.RUnlock()
 	cutoff := time.Now().Add(-activeWindow)
@@ -134,16 +123,14 @@ func (pt *PeerTracker) GetSeenPeers(shardID string, activeWindow time.Duration)
 	return result
 }
 
-// HasRoles returns true if any role data exists for the given shard.
-func (pt *PeerTracker) HasRoles(shardID string) bool {
+func (pt *peerTracker) HasRoles(shardID string) bool {
 	pt.mu.RLock()
 	defer pt.mu.RUnlock()
 	_, ok := pt.roles[shardID]
 	return ok
 }
 
-// PruneStale removes peers not seen within the given duration.
-func (pt *PeerTracker) PruneStale(maxAge time.Duration) {
+func (pt *peerTracker) PruneStale(maxAge time.Duration) {
 	pt.mu.Lock()
 	defer pt.mu.Unlock()
 	cutoff := time.Now().Add(-maxAge)
@@ -159,7 +146,7 @@ func (pt *PeerTracker) PruneStale(maxAge time.Duration) {
 	}
 	for shardID, roles := range pt.roles {
 		for peerID, info := range roles {
-			if info.LastSeen.Before(cutoff) {
+			if info.lastSeen.Before(cutoff) {
 				delete(roles, peerID)
 			}
 		}
@@ -169,76 +156,66 @@ func (pt *PeerTracker) PruneStale(maxAge time.Duration) {
 	}
 }
 
-// parseHeartbeatRole extracts role from HEARTBEAT:pid:count or HEARTBEAT:pid:count:ROLE.
-func parseHeartbeatRole(data []byte) PeerRole {
+func parseHeartbeatRole(data []byte) peerRole {
 	s := string(data)
 	if !strings.HasPrefix(s, msgPrefixHeartbeat) {
-		return RoleActive
+		return roleActive
 	}
 	parts := strings.SplitN(s, ":", 4)
 	if len(parts) >= 4 {
-		r := PeerRole(strings.ToUpper(parts[3]))
+		r := peerRole(strings.ToUpper(parts[3]))
 		switch r {
-		case RolePassive, RoleProbe, RoleReplicator:
+		case rolePassive, roleProbe, roleReplicator:
 			return r
 		}
 	}
-	return RoleActive
+	return roleActive
 }
 
-// parseJoinRole extracts role from JOIN:pid or JOIN:pid:ROLE.
-func parseJoinRole(data []byte) PeerRole {
+func parseJoinRole(data []byte) peerRole {
 	s := string(data)
 	if !strings.HasPrefix(s, msgPrefixJoin) {
-		return RoleActive
+		return roleActive
 	}
 	parts := strings.SplitN(s, ":", 3)
 	if len(parts) >= 3 {
-		r := PeerRole(strings.ToUpper(parts[2]))
+		r := peerRole(strings.ToUpper(parts[2]))
 		switch r {
-		case RolePassive, RoleReplicator:
+		case rolePassive, roleReplicator:
 			return r
 		}
 	}
-	return RoleActive
+	return roleActive
 }
 
-// getOurRole returns the node's effective role based on storage capacity and ingest authorization.
-func (sm *ShardManager) getOurRole() PeerRole {
+func (sm *ShardManager) getOurRole() peerRole {
 	if !sm.storageMgr.CanAcceptCustodialFile() {
-		return RolePassive
+		return rolePassive
 	}
 	if !sm.IsLocalNodeIngestor() {
-		return RoleReplicator
+		return roleReplicator
 	}
-	return RoleActive
+	return roleActive
 }
 
 // processTextProtocolForProbe updates PeerTracker for HEARTBEAT/JOIN/LEAVE/PROBE.
 // Used when probing a shard to collect role info without full message handling.
-func (sm *ShardManager) processTextProtocolForProbe(msg *pubsub.Message, shardID string) bool {
+func (sm *ShardManager) processTextProtocolForProbe(msg *pubsub.Message, shardID string) {
 	data := msg.Data
 	if len(data) == 0 {
-		return false
+		return
 	}
 	from := msg.GetFrom()
 	sm.peers.RecordSeen(shardID, from)
 
-	if bytes.HasPrefix(data, []byte(msgPrefixHeartbeat)) {
+	switch {
+	case bytes.HasPrefix(data, []byte(msgPrefixHeartbeat)):
 		sm.peers.RecordRole(shardID, from, parseHeartbeatRole(data))
-		return true
-	}
-	if bytes.HasPrefix(data, []byte(msgPrefixJoin)) {
+	case bytes.HasPrefix(data, []byte(msgPrefixJoin)):
 		sm.peers.RecordRole(shardID, from, parseJoinRole(data))
-		return true
-	}
-	if bytes.HasPrefix(data, []byte(msgPrefixLeave)) {
+	case bytes.HasPrefix(data, []byte(msgPrefixLeave)):
 		sm.peers.RemoveRole(shardID, from)
-		return true
-	}
-	if bytes.HasPrefix(data, []byte(msgPrefixProbe)) {
-		sm.peers.RecordRole(shardID, from, RoleProbe)
-		return true
+	case bytes.HasPrefix(data, []byte(msgPrefixProbe)):
+		sm.peers.RecordRole(shardID, from, roleProbe)
 	}
-	return false
 }
diff --git a/internal/managers/shard/shard_split.go b/internal/managers/shard/shard_split.go
index 4fa060a..c8a9cfb 100644
--- a/internal/managers/shard/shard_split.go
+++ b/internal/managers/shard/shard_split.go
@@ -56,8 +56,3 @@ func (sm *ShardManager) rebroadcastSplitToAncestors() {
 		ancestor = ancestor[:len(ancestor)-1]
 	}
 }
-
-// splitShard moves this node to its target child. For tests; normal path uses lifecycle.checkAndSplitIfNeeded.
-func (sm *ShardManager) splitShard() {
-	sm.lifecycle.splitShard()
-}
diff --git a/internal/managers/shard/shard_split_test.go b/internal/managers/shard/shard_split_test.go
index d175086..e1f8a77 100644
--- a/internal/managers/shard/shard_split_test.go
+++ b/internal/managers/shard/shard_split_test.go
@@ -11,7 +11,6 @@ import (
 	"dlockss/internal/common"
 	"dlockss/internal/config"
 	"dlockss/internal/managers/storage"
-	"dlockss/internal/telemetry"
 	"dlockss/internal/testutil"
 )
 
@@ -33,9 +32,8 @@ func TestSplitShard_NoDeadlock(t *testing.T) {
 	}
 
 	// Setup Dependencies
-	metrics := telemetry.NewMetricsManager(config.DefaultConfig())
 	dht := &testutil.MockDHTProvider{}
-	storageMgr := storage.NewStorageManager(config.DefaultConfig(), dht, metrics, nil)
+	storageMgr := storage.NewStorageManager(config.DefaultConfig(), dht, nil)
 	ipfsClient := &testutil.MockIPFSClient{}
 
 	clusterMgr := &testutil.MockClusterManager{}
@@ -46,20 +44,18 @@ func TestSplitShard_NoDeadlock(t *testing.T) {
 		PubSub:     ps,
 		IPFSClient: ipfsClient,
 		Storage:    storageMgr,
-		Metrics:    metrics,
 		Cluster:    clusterMgr,
 	})
 	if err != nil {
 		t.Fatal(err)
 	}
 
-	// Register shard info with metrics to simulate production setup
-	metrics.RegisterProviders(sm, storageMgr, nil)
-
-	// Trigger splitShard
+	// Trigger split: compute target child and move
 	done := make(chan struct{})
 	go func() {
-		sm.splitShard()
+		currentShard := sm.getCurrentShard()
+		targetChild := common.GetBinaryPrefix(sm.h.ID().String(), len(currentShard)+1)
+		sm.moveToShard(currentShard, targetChild, false)
 		close(done)
 	}()
 
@@ -72,7 +68,7 @@ func TestSplitShard_NoDeadlock(t *testing.T) {
 	}
 
 	// Verify state changed
-	currentShard, _ := sm.GetShardInfo()
+	currentShard := sm.GetShardInfo()
 	expectedShard := common.GetBinaryPrefix(h.ID().String(), 1)
 	if currentShard != expectedShard {
 		t.Errorf("expected shard %s, got %s", expectedShard, currentShard)
diff --git a/internal/managers/storage/backoff.go b/internal/managers/storage/backoff.go
deleted file mode 100644
index 01cf9d0..0000000
--- a/internal/managers/storage/backoff.go
+++ /dev/null
@@ -1,121 +0,0 @@
-package storage
-
-import (
-	"crypto/rand"
-	"math/big"
-	"sync"
-	"time"
-)
-
-// BackoffTable tracks exponential backoff delays for failed operations.
-type BackoffTable struct {
-	mu                sync.RWMutex
-	m                 map[string]*operationBackoff
-	initialDelay      time.Duration
-	maxDelay          time.Duration
-	backoffMultiplier float64
-}
-
-func newBackoffTable(initialDelay, maxDelay time.Duration, multiplier float64) *BackoffTable {
-	return &BackoffTable{
-		m:                 make(map[string]*operationBackoff),
-		initialDelay:      initialDelay,
-		maxDelay:          maxDelay,
-		backoffMultiplier: multiplier,
-	}
-}
-
-func (bt *BackoffTable) shouldSkip(key string) bool {
-	bt.mu.RLock()
-	defer bt.mu.RUnlock()
-
-	backoff, exists := bt.m[key]
-	if !exists {
-		return false
-	}
-
-	backoff.mu.Lock()
-	defer backoff.mu.Unlock()
-
-	return time.Now().Before(backoff.nextRetry)
-}
-
-func (bt *BackoffTable) recordFailure(key string) {
-	bt.mu.Lock()
-	defer bt.mu.Unlock()
-
-	backoff, exists := bt.m[key]
-	if !exists {
-		backoff = &operationBackoff{
-			delay: bt.initialDelay,
-		}
-		bt.m[key] = backoff
-	}
-
-	backoff.mu.Lock()
-	defer backoff.mu.Unlock()
-
-	backoff.delay = time.Duration(float64(backoff.delay) * bt.backoffMultiplier)
-	if backoff.delay > bt.maxDelay {
-		backoff.delay = bt.maxDelay
-	}
-
-	const backoffJitterFraction = 0.25
-	jitterRange := float64(backoff.delay) * backoffJitterFraction
-	jitterRangeInt := int64(jitterRange * 2)
-	if jitterRangeInt > 0 {
-		jitterVal, err := rand.Int(rand.Reader, big.NewInt(jitterRangeInt))
-		if err == nil {
-			jitter := time.Duration(jitterVal.Int64()) - time.Duration(jitterRange)
-			jitteredDelay := backoff.delay + jitter
-			if jitteredDelay < bt.initialDelay {
-				jitteredDelay = bt.initialDelay
-			}
-			backoff.nextRetry = time.Now().Add(jitteredDelay)
-		} else {
-			backoff.nextRetry = time.Now().Add(backoff.delay)
-		}
-	} else {
-		backoff.nextRetry = time.Now().Add(backoff.delay)
-	}
-}
-
-func (bt *BackoffTable) clear(key string) {
-	bt.mu.Lock()
-	defer bt.mu.Unlock()
-
-	if backoff, exists := bt.m[key]; exists {
-		backoff.mu.Lock()
-		backoff.delay = bt.initialDelay
-		backoff.nextRetry = time.Time{}
-		backoff.mu.Unlock()
-	}
-}
-
-func (bt *BackoffTable) size() int {
-	bt.mu.RLock()
-	defer bt.mu.RUnlock()
-	return len(bt.m)
-}
-
-// purgeExpired removes entries whose nextRetry has passed.
-func (bt *BackoffTable) purgeExpired() {
-	bt.mu.Lock()
-	defer bt.mu.Unlock()
-
-	now := time.Now()
-	for key, backoff := range bt.m {
-		backoff.mu.Lock()
-		expired := !backoff.nextRetry.IsZero() && now.After(backoff.nextRetry)
-		backoff.mu.Unlock()
-		if expired {
-			delete(bt.m, key)
-		}
-	}
-}
-
-type operationBackoff struct {
-	nextRetry time.Time
-	delay     time.Duration
-	mu        sync.Mutex
-}
diff --git a/internal/managers/storage/backoff_test.go b/internal/managers/storage/backoff_test.go
deleted file mode 100644
index 8af81db..0000000
--- a/internal/managers/storage/backoff_test.go
+++ /dev/null
@@ -1,132 +0,0 @@
-package storage
-
-import (
-	"testing"
-	"time"
-)
-
-func TestNewBackoffTable(t *testing.T) {
-	initial := 10 * time.Millisecond
-	max := 5 * time.Minute
-	multiplier := 2.0
-
-	bt := newBackoffTable(initial, max, multiplier)
-	if bt == nil {
-		t.Fatal("newBackoffTable returned nil")
-	}
-	if bt.size() != 0 {
-		t.Errorf("new table size = %d, want 0", bt.size())
-	}
-}
-
-func TestRecordFailureIncrementsTable(t *testing.T) {
-	bt := newBackoffTable(10*time.Millisecond, time.Minute, 2.0)
-
-	if bt.size() != 0 {
-		t.Errorf("initial size = %d, want 0", bt.size())
-	}
-
-	bt.recordFailure("key1")
-	if bt.size() != 1 {
-		t.Errorf("after 1 failure size = %d, want 1", bt.size())
-	}
-
-	bt.recordFailure("key1")
-	if bt.size() != 1 {
-		t.Errorf("after 2 failures same key size = %d, want 1", bt.size())
-	}
-
-	bt.recordFailure("key2")
-	if bt.size() != 2 {
-		t.Errorf("after adding key2 size = %d, want 2", bt.size())
-	}
-}
-
-func TestShouldSkipReturnsFalseInitially(t *testing.T) {
-	bt := newBackoffTable(10*time.Millisecond, time.Minute, 2.0)
-
-	if bt.shouldSkip("nonexistent") {
-		t.Error("shouldSkip for unknown key should be false, got true")
-	}
-}
-
-func TestShouldSkipReturnsTrueForRecentlyFailedKeys(t *testing.T) {
-	bt := newBackoffTable(50*time.Millisecond, time.Minute, 2.0)
-
-	bt.recordFailure("key1")
-	if !bt.shouldSkip("key1") {
-		t.Error("shouldSkip for recently failed key should be true, got false")
-	}
-}
-
-func TestClearResetsBackoff(t *testing.T) {
-	bt := newBackoffTable(50*time.Millisecond, time.Minute, 2.0)
-
-	bt.recordFailure("key1")
-	if !bt.shouldSkip("key1") {
-		t.Error("before clear: shouldSkip should be true")
-	}
-
-	bt.clear("key1")
-	if bt.shouldSkip("key1") {
-		t.Error("after clear: shouldSkip should be false")
-	}
-}
-
-func TestSizeReturnsCorrectCount(t *testing.T) {
-	bt := newBackoffTable(10*time.Millisecond, time.Minute, 2.0)
-
-	keys := []string{"a", "b", "c"}
-	for i, k := range keys {
-		bt.recordFailure(k)
-		if got := bt.size(); got != i+1 {
-			t.Errorf("after adding %q size = %d, want %d", k, got, i+1)
-		}
-	}
-
-	if got := bt.size(); got != 3 {
-		t.Errorf("final size = %d, want 3", got)
-	}
-}
-
-func TestBackoffDelayGrowsExponentially(t *testing.T) {
-	initial := 1 * time.Millisecond
-	max := 100 * time.Millisecond
-	multiplier := 2.0
-	bt := newBackoffTable(initial, max, multiplier)
-
-	// After 1 failure: delay ~2ms. After 5 failures: delay ~32ms (2^5).
-	// Sleep 5ms: 1 failure would have expired, 5 failures would not.
-	bt.recordFailure("key1")
-	time.Sleep(5 * time.Millisecond)
-	if bt.shouldSkip("key1") {
-		t.Error("after 1 failure and 5ms: shouldSkip should be false (short delay)")
-	}
-
-	bt.recordFailure("key2")
-	bt.recordFailure("key2")
-	bt.recordFailure("key2")
-	bt.recordFailure("key2")
-	bt.recordFailure("key2")
-	time.Sleep(5 * time.Millisecond)
-	if !bt.shouldSkip("key2") {
-		t.Error("after 5 failures and 5ms: shouldSkip should be true (exponential delay)")
-	}
-}
-
-func TestPurgeExpiredRemovesOldEntries(t *testing.T) {
-	bt := newBackoffTable(1*time.Millisecond, time.Minute, 2.0)
-
-	bt.recordFailure("key1")
-	bt.recordFailure("key2")
-	if bt.size() != 2 {
-		t.Fatalf("before purge size = %d, want 2", bt.size())
-	}
-
-	time.Sleep(20 * time.Millisecond)
-	bt.purgeExpired()
-
-	if bt.size() != 0 {
-		t.Errorf("after purgeExpired size = %d, want 0", bt.size())
-	}
-}
diff --git a/internal/managers/storage/storage.go b/internal/managers/storage/storage.go
index 7e3fa4a..b09819c 100644
--- a/internal/managers/storage/storage.go
+++ b/internal/managers/storage/storage.go
@@ -7,25 +7,22 @@ import (
 	"sync"
 	"time"
 
+	"github.com/ipfs/go-cid"
+
 	"dlockss/internal/badbits"
 	"dlockss/internal/common"
 	"dlockss/internal/config"
-	"dlockss/internal/telemetry"
 )
 
 // StorageManager handles local file state and DHT announcements.
 type StorageManager struct {
-	cfg                   *config.Config
-	dht                   common.DHTProvider
-	badBits               *badbits.Filter
-	disk                  *DiskMonitor
-	pinnedFiles           *common.PinnedSet
-	knownFiles            *common.KnownFiles
-	recentlyRemoved       *common.RecentlyRemoved
-	fileReplicationLevels *common.FileReplicationLevels
-	failedOperations      *BackoffTable
-	metrics               *telemetry.MetricsManager
-	provideSem            chan struct{}
+	cfg         *config.Config
+	dht         common.DHTProvider
+	badBits     *badbits.Filter
+	disk        *DiskMonitor
+	pinnedFiles *common.PinnedSet
+	knownFiles  *common.KnownFiles
+	provideSem  chan struct{}
 
 	announceMu        sync.Mutex
 	announceIndex     int
@@ -34,23 +31,19 @@ type StorageManager struct {
 }
 
 // NewStorageManager creates a new StorageManager.
-func NewStorageManager(cfg *config.Config, dht common.DHTProvider, metrics *telemetry.MetricsManager, badBits *badbits.Filter) *StorageManager {
-	maxProvides := cfg.MaxConcurrentDHTProvides
+func NewStorageManager(cfg *config.Config, dht common.DHTProvider, badBits *badbits.Filter) *StorageManager {
+	maxProvides := cfg.Files.MaxConcurrentDHTProvides
 	if maxProvides < 1 {
 		maxProvides = 8
 	}
 	return &StorageManager{
-		cfg:                   cfg,
-		dht:                   dht,
-		badBits:               badBits,
-		disk:                  NewDiskMonitor(cfg.FileWatchFolder, cfg.DiskUsageHighWaterMark),
-		pinnedFiles:           common.NewPinnedSet(),
-		knownFiles:            common.NewKnownFiles(),
-		recentlyRemoved:       common.NewRecentlyRemoved(),
-		fileReplicationLevels: common.NewFileReplicationLevels(),
-		failedOperations:      newBackoffTable(cfg.InitialBackoffDelay, cfg.MaxBackoffDelay, cfg.BackoffMultiplier),
-		metrics:               metrics,
-		provideSem:            make(chan struct{}, maxProvides),
+		cfg:         cfg,
+		dht:         dht,
+		badBits:     badBits,
+		disk:        NewDiskMonitor(cfg.FileWatchFolder, cfg.DiskUsageHighWaterMark),
+		pinnedFiles: common.NewPinnedSet(),
+		knownFiles:  common.NewKnownFiles(),
+		provideSem:  make(chan struct{}, maxProvides),
 	}
 }
 
@@ -114,9 +107,6 @@ func (sm *StorageManager) PinFile(manifestCIDStr string) bool {
 		sm.announceMu.Lock()
 		sm.announceKeysDirty = true
 		sm.announceMu.Unlock()
-		if sm.metrics != nil {
-			sm.metrics.SetPinnedFilesCount(sm.pinnedFiles.Size())
-		}
 		slog.Info("pinned manifest", "manifest", manifestCIDStr, "total", sm.pinnedFiles.Size())
 	} else {
 		slog.Debug("manifest already pinned, timestamp updated", "manifest", manifestCIDStr, "total", sm.pinnedFiles.Size())
@@ -132,9 +122,6 @@ func (sm *StorageManager) UnpinFile(key string) {
 		sm.announceMu.Lock()
 		sm.announceKeysDirty = true
 		sm.announceMu.Unlock()
-		if sm.metrics != nil {
-			sm.metrics.SetPinnedFilesCount(sm.pinnedFiles.Size())
-		}
 		slog.Info("unpinned file", "key", key, "pinned_for", time.Since(pinTime), "remaining", sm.pinnedFiles.Size())
 	} else {
 		slog.Warn("attempted to unpin file that was not pinned", "key", key)
@@ -148,35 +135,7 @@ func (sm *StorageManager) IsPinned(key string) bool {
 
 // AddKnownFile adds a file/manifest to the known files set.
 func (sm *StorageManager) AddKnownFile(key string) {
-	removedTime, wasRemoved := sm.recentlyRemoved.WasRemoved(key)
-	if wasRemoved && time.Since(removedTime) < sm.cfg.RemovedFileCooldown {
-		return
-	}
-
-	if sm.knownFiles.Add(key) {
-		if sm.metrics != nil {
-			sm.metrics.SetKnownFilesCount(sm.knownFiles.Size())
-		}
-	}
-
-	// Ensure we track replication level for new files, starting at 0 (or 1 if pinned)
-	if sm.pinnedFiles.Has(key) {
-		sm.fileReplicationLevels.Set(key, 1)
-	} else {
-		sm.fileReplicationLevels.Set(key, 0)
-	}
-}
-
-// RemoveKnownFile removes a file/manifest from the known files set.
-func (sm *StorageManager) RemoveKnownFile(key string) {
-	sm.knownFiles.Remove(key)
-	if sm.metrics != nil {
-		sm.metrics.SetKnownFilesCount(sm.knownFiles.Size())
-	}
-
-	sm.fileReplicationLevels.Delete(key)
-
-	sm.recentlyRemoved.Record(key)
+	sm.knownFiles.Add(key)
 }
 
 // ProvideFile announces a file/manifest to the DHT.
@@ -187,7 +146,7 @@ func (sm *StorageManager) ProvideFile(ctx context.Context, key string) {
 	if sm.dht == nil {
 		return
 	}
-	c, err := common.KeyToCID(key)
+	c, err := cid.Decode(key)
 	if err != nil {
 		slog.Error("failed to convert key to CID", "key", key, "error", err)
 		return
@@ -203,32 +162,10 @@ func (sm *StorageManager) ProvideFile(ctx context.Context, key string) {
 
 	if err := sm.dht.Provide(ctx, c, true); err != nil {
 		slog.Warn("failed to provide file to DHT", "key", key, "error", err)
-		sm.RecordFailedOperation(key)
 		return
 	}
 }
 
-// RecordFailedOperation records a failure for exponential backoff
-func (sm *StorageManager) RecordFailedOperation(key string) {
-	sm.failedOperations.recordFailure(key)
-}
-
-func (sm *StorageManager) ClearFailedOperation(key string) {
-	sm.failedOperations.clear(key)
-}
-
-func (sm *StorageManager) SetReplicationLevel(key string, count int) {
-	sm.fileReplicationLevels.Set(key, count)
-}
-
-func (sm *StorageManager) GetReplicationLevels() map[string]int {
-	return sm.fileReplicationLevels.Snapshot()
-}
-
-func (sm *StorageManager) GetKnownFiles() *common.KnownFiles {
-	return sm.knownFiles
-}
-
 // GetAllKnownFiles returns a snapshot of all known file keys.
 func (sm *StorageManager) GetAllKnownFiles() map[string]bool {
 	return sm.knownFiles.All()
@@ -241,18 +178,3 @@ func (sm *StorageManager) GetPinTime(key string) time.Time {
 func (sm *StorageManager) GetPinnedCount() int {
 	return sm.pinnedFiles.Size()
 }
-
-// GetStorageStatus returns a snapshot of current storage state.
-func (sm *StorageManager) GetStorageStatus() common.StorageSnapshot {
-	allKnown := sm.knownFiles.All()
-	knownCIDs := make([]string, 0, len(allKnown))
-	for k := range allKnown {
-		knownCIDs = append(knownCIDs, k)
-	}
-	return common.StorageSnapshot{
-		PinnedCount:  sm.pinnedFiles.Size(),
-		KnownCount:   sm.knownFiles.Size(),
-		KnownCIDs:    knownCIDs,
-		BackoffCount: sm.failedOperations.size(),
-	}
-}
diff --git a/internal/managers/storage/storage_monitor.go b/internal/managers/storage/storage_monitor.go
index 8f98ebe..ce2fca7 100644
--- a/internal/managers/storage/storage_monitor.go
+++ b/internal/managers/storage/storage_monitor.go
@@ -1,7 +1,6 @@
 package storage
 
 import (
-	"context"
 	"log/slog"
 	"sync"
 	"time"
@@ -24,51 +23,24 @@ func NewDiskMonitor(path string, highWaterMark float64) *DiskMonitor {
 	return &DiskMonitor{path: path, highWaterMark: highWaterMark}
 }
 
-func (dm *DiskMonitor) CheckDiskUsage() float64 {
+func (dm *DiskMonitor) CanAcceptCustodialFile() bool {
 	dm.mu.RLock()
 	lastCheck := dm.lastCheck
 	usage := dm.usagePercent
 	dm.mu.RUnlock()
 
-	if time.Since(lastCheck) < diskUsageCacheTTL {
-		return usage
-	}
-
-	newUsage, err := getDiskUsagePercent(dm.path)
-	if err != nil {
-		slog.Warn("failed to check disk usage", "error", err)
-		return usage
-	}
-
-	dm.mu.Lock()
-	dm.usagePercent = newUsage
-	dm.lastCheck = time.Now()
-	dm.mu.Unlock()
-
-	return newUsage
-}
-
-func (dm *DiskMonitor) IsDiskUsageHigh() bool {
-	return dm.CheckDiskUsage() >= dm.highWaterMark
-}
-
-func (dm *DiskMonitor) CanAcceptCustodialFile() bool {
-	return !dm.IsDiskUsageHigh()
-}
-
-func (dm *DiskMonitor) RunMonitor(ctx context.Context) {
-	ticker := time.NewTicker(30 * time.Second)
-	defer ticker.Stop()
-
-	for {
-		select {
-		case <-ctx.Done():
-			return
-		case <-ticker.C:
-			usage := dm.CheckDiskUsage()
-			if usage >= dm.highWaterMark {
-				slog.Warn("disk usage high, rejecting custodial files", "usage_pct", usage, "high_water_mark", dm.highWaterMark)
-			}
+	if time.Since(lastCheck) >= diskUsageCacheTTL {
+		newUsage, err := getDiskUsagePercent(dm.path)
+		if err != nil {
+			slog.Warn("failed to check disk usage", "error", err)
+		} else {
+			usage = newUsage
+			dm.mu.Lock()
+			dm.usagePercent = newUsage
+			dm.lastCheck = time.Now()
+			dm.mu.Unlock()
 		}
 	}
+
+	return usage < dm.highWaterMark
 }
diff --git a/internal/monitor/monitor_cleanup.go b/internal/monitor/monitor_cleanup.go
index 104504d..f471ebd 100644
--- a/internal/monitor/monitor_cleanup.go
+++ b/internal/monitor/monitor_cleanup.go
@@ -155,7 +155,6 @@ func (m *Monitor) cleanupStaleCIDs(ctx context.Context) {
 		case <-ctx.Done():
 			return
 		case <-ticker.C:
-			m.evictStaleGeoCache()
 			m.evictStalePeerstoreEntries()
 			cutoff := time.Now().Add(-30 * time.Minute)
 			m.mu.Lock()
diff --git a/internal/monitor/monitor_geo.go b/internal/monitor/monitor_geo.go
deleted file mode 100644
index b55afd6..0000000
--- a/internal/monitor/monitor_geo.go
+++ /dev/null
@@ -1,167 +0,0 @@
-package monitor
-
-import (
-	"encoding/json"
-	"log/slog"
-	"net"
-	"net/http"
-	"strings"
-	"time"
-
-	ma "github.com/multiformats/go-multiaddr"
-	"github.com/oschwald/geoip2-golang"
-)
-
-const (
-	geoCacheTTL = 24 * time.Hour
-)
-
-type geoCacheEntry struct {
-	region string
-	seen   time.Time
-}
-
-// openGeoIPDB opens a MaxMind-format .mmdb file for local geo lookups.
-// Returns nil if path is empty or the file cannot be opened.
-func openGeoIPDB(path string) *geoip2.Reader {
-	if path == "" {
-		return nil
-	}
-	db, err := geoip2.Open(path)
-	if err != nil {
-		slog.Error("failed to open geoip database", "path", path, "error", err)
-		return nil
-	}
-	slog.Info("geoip database loaded", "path", path)
-	return db
-}
-
-func (m *Monitor) lookupLocalDB(ipStr string) string {
-	ip := net.ParseIP(ipStr)
-	if ip == nil {
-		return ""
-	}
-	record, err := m.geoDB.City(ip)
-	if err != nil {
-		return ""
-	}
-	var subdiv string
-	if len(record.Subdivisions) > 0 {
-		subdiv = record.Subdivisions[0].Names["en"]
-	}
-	return formatGeoResult(record.Country.IsoCode, record.Country.Names["en"], subdiv)
-}
-
-func formatGeoResult(countryCode, countryName, region string) string {
-	var parts []string
-	if countryCode != "" {
-		parts = append(parts, countryCode)
-	}
-	if region != "" {
-		parts = append(parts, region)
-	}
-	if len(parts) == 0 {
-		if countryName != "" {
-			return countryName
-		}
-		return ""
-	}
-	return strings.Join(parts, " - ")
-}
-
-func isPrivateIP(ipStr string) bool {
-	ip := net.ParseIP(ipStr)
-	if ip == nil {
-		return false
-	}
-	if ip.IsLoopback() || ip.IsLinkLocalUnicast() || ip.IsLinkLocalMulticast() {
-		return true
-	}
-	privateIPBlocks := []string{"10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"}
-	for _, cidr := range privateIPBlocks {
-		_, block, _ := net.ParseCIDR(cidr)
-		if block.Contains(ip) {
-			return true
-		}
-	}
-	return false
-}
-
-// preferPublicIP returns the first non-private IP from the list, or the first IP if all are private.
-// Use this when a peer has multiple addresses (e.g. LAN + public) so region/geo stays stable.
-func preferPublicIP(ips []string) string {
-	var fallback string
-	for _, ip := range ips {
-		if ip == "" {
-			continue
-		}
-		if fallback == "" {
-			fallback = ip
-		}
-		if !isPrivateIP(ip) {
-			return ip
-		}
-	}
-	return fallback
-}
-
-func (m *Monitor) evictStaleGeoCache() {
-	cutoff := time.Now().Add(-geoCacheTTL)
-	m.geoCache.Range(func(key, value interface{}) bool {
-		if entry, ok := value.(geoCacheEntry); ok && entry.seen.Before(cutoff) {
-			m.geoCache.Delete(key)
-		}
-		return true
-	})
-}
-
-// resolveGeoIPSync resolves an IP to a region string synchronously.
-// Uses local DB if available, otherwise cache, otherwise a direct HTTP call.
-func (m *Monitor) resolveGeoIPSync(ipStr string) string {
-	if ipStr == "" || isPrivateIP(ipStr) {
-		return ""
-	}
-	if m.geoDB != nil {
-		return m.lookupLocalDB(ipStr)
-	}
-	if entry, ok := m.geoCache.Load(ipStr); ok {
-		return entry.(geoCacheEntry).region
-	}
-	client := &http.Client{Timeout: 5 * time.Second}
-	resp, err := client.Get("http://ip-api.com/json/" + ipStr + "?fields=status,countryCode,regionName,query")
-	if err != nil {
-		return ""
-	}
-	defer resp.Body.Close()
-	if resp.StatusCode != http.StatusOK {
-		return ""
-	}
-	var result struct {
-		Status      string `json:"status"`
-		CountryCode string `json:"countryCode"`
-		RegionName  string `json:"regionName"`
-	}
-	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil || result.Status != "success" {
-		return ""
-	}
-	region := formatGeoResult(result.CountryCode, "", result.RegionName)
-	if region != "" {
-		m.geoCache.Store(ipStr, geoCacheEntry{region: region, seen: time.Now()})
-	}
-	return region
-}
-
-// resolveRegionFromAddrs extracts IPs from multiaddrs and resolves the region synchronously.
-func (m *Monitor) resolveRegionFromAddrs(addrs []ma.Multiaddr) string {
-	var ips []string
-	for _, addr := range addrs {
-		if ipVal, err := addr.ValueForProtocol(ma.P_IP4); err == nil {
-			ips = append(ips, ipVal)
-		}
-		if ipVal, err := addr.ValueForProtocol(ma.P_IP6); err == nil {
-			ips = append(ips, ipVal)
-		}
-	}
-	ip := preferPublicIP(ips)
-	return m.resolveGeoIPSync(ip)
-}
diff --git a/internal/monitor/monitor_handlers.go b/internal/monitor/monitor_handlers.go
index e9a6709..6b58676 100644
--- a/internal/monitor/monitor_handlers.go
+++ b/internal/monitor/monitor_handlers.go
@@ -35,10 +35,10 @@ func (m *Monitor) handleIngestMessage(ctx context.Context, im *schema.IngestMess
 		return
 	}
 
-	nodeState, exists := m.nodes[peerIDStr]
+	ns, exists := m.nodes[peerIDStr]
 	if !exists {
 		slog.Info("new node discovered via ingest message", "peer", peerIDStr, "shard", shardID)
-		nodeState = &NodeState{
+		ns = &nodeState{
 			PeerID:         peerIDStr,
 			CurrentShard:   shardID,
 			PinnedFiles:    0,
@@ -48,27 +48,27 @@ func (m *Monitor) handleIngestMessage(ctx context.Context, im *schema.IngestMess
 			IPAddress:      ip,
 			announcedFiles: make(map[string]time.Time),
 		}
-		m.nodes[peerIDStr] = nodeState
+		m.nodes[peerIDStr] = ns
 		m.nodeFiles[peerIDStr] = make(map[string]time.Time)
 		m.treeDirty = true
 	}
 
-	nodeState.LastSeen = now
+	ns.LastSeen = now
 	manifestCIDStr := im.ManifestCID.String()
 
-	if nodeState.announcedFiles == nil {
-		nodeState.announcedFiles = make(map[string]time.Time)
+	if ns.announcedFiles == nil {
+		ns.announcedFiles = make(map[string]time.Time)
 	}
-	nodeState.announcedFiles[manifestCIDStr] = now
+	ns.announcedFiles[manifestCIDStr] = now
 
 	if m.nodeFiles[peerIDStr] == nil {
 		m.nodeFiles[peerIDStr] = make(map[string]time.Time)
 	}
 	m.nodeFiles[peerIDStr][manifestCIDStr] = now
 
-	nodeState.KnownFiles = len(nodeState.announcedFiles)
-	if n := len(nodeState.announcedFiles); n > nodeState.PinnedFiles {
-		nodeState.PinnedFiles = n
+	ns.KnownFiles = len(ns.announcedFiles)
+	if n := len(ns.announcedFiles); n > ns.PinnedFiles {
+		ns.PinnedFiles = n
 	}
 	m.uniqueCIDs[manifestCIDStr] = now
 
@@ -76,7 +76,6 @@ func (m *Monitor) handleIngestMessage(ctx context.Context, im *schema.IngestMess
 		m.manifestReplication[manifestCIDStr] = make(map[string]time.Time)
 	}
 	m.manifestReplication[manifestCIDStr][peerIDStr] = now
-	// Prefer deeper shard in same subtree; ignore sibling-shard announcements.
 	if existing, ok := m.manifestShard[manifestCIDStr]; !ok || (len(shardID) > len(existing) && strings.HasPrefix(shardID, existing)) {
 		m.manifestShard[manifestCIDStr] = shardID
 	}
@@ -86,8 +85,8 @@ func (m *Monitor) handleIngestMessage(ctx context.Context, im *schema.IngestMess
 		m.ensureShardSubscriptionUnlocked(context.Background(), shardID)
 	}
 
-	if ip != "" && ip != nodeState.IPAddress {
-		nodeState.IPAddress = ip
+	if ip != "" && ip != ns.IPAddress {
+		ns.IPAddress = ip
 	}
 }
 
@@ -106,10 +105,6 @@ func (m *Monitor) setPeerShardLastSeenUnlocked(peerIDStr, shardID string, t time
 	m.peerShardLastSeen[peerIDStr][shardID] = t
 }
 
-func (m *Monitor) handleHeartbeat(ctx context.Context, senderID peer.ID, shardID string, ip string, pinnedCount int) {
-	m.handleHeartbeatWithRole(ctx, senderID, shardID, ip, pinnedCount, "", "")
-}
-
 func (m *Monitor) handleHeartbeatWithRole(ctx context.Context, senderID peer.ID, shardID string, ip string, pinnedCount int, role string, nodeName string) (shardUpdated bool) {
 	now := time.Now()
 	peerIDStr := senderID.String()
@@ -125,14 +120,14 @@ func (m *Monitor) handleHeartbeatWithRole(ctx context.Context, senderID peer.ID,
 
 	m.setPeerShardLastSeenUnlocked(peerIDStr, shardID, now)
 
-	nodeState, exists := m.nodes[peerIDStr]
+	ns, exists := m.nodes[peerIDStr]
 	if !exists {
 		logName := peerIDStr
 		if nodeName != "" {
 			logName = nodeName + " (" + peerIDStr + ")"
 		}
 		slog.Info("new node discovered via heartbeat", "peer", logName, "shard", shardLabel(shardID), "pinned", pinnedCount, "role", role)
-		nodeState = &NodeState{
+		ns = &nodeState{
 			PeerID:         peerIDStr,
 			NodeName:       nodeName,
 			CurrentShard:   shardID,
@@ -144,24 +139,22 @@ func (m *Monitor) handleHeartbeatWithRole(ctx context.Context, senderID peer.ID,
 			IPAddress:      ip,
 			announcedFiles: make(map[string]time.Time),
 		}
-		m.nodes[peerIDStr] = nodeState
+		m.nodes[peerIDStr] = ns
 		m.treeDirty = true
 		return true
 	}
-	nodeState.LastSeen = now
-	nodeState.Role = role
+	ns.LastSeen = now
+	ns.Role = role
 	if nodeName != "" {
-		nodeState.NodeName = nodeName
+		ns.NodeName = nodeName
 	}
 	if pinnedCount >= 0 {
-		nodeState.PinnedFiles = pinnedCount
+		ns.PinnedFiles = pinnedCount
 		if pinnedCount == 0 {
 			firstSeen := now
-			if len(nodeState.ShardHistory) > 0 {
-				firstSeen = nodeState.ShardHistory[0].FirstSeen
+			if len(ns.ShardHistory) > 0 {
+				firstSeen = ns.ShardHistory[0].FirstSeen
 			}
-			// Ignore pinned=0 during grace period: stale heartbeats can arrive
-			// before the node finishes its first pin cycle.
 			if now.Sub(firstSeen) >= unpinGracePeriod {
 				removedFromManifests := 0
 				for manifest, peers := range m.manifestReplication {
@@ -178,8 +171,6 @@ func (m *Monitor) handleHeartbeatWithRole(ctx context.Context, senderID peer.ID,
 				}
 			}
 		} else {
-			// Peer is alive and pinning: refresh manifestReplication timestamps
-			// so entries don't expire between PINNED re-announcements.
 			for _, peers := range m.manifestReplication {
 				if _, ok := peers[peerIDStr]; ok {
 					peers[peerIDStr] = now
@@ -187,16 +178,16 @@ func (m *Monitor) handleHeartbeatWithRole(ctx context.Context, senderID peer.ID,
 			}
 		}
 	}
-	if nodeState.CurrentShard == "" {
-		nodeState.CurrentShard = shardID
-		nodeState.ShardHistory = append(nodeState.ShardHistory, ShardHistoryEntry{ShardID: shardID, FirstSeen: now})
+	if ns.CurrentShard == "" {
+		ns.CurrentShard = shardID
+		ns.ShardHistory = append(ns.ShardHistory, ShardHistoryEntry{ShardID: shardID, FirstSeen: now})
 		m.treeDirty = true
 		shardUpdated = true
 	} else {
-		shardUpdated = m.updateNodeShardLocked(nodeState, shardID, now)
+		shardUpdated = m.updateNodeShardLocked(ns, shardID, now)
 	}
-	if ip != "" && ip != nodeState.IPAddress {
-		nodeState.IPAddress = ip
+	if ip != "" && ip != ns.IPAddress {
+		ns.IPAddress = ip
 	}
 	return shardUpdated
 }
@@ -221,7 +212,7 @@ func (m *Monitor) handleLeaveShard(ctx context.Context, peerID peer.ID, shardID
 	}
 }
 
-func (m *Monitor) updateNodeShardLocked(node *NodeState, newShard string, timestamp time.Time) bool {
+func (m *Monitor) updateNodeShardLocked(node *nodeState, newShard string, timestamp time.Time) bool {
 	if len(node.ShardHistory) == 0 {
 		return false
 	}
@@ -303,7 +294,7 @@ func (m *Monitor) updateNodeShardLocked(node *NodeState, newShard string, timest
 		slog.Info("shard move removed peer from manifests", "peer", peerIDStr, "removed_manifests", removed, "shard", shardLabel(newShard))
 	}
 	if isSiblingShard(lastShard, newShard) {
-		m.peerLastSiblingMove[peerIDStr] = siblingMoveRecord{from: lastShard, to: newShard, when: timestamp}
+		m.peerLastSiblingMove[peerIDStr] = siblingMoveRecord{when: timestamp}
 	}
 	return true
 }
@@ -320,7 +311,7 @@ func (m *Monitor) hasSplitEvent(parent, child string) bool {
 func (m *Monitor) getPinnedInShardForNode(peerIDStr, nodeShard string) int {
 	m.mu.RLock()
 	defer m.mu.RUnlock()
-	cutoff := time.Now().Add(-ReplicationAnnounceTTL)
+	cutoff := time.Now().Add(-replicationAnnounceTTL)
 	if m.peerShardLastSeen[peerIDStr] != nil {
 		if last := m.peerShardLastSeen[peerIDStr][nodeShard]; last.Before(cutoff) {
 			return 0
diff --git a/internal/monitor/monitor_handlers_test.go b/internal/monitor/monitor_handlers_test.go
index e1fee7f..3fb62db 100644
--- a/internal/monitor/monitor_handlers_test.go
+++ b/internal/monitor/monitor_handlers_test.go
@@ -6,7 +6,7 @@ import (
 )
 
 func TestUpdateNodeShardLocked_CrossBranchRejected(t *testing.T) {
-	m := NewMonitor(DefaultMonitorConfig(), "")
+	m := NewMonitor(DefaultMonitorConfig())
 	peerID := "12D3KooWTestCrossBranch123"
 	now := time.Now()
 
@@ -24,7 +24,7 @@ func TestUpdateNodeShardLocked_CrossBranchRejected(t *testing.T) {
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			node := &NodeState{
+			node := &nodeState{
 				PeerID:       peerID,
 				CurrentShard: tt.lastShard,
 				ShardHistory: []ShardHistoryEntry{{ShardID: tt.lastShard, FirstSeen: now}},
@@ -49,12 +49,12 @@ func TestUpdateNodeShardLocked_CrossBranchRejected(t *testing.T) {
 }
 
 func TestUpdateNodeShardLocked_ValidMovesAccepted(t *testing.T) {
-	m := NewMonitor(DefaultMonitorConfig(), "")
+	m := NewMonitor(DefaultMonitorConfig())
 	now := time.Now()
 
 	t.Run("split_0_to_00", func(t *testing.T) {
 		peerID := "12D3KooWTestSplit123"
-		node := &NodeState{
+		node := &nodeState{
 			PeerID:       peerID,
 			CurrentShard: "0",
 			ShardHistory: []ShardHistoryEntry{{ShardID: "0", FirstSeen: now}},
@@ -77,7 +77,7 @@ func TestUpdateNodeShardLocked_ValidMovesAccepted(t *testing.T) {
 
 	t.Run("sibling_0_to_1", func(t *testing.T) {
 		peerID := "12D3KooWTestSibling456"
-		node := &NodeState{
+		node := &nodeState{
 			PeerID:       peerID,
 			CurrentShard: "0",
 			ShardHistory: []ShardHistoryEntry{{ShardID: "0", FirstSeen: now}},
diff --git a/internal/monitor/monitor_libp2p.go b/internal/monitor/monitor_libp2p.go
index aa9c9a9..7e85fd9 100644
--- a/internal/monitor/monitor_libp2p.go
+++ b/internal/monitor/monitor_libp2p.go
@@ -123,16 +123,16 @@ func StartLibP2P(ctx context.Context, monitor *Monitor) (host.Host, error) {
 	wg.Wait()
 
 	routingDiscovery := routing.NewRoutingDiscovery(kademliaDHT)
-	dutil.Advertise(ctx, routingDiscovery, DiscoveryServiceTag)
-	slog.Info("advertising service", "tag", DiscoveryServiceTag)
+	dutil.Advertise(ctx, routingDiscovery, discoveryServiceTag)
+	slog.Info("advertising service", "tag", discoveryServiceTag)
 
 	notifee := &discovery.DiscoveryNotifee{H: h, Ctx: ctx}
-	mdnsSvc := mdns.NewMdnsService(h, DiscoveryServiceTag, notifee)
+	mdnsSvc := mdns.NewMdnsService(h, discoveryServiceTag, notifee)
 	if err := mdnsSvc.Start(); err != nil {
 		slog.Warn("mdns start failed", "error", err)
 	}
 
-	go discovery.RunPeerFinder(ctx, h, routingDiscovery, DiscoveryServiceTag)
+	go discovery.RunPeerFinder(ctx, h, routingDiscovery, discoveryServiceTag)
 	go runMeshMaintenance(ctx, h, kademliaDHT, routingDiscovery)
 
 	return h, nil
@@ -179,7 +179,7 @@ func runMeshMaintenance(ctx context.Context, h host.Host, kademliaDHT *dht.IpfsD
 			cancel()
 		}
 
-		dutil.Advertise(ctx, rd, DiscoveryServiceTag)
+		dutil.Advertise(ctx, rd, discoveryServiceTag)
 
 		slog.Info("mesh maintenance complete",
 			"connected_peers", connected,
diff --git a/internal/monitor/monitor_models.go b/internal/monitor/monitor_models.go
index 84f764a..05e2b44 100644
--- a/internal/monitor/monitor_models.go
+++ b/internal/monitor/monitor_models.go
@@ -3,18 +3,14 @@ package monitor
 
 import (
 	"context"
-	"log/slog"
-	"os"
-	"path/filepath"
 	"sort"
 	"sync"
 	"time"
 
+	"dlockss/internal/config"
+
 	pubsub "github.com/libp2p/go-libp2p-pubsub"
 	"github.com/libp2p/go-libp2p/core/host"
-	"github.com/oschwald/geoip2-golang"
-
-	"dlockss/internal/common"
 )
 
 // shardSub bundles a PubSub topic with its subscription so that
@@ -26,16 +22,15 @@ type shardSub struct {
 }
 
 const (
-	DiscoveryServiceTag          = "dlockss-prod"
+	discoveryServiceTag          = "dlockss-prod"
 	WebUIPort                    = 8080
-	DefaultBootstrapShardDepth   = 6  // Depth of shard tree to subscribe to on startup (covers late-join case)
-	MaxShardDepthForSubscription = 10 // Don't subscribe to shards deeper than this (avoids thousands of topics)
-	MaxShardDepthForTreeDisplay  = 8  // Prune tree display at this depth (avoids very deep chart)
-	DefaultNodeCleanupTimeout    = 350 * time.Second
-	ReplicationAnnounceTTL       = 350 * time.Second
-	MonitorMinReplication        = 5
-	MonitorMaxReplication        = 10
-	ReplicationCleanupEvery      = 1 * time.Minute
+	defaultBootstrapShardDepth   = 6
+	maxShardDepthForSubscription = 10
+	maxShardDepthForTreeDisplay  = 8
+	defaultNodeCleanupTimeout    = 350 * time.Second
+	replicationAnnounceTTL       = 350 * time.Second
+	monitorMinReplication        = 5
+	replicationCleanupEvery      = 1 * time.Minute
 	MonitorIdentityFile          = "monitor_identity.key"
 	siblingMoveCooldown          = 90 * time.Second // ignore sibling moves within this window (reduces 00↔01, 10↔11 oscillation; gossip-sub can delay 20–30s)
 	unpinGracePeriod             = 30 * time.Second // don't act on pinned=0 until this long after first discovery (avoids stale heartbeats)
@@ -50,28 +45,37 @@ type MonitorConfig struct {
 	TopicName           string
 }
 
-const DefaultPubsubTopicPrefix = "dlockss-v0.0.3"
-
-const DefaultTopicName = "creative-commons"
-
 func DefaultMonitorConfig() MonitorConfig {
 	return MonitorConfig{
-		NodeCleanupTimeout:  DefaultNodeCleanupTimeout,
-		BootstrapShardDepth: DefaultBootstrapShardDepth,
-		PubsubTopicPrefix:   DefaultPubsubTopicPrefix,
-		TopicName:           DefaultTopicName,
+		NodeCleanupTimeout:  defaultNodeCleanupTimeout,
+		BootstrapShardDepth: defaultBootstrapShardDepth,
+		PubsubTopicPrefix:   config.DefaultPubsubVersion,
+		TopicName:           config.DefaultTopicName,
 	}
 }
 
-type StatusResponse = common.StatusResponse
-type StorageStatus = common.StorageStatus
-type ReplicationStatus = common.ReplicationStatus
+// StatusResponse defines the JSON structure for monitor node views.
+type StatusResponse struct {
+	PeerID        string        `json:"peer_id"`
+	Version       string        `json:"version"`
+	CurrentShard  string        `json:"current_shard"`
+	Role          string        `json:"role,omitempty"`
+	PeersInShard  int           `json:"peers_in_shard"`
+	Storage       StorageStatus `json:"storage"`
+	UptimeSeconds float64       `json:"uptime_seconds"`
+}
 
-type NodeState struct {
+type StorageStatus struct {
+	PinnedFiles   int `json:"pinned_files"`
+	PinnedInShard int `json:"pinned_in_shard,omitempty"`
+	KnownFiles    int `json:"known_files"`
+}
+
+type nodeState struct {
 	PeerID         string              `json:"peer_id"`
 	NodeName       string              `json:"node_name,omitempty"`
 	CurrentShard   string              `json:"current_shard"`
-	Role           string              `json:"role,omitempty"` // ACTIVE, PASSIVE, REPLICATOR, or PROBE (empty = ACTIVE)
+	Role           string              `json:"role,omitempty"`
 	PinnedFiles    int                 `json:"pinned_files"`
 	KnownFiles     int                 `json:"known_files"`
 	LastSeen       time.Time           `json:"last_seen"`
@@ -106,10 +110,8 @@ type Monitor struct {
 	subCancel           context.CancelFunc // cancels subCtx
 	topicPrefixOverride string             // if set, overrides config.PubsubTopicPrefix for subscriptions
 	topicNameOverride   string             // if set, overrides config.TopicName for subscriptions
-	nodes               map[string]*NodeState
+	nodes               map[string]*nodeState
 	splitEvents         []ShardSplitEvent
-	geoDB               *geoip2.Reader // local GeoIP database; nil if not configured
-	geoCache            sync.Map       // IP → region string; cache for on-demand lookups
 	treeCache           *ShardTreeNode
 	treeCacheTime       time.Time
 	treeDirty           bool
@@ -123,17 +125,14 @@ type Monitor struct {
 	manifestShard       map[string]string // manifest CID → observed shard (from PINNED/IngestMessage announcements)
 	lastSplitTime       time.Time         // when we last detected a split; used to avoid pruning during mesh formation
 	peerLastSiblingMove map[string]siblingMoveRecord
-	done                chan struct{} // closed on shutdown to stop background goroutines
 }
 
 // siblingMoveRecord tracks the last sibling shard move for cooldown (reduces 0↔1 oscillation from stale messages).
 type siblingMoveRecord struct {
-	from string
-	to   string
 	when time.Time
 }
 
-func (n *NodeState) EffectiveShard() string {
+func (n *nodeState) EffectiveShard() string {
 	if n.CurrentShard != "" {
 		return n.CurrentShard
 	}
@@ -152,7 +151,7 @@ func shardLabel(shardID string) string {
 
 // isDisplayableNode returns false for PROBE nodes and the monitor itself.
 // ACTIVE, PASSIVE, and REPLICATOR nodes appear in the UI.
-func (m *Monitor) isDisplayableNodeUnlocked(peerID string, node *NodeState) bool {
+func (m *Monitor) isDisplayableNodeUnlocked(peerID string, node *nodeState) bool {
 	if node.Role == "PROBE" {
 		return false
 	}
@@ -192,46 +191,31 @@ func (m *Monitor) getTopicNameUnlocked() string {
 	return m.cfg.TopicName
 }
 
-// CIDEntry is a manifest CID with its observed shard and replica count.
-// Used by node-files, unique-cids, and replication-cids API responses.
-type CIDEntry struct {
+type cidEntry struct {
 	CID      string `json:"cid"`
 	Shard    string `json:"shard"`
 	Replicas int    `json:"replicas"`
 }
 
-// buildCIDEntries returns sorted CIDEntries for the given CID→time map.
-// Caller must hold m.mu at least as RLock.
-func (m *Monitor) buildCIDEntriesUnlocked(cids map[string]time.Time) []CIDEntry {
-	entries := make([]CIDEntry, 0, len(cids))
+func (m *Monitor) buildCIDEntriesUnlocked(cids map[string]time.Time) []cidEntry {
+	entries := make([]cidEntry, 0, len(cids))
 	for cidStr := range cids {
 		replicas := 0
 		if peers, ok := m.manifestReplication[cidStr]; ok {
 			replicas = len(peers)
 		}
 		shard := m.manifestShard[cidStr]
-		entries = append(entries, CIDEntry{CID: cidStr, Shard: shard, Replicas: replicas})
+		entries = append(entries, cidEntry{CID: cidStr, Shard: shard, Replicas: replicas})
 	}
 	sort.Slice(entries, func(i, j int) bool { return entries[i].CID < entries[j].CID })
 	return entries
 }
 
-func monitorDataDir() string {
-	homeDir, err := os.UserHomeDir()
-	if err != nil {
-		return ""
-	}
-	dir := filepath.Join(homeDir, ".dlockss-monitor")
-	os.MkdirAll(dir, 0700)
-	return dir
-}
-
-func NewMonitor(cfg MonitorConfig, geoDBPath string) *Monitor {
+func NewMonitor(cfg MonitorConfig) *Monitor {
 	m := &Monitor{
 		cfg:                 cfg,
-		nodes:               make(map[string]*NodeState),
+		nodes:               make(map[string]*nodeState),
 		splitEvents:         make([]ShardSplitEvent, 0, 100),
-		geoDB:               openGeoIPDB(geoDBPath),
 		uniqueCIDs:          make(map[string]time.Time),
 		shardTopics:         make(map[string]*shardSub),
 		nodeFiles:           make(map[string]map[string]time.Time),
@@ -239,12 +223,6 @@ func NewMonitor(cfg MonitorConfig, geoDBPath string) *Monitor {
 		peerShardLastSeen:   make(map[string]map[string]time.Time),
 		manifestShard:       make(map[string]string),
 		peerLastSiblingMove: make(map[string]siblingMoveRecord),
-		done:                make(chan struct{}),
-	}
-	if m.geoDB != nil {
-		slog.Info("geoip mode", "source", "local database")
-	} else {
-		slog.Info("geoip mode", "source", "ip-api.com")
 	}
 	go m.runReplicationCleanup()
 	return m
diff --git a/internal/monitor/monitor_replication.go b/internal/monitor/monitor_replication.go
index a5aeb1a..a6aa761 100644
--- a/internal/monitor/monitor_replication.go
+++ b/internal/monitor/monitor_replication.go
@@ -36,7 +36,7 @@ func (m *Monitor) newReplicationSnapshotUnlocked() replicationSnapshot {
 	return replicationSnapshot{
 		shardPeerCount: spc,
 		depth:          depth,
-		cutoff:         time.Now().Add(-ReplicationAnnounceTTL),
+		cutoff:         time.Now().Add(-replicationAnnounceTTL),
 		m:              m,
 	}
 }
@@ -73,7 +73,7 @@ func (rs *replicationSnapshot) buildShardCounts(peers map[string]time.Time) map[
 func (rs *replicationSnapshot) resolveManifest(manifest string, peers map[string]time.Time, shardCounts map[string]int) manifestResult {
 	targetShard := rs.m.manifestShard[manifest]
 	if targetShard == "" || rs.shardPeerCount[targetShard] == 0 {
-		targetShard, _ = effectiveTargetShardForManifest(manifest, rs.depth, rs.shardPeerCount)
+		targetShard = effectiveTargetShardForManifest(manifest, rs.depth, rs.shardPeerCount)
 	}
 
 	count := shardCounts[targetShard]
@@ -90,7 +90,7 @@ func (rs *replicationSnapshot) resolveManifest(manifest string, peers map[string
 
 	// Sibling aggregation: replicas split across children of the same parent.
 	if count > 0 && len(targetShard) >= 1 {
-		minRep := MonitorMinReplication
+		minRep := monitorMinReplication
 		if maxRep > 0 && minRep > maxRep {
 			minRep = maxRep
 		}
@@ -127,7 +127,7 @@ func (rs *replicationSnapshot) resolveManifest(manifest string, peers map[string
 
 // isAtTarget returns true if count is within the replication target range.
 func (mr manifestResult) isAtTarget() bool {
-	minRep := MonitorMinReplication
+	minRep := monitorMinReplication
 	if mr.maxRep > 0 && minRep > mr.maxRep {
 		minRep = mr.maxRep
 	}
@@ -135,16 +135,12 @@ func (mr manifestResult) isAtTarget() bool {
 }
 
 func (m *Monitor) runReplicationCleanup() {
-	ticker := time.NewTicker(ReplicationCleanupEvery)
+	ticker := time.NewTicker(replicationCleanupEvery)
 	defer ticker.Stop()
 	for {
-		select {
-		case <-m.done:
-			return
-		case <-ticker.C:
-		}
+		<-ticker.C
 		m.mu.Lock()
-		cutoff := time.Now().Add(-ReplicationAnnounceTTL)
+		cutoff := time.Now().Add(-replicationAnnounceTTL)
 		for cid, peers := range m.manifestReplication {
 			for peerID, lastSeen := range peers {
 				if lastSeen.Before(cutoff) {
@@ -180,7 +176,7 @@ func (m *Monitor) runReplicationCleanup() {
 		}
 		slog.Info("replication snapshot", "total_nodes", totalNodes, "total_manifests", totalFiles, "total_at_target", filesAtTarget, "avg_replication", fmt.Sprintf("%.2f", avgLevel), "shards", strings.TrimSpace(b.String()))
 		if filesAtTarget == 0 && totalFiles > 0 && totalNodes > 0 {
-			slog.Warn("total at target is zero", "replication_announce_ttl", ReplicationAnnounceTTL)
+			slog.Warn("total at target is zero", "replication_announce_ttl", replicationAnnounceTTL)
 		}
 		if totalFiles == 0 && totalNodes > 0 {
 			m.mu.RLock()
@@ -193,27 +189,15 @@ func (m *Monitor) runReplicationCleanup() {
 	}
 }
 
-func targetShardForManifest(manifestCIDStr string, depth int) string {
-	if depth <= 0 {
-		return ""
-	}
-	hexStr := common.KeyToStableHex(manifestCIDStr)
-	prefix, err := common.GetHexBinaryPrefix(hexStr, depth)
-	if err != nil {
-		return ""
-	}
-	return prefix
-}
-
-func effectiveTargetShardForManifest(manifestCIDStr string, depth int, shardPeerCount map[string]int) (targetShard string, maxRep int) {
+func effectiveTargetShardForManifest(manifestCIDStr string, depth int, shardPeerCount map[string]int) string {
 	for d := depth; d >= 0; d-- {
-		shard := targetShardForManifest(manifestCIDStr, d)
-		n := shardPeerCount[shard]
-		if n > 0 {
-			return shard, n
+		shard, err := common.TargetShardForPayload(manifestCIDStr, d)
+		if err != nil || shardPeerCount[shard] == 0 {
+			continue
 		}
+		return shard
 	}
-	return "", 0
+	return ""
 }
 
 func shardWithMostReplicas(shardCounts map[string]int, shardPeerCount map[string]int) string {
@@ -244,26 +228,6 @@ func sumDescendantReplicasAndNodes(manifestShardCounts map[string]int, shardPeer
 	return totalReplicas, totalNodes
 }
 
-func (m *Monitor) replicationNetworkDepth() int {
-	m.mu.RLock()
-	defer m.mu.RUnlock()
-	return m.replicationNetworkDepthUnlocked()
-}
-
-func (m *Monitor) replicationNetworkDepthUnlocked() int {
-	maxLen := 0
-	for id, node := range m.nodes {
-		if !m.isDisplayableNodeUnlocked(id, node) {
-			continue
-		}
-		shard := node.EffectiveShard()
-		if len(shard) > maxLen {
-			maxLen = len(shard)
-		}
-	}
-	return maxLen
-}
-
 func (m *Monitor) getReplicationStats() (distribution [11]int, avgLevel float64, filesAtTarget int) {
 	m.mu.RLock()
 	defer m.mu.RUnlock()
@@ -297,7 +261,7 @@ func (m *Monitor) getReplicationStats() (distribution [11]int, avgLevel float64,
 	return distribution, avgLevel, filesAtTarget
 }
 
-func (m *Monitor) getReplicationCIDsByLevel(level int) []CIDEntry {
+func (m *Monitor) getReplicationCIDsByLevel(level int) []cidEntry {
 	if level < 0 || level > 10 {
 		return nil
 	}
@@ -305,7 +269,7 @@ func (m *Monitor) getReplicationCIDsByLevel(level int) []CIDEntry {
 	defer m.mu.RUnlock()
 
 	rs := m.newReplicationSnapshotUnlocked()
-	var result []CIDEntry
+	var result []cidEntry
 
 	for cid, peers := range m.manifestReplication {
 		if len(peers) == 0 {
@@ -318,7 +282,7 @@ func (m *Monitor) getReplicationCIDsByLevel(level int) []CIDEntry {
 		}
 		matches := (level == 10 && mr.count >= 10) || (level < 10 && mr.count == level)
 		if matches {
-			result = append(result, CIDEntry{CID: cid, Shard: shardLabel(m.manifestShard[cid]), Replicas: mr.count})
+			result = append(result, cidEntry{CID: cid, Shard: shardLabel(m.manifestShard[cid]), Replicas: mr.count})
 		}
 	}
 	sort.Slice(result, func(i, j int) bool { return result[i].CID < result[j].CID })
diff --git a/internal/monitor/monitor_routes.go b/internal/monitor/monitor_routes.go
index 665d055..1c6f73b 100644
--- a/internal/monitor/monitor_routes.go
+++ b/internal/monitor/monitor_routes.go
@@ -50,7 +50,7 @@ type nodeSnap struct {
 	pinnedFiles   int
 }
 
-func (m *Monitor) snapshotNodes(filter func(id string, node *NodeState, shard string) bool) []nodeSnap {
+func (m *Monitor) snapshotNodes(filter func(id string, node *nodeState, shard string) bool) []nodeSnap {
 	m.PruneStaleNodes()
 	m.mu.RLock()
 	defer m.mu.RUnlock()
@@ -109,8 +109,7 @@ func (m *Monitor) buildNodeResponse(snapshot []nodeSnap) map[string]interface{}
 			CurrentShard:  s.currentShard,
 			Role:          s.role,
 			PeersInShard:  s.peersInShard,
-			Storage:       StorageStatus{PinnedFiles: s.pinnedFiles, PinnedInShard: pinnedInShard, KnownFiles: s.knownFiles, KnownCIDs: []string{}},
-			Replication:   ReplicationStatus{},
+			Storage:       StorageStatus{PinnedFiles: s.pinnedFiles, PinnedInShard: pinnedInShard, KnownFiles: s.knownFiles},
 			UptimeSeconds: s.uptimeSeconds,
 		}
 		response[s.id] = map[string]interface{}{
@@ -124,7 +123,7 @@ func (m *Monitor) buildNodeResponse(snapshot []nodeSnap) map[string]interface{}
 
 func (m *Monitor) handleNodes(w http.ResponseWriter, r *http.Request) {
 	query := strings.ToLower(r.URL.Query().Get("q"))
-	snapshot := m.snapshotNodes(func(id string, node *NodeState, _ string) bool {
+	snapshot := m.snapshotNodes(func(id string, node *nodeState, _ string) bool {
 		if query == "" {
 			return true
 		}
@@ -143,7 +142,7 @@ func (m *Monitor) handleShardTree(w http.ResponseWriter, r *http.Request) {
 
 func (m *Monitor) handleShardNodes(w http.ResponseWriter, r *http.Request) {
 	shardFilter := r.URL.Query().Get("shard")
-	snapshot := m.snapshotNodes(func(_ string, _ *NodeState, shard string) bool {
+	snapshot := m.snapshotNodes(func(_ string, _ *nodeState, shard string) bool {
 		return shard == shardFilter
 	})
 	response := m.buildNodeResponse(snapshot)
@@ -190,11 +189,11 @@ func (m *Monitor) handleNodeFiles(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 	m.mu.RLock()
-	var entries []CIDEntry
+	var entries []cidEntry
 	if files, ok := m.nodeFiles[peerID]; ok {
 		entries = m.buildCIDEntriesUnlocked(files)
 	} else {
-		entries = []CIDEntry{}
+		entries = []cidEntry{}
 	}
 	m.mu.RUnlock()
 	writeJSON(w, map[string]interface{}{"peer_id": peerID, "cids": entries, "count": len(entries)})
@@ -329,8 +328,6 @@ func (m *Monitor) handleIdentify(w http.ResponseWriter, r *http.Request) {
 		protoStrs = append(protoStrs, string(p))
 	}
 
-	region := m.resolveRegionFromAddrs(m.host.Peerstore().Addrs(pid))
-
 	result := map[string]interface{}{
 		"peer_id":          pid.String(),
 		"agent_version":    fmt.Sprintf("%v", agentVersion),
@@ -338,7 +335,6 @@ func (m *Monitor) handleIdentify(w http.ResponseWriter, r *http.Request) {
 		"protocols":        protoStrs,
 		"addresses":        addrStrs,
 		"connected":        connected,
-		"region":           region,
 	}
 	writeJSON(w, result)
 }
@@ -387,9 +383,6 @@ func (m *Monitor) RunStatusLogger(ctx context.Context) {
 	}
 }
 
-// Close releases resources held by the monitor (GeoIP database, etc.).
+// Close releases resources held by the monitor.
 func (m *Monitor) Close() {
-	if m.geoDB != nil {
-		m.geoDB.Close()
-	}
 }
diff --git a/internal/monitor/monitor_subscription.go b/internal/monitor/monitor_subscription.go
index e2a573f..89aaac3 100644
--- a/internal/monitor/monitor_subscription.go
+++ b/internal/monitor/monitor_subscription.go
@@ -5,6 +5,7 @@ import (
 	"errors"
 	"fmt"
 	"log/slog"
+	"net"
 	"strings"
 	"time"
 
@@ -69,7 +70,7 @@ func (m *Monitor) ensureShardSubscriptionUnlocked(ctx context.Context, shardID s
 	if m.ps == nil {
 		return
 	}
-	if len(shardID) > MaxShardDepthForSubscription {
+	if len(shardID) > maxShardDepthForSubscription {
 		return // Avoid subscribing to very deep shards (e.g. 16-bit IDs)
 	}
 	if _, exists := m.shardTopics[shardID]; exists {
@@ -177,14 +178,14 @@ func (m *Monitor) handleShardMessages(ctx context.Context, sub *pubsub.Subscript
 			m.dispatchJoin(ctx, data[5:], senderID, shardID, ip)
 
 		case hasPrefix(data, "PINNED:"):
-			m.handleHeartbeat(ctx, senderID, shardID, ip, -1)
+			m.handleHeartbeatWithRole(ctx, senderID, shardID, ip, -1, "", "")
 			if manifestCID, err := cid.Decode(string(data[7:])); err == nil {
 				im := schema.IngestMessage{SignedEnvelope: schema.SignedEnvelope{ManifestCID: manifestCID}, ShardID: shardID}
 				m.handleIngestMessage(ctx, &im, senderID, shardID, ip)
 			}
 
 		default:
-			m.handleHeartbeat(ctx, senderID, shardID, ip, -1)
+			m.handleHeartbeatWithRole(ctx, senderID, shardID, ip, -1, "", "")
 			var im schema.IngestMessage
 			if err := im.UnmarshalCBOR(data); err == nil {
 				m.dispatchIngestMessage(ctx, &im, senderID, shardID, ip)
@@ -204,7 +205,7 @@ func (m *Monitor) dispatchHeartbeat(ctx context.Context, data []byte, senderID p
 	}
 	parts := strings.SplitN(string(data), ":", 5)
 	if len(parts) < 2 || parts[1] == "" {
-		m.handleHeartbeat(ctx, senderID, shardID, ip, -1)
+		m.handleHeartbeatWithRole(ctx, senderID, shardID, ip, -1, "", "")
 		return
 	}
 
@@ -285,7 +286,7 @@ func (m *Monitor) dispatchIngestMessage(ctx context.Context, im *schema.IngestMe
 	}
 	m.handleIngestMessage(ctx, im, authorID, targetShard, ip)
 	m.ensureMinPinnedForPeer(ctx, authorID.String(), 1)
-	m.handleHeartbeat(ctx, authorID, targetShard, ip, -1)
+	m.handleHeartbeatWithRole(ctx, authorID, targetShard, ip, -1, "", "")
 }
 
 // subscribeToActiveShards runs in the background and periodically subscribes
@@ -310,7 +311,7 @@ func (m *Monitor) subscribeToActiveShardsPass(ctx context.Context) {
 	}
 	targets := m.collectShardTargets()
 	for shardID := range targets {
-		if len(shardID) <= MaxShardDepthForSubscription {
+		if len(shardID) <= maxShardDepthForSubscription {
 			m.ensureShardSubscription(ctx, shardID)
 		}
 	}
@@ -344,8 +345,12 @@ func (m *Monitor) collectShardTargets() map[string]bool {
 			targets["1"] = true
 		}
 	}
-	// Add children of every known shard so we catch imminent splits.
-	for shardID := range copyKeys(targets) {
+	// Snapshot keys so we can mutate targets while iterating.
+	existing := make([]string, 0, len(targets))
+	for k := range targets {
+		existing = append(existing, k)
+	}
+	for _, shardID := range existing {
 		c0, c1 := shardID+"0", shardID+"1"
 		if shardID == "" {
 			c0, c1 = "0", "1"
@@ -356,14 +361,6 @@ func (m *Monitor) collectShardTargets() map[string]bool {
 	return targets
 }
 
-func copyKeys(m map[string]bool) map[string]bool {
-	out := make(map[string]bool, len(m))
-	for k, v := range m {
-		out[k] = v
-	}
-	return out
-}
-
 // closeAllShardSubsUnlocked tears down the current subscription generation:
 // cancels the generation context (killing all goroutines immediately), then
 // cancels each subscription and closes the underlying topic. A fresh
@@ -384,7 +381,7 @@ func (m *Monitor) closeAllShardSubsUnlocked() {
 // clearNodeStateUnlocked resets all per-network state maps so the monitor
 // starts fresh after a topic switch. Caller must hold m.mu.
 func (m *Monitor) clearNodeStateUnlocked() {
-	m.nodes = make(map[string]*NodeState)
+	m.nodes = make(map[string]*nodeState)
 	m.splitEvents = m.splitEvents[:0]
 	m.uniqueCIDs = make(map[string]time.Time)
 	m.manifestReplication = make(map[string]map[string]time.Time)
@@ -435,3 +432,37 @@ func (m *Monitor) SwitchTopic(_ context.Context, newTopic string) {
 	m.resubscribeBootstrap()
 	slog.Info("switched topic name", "topic", effectiveTopic, "shards", 1<<(m.cfg.BootstrapShardDepth+1)-1)
 }
+
+func isPrivateIP(ipStr string) bool {
+	ip := net.ParseIP(ipStr)
+	if ip == nil {
+		return false
+	}
+	if ip.IsLoopback() || ip.IsLinkLocalUnicast() || ip.IsLinkLocalMulticast() {
+		return true
+	}
+	privateIPBlocks := []string{"10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"}
+	for _, cidr := range privateIPBlocks {
+		_, block, _ := net.ParseCIDR(cidr)
+		if block.Contains(ip) {
+			return true
+		}
+	}
+	return false
+}
+
+func preferPublicIP(ips []string) string {
+	var fallback string
+	for _, ip := range ips {
+		if ip == "" {
+			continue
+		}
+		if fallback == "" {
+			fallback = ip
+		}
+		if !isPrivateIP(ip) {
+			return ip
+		}
+	}
+	return fallback
+}
diff --git a/internal/monitor/monitor_tree.go b/internal/monitor/monitor_tree.go
index 0b8eb1e..e3c1664 100644
--- a/internal/monitor/monitor_tree.go
+++ b/internal/monitor/monitor_tree.go
@@ -15,7 +15,7 @@ func (m *Monitor) GetShardTree() *ShardTreeNode {
 
 	rawShardIDs := make(map[string]bool)
 	rawShardIDs[""] = true
-	maxDepth := MaxShardDepthForTreeDisplay
+	maxDepth := maxShardDepthForTreeDisplay
 	shardCounts := make(map[string]int)
 	for id, n := range m.nodes {
 		if !m.isDisplayableNodeUnlocked(id, n) {
diff --git a/internal/monitor/static/dashboard.html b/internal/monitor/static/dashboard.html
index eeae596..72091a0 100644
--- a/internal/monitor/static/dashboard.html
+++ b/internal/monitor/static/dashboard.html
@@ -355,11 +355,9 @@ <h3 style="margin:0; text-transform:uppercase; font-size:1em;">Network Nodes</h3
                 const connected = data.connected ? '<span style="color:#4ecdc4;">connected</span>' : '<span style="color:#ff6b6b;">not connected</span>';
                 const addrs = (data.addresses || []).map(a => '<div style="padding:2px 0; font-size:0.85em;">' + escapeHtml(a) + '</div>').join('') || '<span style="color:#666;">none</span>';
                 const protos = (data.protocols || []).map(p => '<span style="display:inline-block; background:#ccc; border:1px solid #444; border-radius:3px; padding:1px 6px; margin:2px; font-size:0.8em;">' + escapeHtml(p) + '</span>').join('') || '<span style="color:#666;">none</span>';
-                const regionVal = data.region ? escapeHtml(data.region) : '<span style="color:#999;">unknown</span>';
                 c.innerHTML = '<table style="width:100%; border-collapse:collapse;">' +
                     '<tr><td style="padding:6px 8px; color:#999; width:130px; vertical-align:top;">Peer ID</td><td style="padding:6px 8px; word-break:break-all;">' + escapeHtml(data.peer_id || peerId) + '</td></tr>' +
                     '<tr><td style="padding:6px 8px; color:#999; vertical-align:top;">Status</td><td style="padding:6px 8px;">' + connected + '</td></tr>' +
-                    '<tr><td style="padding:6px 8px; color:#999; vertical-align:top;">Region</td><td style="padding:6px 8px;">' + regionVal + '</td></tr>' +
                     '<tr><td style="padding:6px 8px; color:#999; vertical-align:top;">Agent</td><td style="padding:6px 8px;">' + escapeHtml(data.agent_version || '-') + '</td></tr>' +
                     '<tr><td style="padding:6px 8px; color:#999; vertical-align:top;">Protocol</td><td style="padding:6px 8px;">' + escapeHtml(data.protocol_version || '-') + '</td></tr>' +
                     '<tr><td style="padding:6px 8px; color:#999; vertical-align:top;">Addresses</td><td style="padding:6px 8px;">' + addrs + '</td></tr>' +
diff --git a/internal/signing/nonce.go b/internal/signing/nonce.go
index ab7a114..7722bc3 100644
--- a/internal/signing/nonce.go
+++ b/internal/signing/nonce.go
@@ -1,6 +1,7 @@
 package signing
 
 import (
+	"crypto/rand"
 	"encoding/hex"
 	"sync"
 	"time"
@@ -8,6 +9,12 @@ import (
 	"github.com/libp2p/go-libp2p/core/peer"
 )
 
+func newNonce(n int) ([]byte, error) {
+	b := make([]byte, n)
+	_, err := rand.Read(b)
+	return b, err
+}
+
 func nonceKey(sender peer.ID, nonce []byte) string {
 	return sender.String() + ":" + hex.EncodeToString(nonce)
 }
diff --git a/internal/signing/nonce_test.go b/internal/signing/nonce_test.go
index 16a797c..b273dc9 100644
--- a/internal/signing/nonce_test.go
+++ b/internal/signing/nonce_test.go
@@ -4,46 +4,45 @@ import (
 	"testing"
 	"time"
 
-	"dlockss/internal/common"
 	"dlockss/internal/testutil"
 )
 
 func TestNewNonce_ReturnsCorrectLength(t *testing.T) {
 	for _, size := range []int{1, 16, 32, 64} {
-		nonce, err := common.NewNonce(size)
+		nonce, err := newNonce(size)
 		if err != nil {
-			t.Fatalf("NewNonce(%d): %v", size, err)
+			t.Fatalf("newNonce(%d): %v", size, err)
 		}
 		if len(nonce) != size {
-			t.Errorf("NewNonce(%d) len=%d, want %d", size, len(nonce), size)
+			t.Errorf("newNonce(%d) len=%d, want %d", size, len(nonce), size)
 		}
 	}
 }
 
 func TestNewNonce_TwoCallsProduceDifferentValues(t *testing.T) {
-	n1, err := common.NewNonce(32)
+	n1, err := newNonce(32)
 	if err != nil {
-		t.Fatalf("NewNonce: %v", err)
+		t.Fatalf("newNonce: %v", err)
 	}
-	n2, err := common.NewNonce(32)
+	n2, err := newNonce(32)
 	if err != nil {
-		t.Fatalf("NewNonce: %v", err)
+		t.Fatalf("newNonce: %v", err)
 	}
 	if string(n1) == string(n2) {
-		t.Error("two NewNonce calls produced identical values")
+		t.Error("two newNonce calls produced identical values")
 	}
 }
 
 func TestNewNonce_ZeroSizeWorks(t *testing.T) {
-	nonce, err := common.NewNonce(0)
+	nonce, err := newNonce(0)
 	if err != nil {
-		t.Fatalf("NewNonce(0): %v", err)
+		t.Fatalf("newNonce(0): %v", err)
 	}
 	if nonce == nil {
-		t.Error("NewNonce(0) returned nil")
+		t.Error("newNonce(0) returned nil")
 	}
 	if len(nonce) != 0 {
-		t.Errorf("NewNonce(0) len=%d, want 0", len(nonce))
+		t.Errorf("newNonce(0) len=%d, want 0", len(nonce))
 	}
 }
 
@@ -66,9 +65,9 @@ func TestSeenBefore_RejectsSameNonceTwice(t *testing.T) {
 func TestSeenBefore_AllowsFreshNonce(t *testing.T) {
 	ns := newNonceStore()
 	pid := testutil.MustPeerID(t, "sender-2")
-	nonce, err := common.NewNonce(16)
+	nonce, err := newNonce(16)
 	if err != nil {
-		t.Fatalf("NewNonce: %v", err)
+		t.Fatalf("newNonce: %v", err)
 	}
 	ttl := 10 * time.Minute
 
diff --git a/internal/signing/signing.go b/internal/signing/signing.go
index a004d94..9e73d38 100644
--- a/internal/signing/signing.go
+++ b/internal/signing/signing.go
@@ -24,7 +24,7 @@ const maxNonceSize = 64
 const minNonceSizeFloor = 1
 
 func (s *Signer) effectiveMinNonceSize() int {
-	n := s.cfg.MinNonceSize
+	n := s.cfg.Security.MinNonceSize
 	if n < minNonceSizeFloor {
 		n = minNonceSizeFloor
 	}
@@ -35,7 +35,7 @@ func (s *Signer) effectiveMinNonceSize() int {
 }
 
 func (s *Signer) effectiveNonceSizeForSigning() int {
-	n := s.cfg.NonceSize
+	n := s.cfg.Security.NonceSize
 	minSize := s.effectiveMinNonceSize()
 	if n < minSize {
 		n = minSize
@@ -47,8 +47,8 @@ func (s *Signer) effectiveNonceSizeForSigning() int {
 }
 
 func (s *Signer) effectiveSignatureMaxAge() time.Duration {
-	if s.cfg.SignatureMaxAge > 0 {
-		return s.cfg.SignatureMaxAge
+	if s.cfg.Security.SignatureMaxAge > 0 {
+		return s.cfg.Security.SignatureMaxAge
 	}
 	return 10 * time.Minute
 }
@@ -56,7 +56,7 @@ func (s *Signer) effectiveSignatureMaxAge() time.Duration {
 const maxFutureSkewCap = 5 * time.Minute
 
 func (s *Signer) effectiveFutureSkewTolerance() time.Duration {
-	d := s.cfg.FutureSkewTolerance
+	d := s.cfg.Security.FutureSkewTolerance
 	if d <= 0 {
 		return 30 * time.Second
 	}
@@ -67,7 +67,7 @@ func (s *Signer) effectiveFutureSkewTolerance() time.Duration {
 }
 
 func (s *Signer) effectiveNonceTTL() time.Duration {
-	ttl := s.cfg.SignatureMaxAge
+	ttl := s.cfg.Security.SignatureMaxAge
 	if ttl <= 0 {
 		ttl = 10 * time.Minute
 	}
@@ -107,11 +107,11 @@ func NewSigner(cfg SignerConfig) *Signer {
 }
 
 func (s *Signer) shouldEnforceSignatures() bool {
-	return s.cfg.SignatureMode == "strict" ||
-		(s.cfg.SignatureMode != "off" && s.cfg.SignatureMode != "warn")
+	return s.cfg.Security.SignatureMode == "strict" ||
+		(s.cfg.Security.SignatureMode != "off" && s.cfg.Security.SignatureMode != "warn")
 }
-func (s *Signer) shouldWarnOnBadSignatures() bool { return s.cfg.SignatureMode == "warn" }
-func (s *Signer) signaturesDisabled() bool        { return s.cfg.SignatureMode == "off" }
+func (s *Signer) shouldWarnOnBadSignatures() bool { return s.cfg.Security.SignatureMode == "warn" }
+func (s *Signer) signaturesDisabled() bool        { return s.cfg.Security.SignatureMode == "off" }
 
 func (s *Signer) signMessageEnvelope(marshalForSigning func() ([]byte, error), setSig func([]byte)) error {
 	if s.privKey == nil {
@@ -134,7 +134,7 @@ func (s *Signer) SignProtocolMessage(msg schema.Signable) error {
 	if msg == nil {
 		return fmt.Errorf("message is nil")
 	}
-	nonce, err := common.NewNonce(s.effectiveNonceSizeForSigning())
+	nonce, err := newNonce(s.effectiveNonceSizeForSigning())
 	if err != nil {
 		return err
 	}
@@ -153,6 +153,41 @@ func (s *Signer) verifySignedMessage(receivedFrom peer.ID, sender peer.ID, ts in
 	if s.signaturesDisabled() {
 		return nil
 	}
+	if err := s.validateMessageFields(receivedFrom, sender, ts, nonce, sig, unsigned); err != nil {
+		return err
+	}
+
+	maxAge := s.effectiveSignatureMaxAge()
+	now := time.Now()
+	msgTime := time.Unix(ts, 0)
+
+	pk := s.h.Peerstore().PubKey(sender)
+	if pk == nil {
+		pk, now = s.fetchPublicKey(sender)
+		if pk == nil {
+			return fmt.Errorf("missing public key for sender %s", sender.String())
+		}
+	}
+
+	if err := s.checkTimestamp(msgTime, now, maxAge); err != nil {
+		return err
+	}
+	if err := verifySignatureBytes(pk, unsigned, sig); err != nil {
+		return err
+	}
+
+	if s.nonces == nil {
+		return fmt.Errorf("nonce store missing")
+	}
+	nonceSnapshot := make([]byte, len(nonce))
+	copy(nonceSnapshot, nonce)
+	if s.nonces.seenBefore(sender, nonceSnapshot, s.effectiveNonceTTL()) {
+		return errReplay
+	}
+	return nil
+}
+
+func (s *Signer) validateMessageFields(receivedFrom, sender peer.ID, ts int64, nonce, sig, unsigned []byte) error {
 	if s.h == nil {
 		return fmt.Errorf("signer host is nil")
 	}
@@ -168,15 +203,6 @@ func (s *Signer) verifySignedMessage(receivedFrom peer.ID, sender peer.ID, ts in
 	if ts == 0 {
 		return fmt.Errorf("missing timestamp")
 	}
-	maxAge := s.effectiveSignatureMaxAge()
-	now := time.Now()
-	msgTime := time.Unix(ts, 0)
-	if msgTime.After(now.Add(s.effectiveFutureSkewTolerance())) {
-		return fmt.Errorf("timestamp too far in future: %v", msgTime)
-	}
-	if now.Sub(msgTime) > maxAge {
-		return fmt.Errorf("message too old: age=%v", now.Sub(msgTime))
-	}
 	if len(nonce) < s.effectiveMinNonceSize() {
 		return fmt.Errorf("nonce too short")
 	}
@@ -189,53 +215,46 @@ func (s *Signer) verifySignedMessage(receivedFrom peer.ID, sender peer.ID, ts in
 	if len(unsigned) == 0 {
 		return fmt.Errorf("empty message for verification")
 	}
+	return nil
+}
 
-	pk := s.h.Peerstore().PubKey(sender)
-	if pk == nil {
-		if s.h.Network().Connectedness(sender) != network.Connected {
-			ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
-			defer cancel()
-			addrs := s.h.Peerstore().Addrs(sender)
-			if len(addrs) == 0 && s.dht != nil {
-				addrInfo, err := s.dht.FindPeer(ctx, sender)
-				if err == nil {
-					s.h.Peerstore().AddAddrs(addrInfo.ID, addrInfo.Addrs, 10*time.Minute)
-					addrs = addrInfo.Addrs
-				}
-			}
-			if len(addrs) > 0 {
-				_ = s.h.Connect(ctx, peer.AddrInfo{ID: sender, Addrs: addrs})
+func (s *Signer) checkTimestamp(msgTime, now time.Time, maxAge time.Duration) error {
+	if msgTime.After(now.Add(s.effectiveFutureSkewTolerance())) {
+		return fmt.Errorf("timestamp too far in future: %v", msgTime)
+	}
+	if now.Sub(msgTime) > maxAge {
+		return fmt.Errorf("message too old: age=%v", now.Sub(msgTime))
+	}
+	return nil
+}
+
+func (s *Signer) fetchPublicKey(sender peer.ID) (crypto.PubKey, time.Time) {
+	if s.h.Network().Connectedness(sender) != network.Connected {
+		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
+		defer cancel()
+		addrs := s.h.Peerstore().Addrs(sender)
+		if len(addrs) == 0 && s.dht != nil {
+			addrInfo, err := s.dht.FindPeer(ctx, sender)
+			if err == nil {
+				s.h.Peerstore().AddAddrs(addrInfo.ID, addrInfo.Addrs, 10*time.Minute)
+				addrs = addrInfo.Addrs
 			}
 		}
-		pk = s.h.Peerstore().PubKey(sender)
-		if pk == nil {
-			return fmt.Errorf("missing public key for sender %s", sender.String())
-		}
-		now = time.Now()
-		if msgTime.After(now.Add(s.effectiveFutureSkewTolerance())) {
-			return fmt.Errorf("timestamp too far in future after key fetch: %v", msgTime)
-		}
-		if now.Sub(msgTime) > maxAge {
-			return fmt.Errorf("message too old after key fetch: age=%v", now.Sub(msgTime))
+		if len(addrs) > 0 {
+			_ = s.h.Connect(ctx, peer.AddrInfo{ID: sender, Addrs: addrs})
 		}
 	}
+	return s.h.Peerstore().PubKey(sender), time.Now()
+}
 
-	ok, err := pk.Verify(unsigned, sig)
+func verifySignatureBytes(pk crypto.PubKey, payload, sig []byte) error {
+	ok, err := pk.Verify(payload, sig)
 	if err != nil {
 		return fmt.Errorf("signature verify error: %w", err)
 	}
 	if !ok {
 		return fmt.Errorf("invalid signature")
 	}
-
-	if s.nonces == nil {
-		return fmt.Errorf("nonce store missing")
-	}
-	nonceSnapshot := make([]byte, len(nonce))
-	copy(nonceSnapshot, nonce)
-	if s.nonces.seenBefore(sender, nonceSnapshot, s.effectiveNonceTTL()) {
-		return errReplay
-	}
 	return nil
 }
 
diff --git a/internal/syncmap/syncmap.go b/internal/syncmap/syncmap.go
index f8a5c2a..bf5ba82 100644
--- a/internal/syncmap/syncmap.go
+++ b/internal/syncmap/syncmap.go
@@ -20,12 +20,6 @@ func (s *Map[K, V]) Get(key K) (V, bool) {
 	return v, ok
 }
 
-func (s *Map[K, V]) Set(key K, val V) {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	s.m[key] = val
-}
-
 // Upsert sets the value and returns true if the key was new.
 func (s *Map[K, V]) Upsert(key K, val V) (isNew bool) {
 	s.mu.Lock()
@@ -47,12 +41,6 @@ func (s *Map[K, V]) SetIfAbsent(key K, val V) bool {
 	return true
 }
 
-func (s *Map[K, V]) Delete(key K) {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	delete(s.m, key)
-}
-
 // DeleteAndGet atomically removes and returns the value.
 func (s *Map[K, V]) DeleteAndGet(key K) (V, bool) {
 	s.mu.Lock()
@@ -109,18 +97,3 @@ func (s *Map[K, V]) ReplaceAll(m map[K]V) {
 	}
 	s.m = cp
 }
-
-// Prune removes entries matching a predicate under a single write lock.
-// Returns the number of entries removed.
-func (s *Map[K, V]) Prune(shouldRemove func(K, V) bool) int {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	removed := 0
-	for k, v := range s.m {
-		if shouldRemove(k, v) {
-			delete(s.m, k)
-			removed++
-		}
-	}
-	return removed
-}
diff --git a/internal/syncmap/syncmap_test.go b/internal/syncmap/syncmap_test.go
index 4014561..c8dabc9 100644
--- a/internal/syncmap/syncmap_test.go
+++ b/internal/syncmap/syncmap_test.go
@@ -5,10 +5,10 @@ import (
 	"testing"
 )
 
-func TestSetAndGet(t *testing.T) {
+func TestUpsertAndGet(t *testing.T) {
 	m := New[string, int]()
-	m.Set("a", 1)
-	m.Set("b", 2)
+	m.Upsert("a", 1)
+	m.Upsert("b", 2)
 
 	if v, ok := m.Get("a"); !ok || v != 1 {
 		t.Errorf("Get(\"a\") = %v, %v; want 1, true", v, ok)
@@ -20,7 +20,7 @@ func TestSetAndGet(t *testing.T) {
 
 func TestGet_MissingKeyReturnsZeroValueAndFalse(t *testing.T) {
 	m := New[string, int]()
-	m.Set("a", 1)
+	m.Upsert("a", 1)
 
 	v, ok := m.Get("missing")
 	if ok {
@@ -31,26 +31,29 @@ func TestGet_MissingKeyReturnsZeroValueAndFalse(t *testing.T) {
 	}
 }
 
-func TestDelete_RemovesEntry(t *testing.T) {
+func TestDeleteAndGet_RemovesEntry(t *testing.T) {
 	m := New[string, int]()
-	m.Set("a", 1)
-	m.Set("b", 2)
+	m.Upsert("a", 1)
+	m.Upsert("b", 2)
 
-	m.Delete("a")
+	v, ok := m.DeleteAndGet("a")
+	if !ok || v != 1 {
+		t.Errorf("DeleteAndGet(\"a\") = %v, %v; want 1, true", v, ok)
+	}
 
 	if _, ok := m.Get("a"); ok {
-		t.Error("Get(\"a\") after Delete: ok=true, want false")
+		t.Error("Get(\"a\") after DeleteAndGet: ok=true, want false")
 	}
 	if v, ok := m.Get("b"); !ok || v != 2 {
-		t.Errorf("Get(\"b\") after Delete(\"a\") = %v, %v; want 2, true", v, ok)
+		t.Errorf("Get(\"b\") after DeleteAndGet(\"a\") = %v, %v; want 2, true", v, ok)
 	}
 }
 
 func TestSnapshot_IteratesAllEntries(t *testing.T) {
 	m := New[string, int]()
-	m.Set("a", 1)
-	m.Set("b", 2)
-	m.Set("c", 3)
+	m.Upsert("a", 1)
+	m.Upsert("b", 2)
+	m.Upsert("c", 3)
 
 	seen := make(map[string]int)
 	for k, v := range m.Snapshot() {
@@ -73,15 +76,15 @@ func TestLen_ReturnsCorrectCount(t *testing.T) {
 		t.Errorf("empty map Len() = %d, want 0", m.Len())
 	}
 
-	m.Set("a", 1)
-	m.Set("b", 2)
+	m.Upsert("a", 1)
+	m.Upsert("b", 2)
 	if m.Len() != 2 {
 		t.Errorf("Len() = %d, want 2", m.Len())
 	}
 
-	m.Delete("a")
+	m.DeleteAndGet("a")
 	if m.Len() != 1 {
-		t.Errorf("after Delete Len() = %d, want 1", m.Len())
+		t.Errorf("after DeleteAndGet Len() = %d, want 1", m.Len())
 	}
 }
 
@@ -94,7 +97,7 @@ func TestConcurrentAccess(t *testing.T) {
 		wg.Add(1)
 		go func(k int) {
 			defer wg.Done()
-			m.Set(k, k*2)
+			m.Upsert(k, k*2)
 		}(i)
 	}
 
diff --git a/internal/telemetry/metrics.go b/internal/telemetry/metrics.go
deleted file mode 100644
index 19faf4d..0000000
--- a/internal/telemetry/metrics.go
+++ /dev/null
@@ -1,529 +0,0 @@
-package telemetry
-
-import (
-	"context"
-	"encoding/csv"
-	"fmt"
-	"log/slog"
-	"os"
-	"path/filepath"
-	"strconv"
-	"sync"
-	"time"
-
-	"dlockss/internal/common"
-	"dlockss/internal/config"
-
-	"github.com/prometheus/client_golang/prometheus"
-)
-
-var (
-	// Prometheus Metrics
-	promMessagesReceived = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "dlockss_messages_received_total",
-		Help: "Total number of P2P messages received",
-	})
-	promMessagesDropped = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "dlockss_messages_dropped_total",
-		Help: "P2P messages dropped (rate limit or error)",
-	})
-	promReplicationChecks = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "dlockss_replication_checks_total",
-		Help: "Total number of replication checks performed",
-	})
-	promReplicationSuccess = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "dlockss_replication_success_total",
-		Help: "Total number of successful replication checks",
-	})
-	promReplicationFailures = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "dlockss_replication_failures_total",
-		Help: "Total number of failed replication checks",
-	})
-	promDHTQueries = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "dlockss_dht_queries_total",
-		Help: "Total number of DHT queries performed",
-	})
-	promDHTTimeouts = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "dlockss_dht_timeouts_total",
-		Help: "Total number of DHT queries that timed out",
-	})
-	promShardSplits = prometheus.NewCounter(prometheus.CounterOpts{
-		Name: "dlockss_shard_splits_total",
-		Help: "Total number of shard split events",
-	})
-
-	// Gauges
-	promPinnedFiles = prometheus.NewGauge(prometheus.GaugeOpts{
-		Name: "dlockss_pinned_files",
-		Help: "Current number of files pinned locally",
-	})
-	promKnownFiles = prometheus.NewGauge(prometheus.GaugeOpts{
-		Name: "dlockss_known_files",
-		Help: "Current number of files tracked in known files",
-	})
-	promActivePeers = prometheus.NewGauge(prometheus.GaugeOpts{
-		Name: "dlockss_active_peers",
-		Help: "Number of peers in the current shard",
-	})
-	// Cluster-style metrics (per shard, from CRDT)
-	promClusterPinsTotal = prometheus.NewGaugeVec(prometheus.GaugeOpts{
-		Name: "dlockss_cluster_pins_total",
-		Help: "Number of pins in the shard's CRDT consensus state",
-	}, []string{"shard"})
-	promClusterPeersTotal = prometheus.NewGaugeVec(prometheus.GaugeOpts{
-		Name: "dlockss_cluster_peers_total",
-		Help: "Number of peers in the shard's CRDT cluster (from PeerMonitor)",
-	}, []string{"shard"})
-	promClusterAllocationsTotal = prometheus.NewGaugeVec(prometheus.GaugeOpts{
-		Name: "dlockss_cluster_allocations_total",
-		Help: "Total allocation count in the shard (sum of len(allocations) over all pins)",
-	}, []string{"shard"})
-)
-
-func init() {
-	// Register metrics
-	prometheus.MustRegister(
-		promMessagesReceived,
-		promMessagesDropped,
-		promReplicationChecks,
-		promReplicationSuccess,
-		promReplicationFailures,
-		promDHTQueries,
-		promDHTTimeouts,
-		promShardSplits,
-		promPinnedFiles,
-		promKnownFiles,
-		promActivePeers,
-		promClusterPinsTotal,
-		promClusterPeersTotal,
-		promClusterAllocationsTotal,
-	)
-}
-
-// Interfaces for dependencies
-type ShardInfoProvider interface {
-	GetShardInfo() (string, int)
-}
-
-type StorageInfoProvider interface {
-	GetStorageStatus() common.StorageSnapshot
-	GetReplicationLevels() map[string]int
-}
-
-// ClusterInfoProvider supplies cluster-style metrics (pins/peers/allocations per shard).
-type ClusterInfoProvider interface {
-	GetClusterMetrics(ctx context.Context) (pinsPerShard, peersPerShard, allocationsTotalPerShard map[string]int, err error)
-}
-
-type MetricsManager struct {
-	mu  sync.RWMutex
-	cfg *config.Config
-
-	peerID string
-
-	// Metrics state
-	pinnedFilesCount              int
-	knownFilesCount               int
-	messagesReceived              int64
-	messagesDropped               int64
-	replicationChecks             int64
-	replicationSuccess            int64
-	replicationFailures           int64
-	shardSplits                   int64
-	workerPoolActive              int
-	rateLimitedPeers              int
-	filesInBackoff                int
-	lowReplicationFiles           int
-	highReplicationFiles          int
-	dhtQueries                    int64
-	dhtQueryTimeouts              int64
-	lastReportTime                time.Time
-	startTime                     time.Time
-	replicationDistribution       [11]int
-	filesAtTargetReplication      int
-	avgReplicationLevel           float64
-	filesConvergedTotal           int64
-	filesConvergedThisPeriod      int64
-	cumulativeMessagesReceived    int64
-	cumulativeMessagesDropped     int64
-	cumulativeReplicationChecks   int64
-	cumulativeReplicationSuccess  int64
-	cumulativeReplicationFailures int64
-	cumulativeDhtQueries          int64
-	cumulativeDhtQueryTimeouts    int64
-	cumulativeShardSplits         int64
-
-	// Providers
-	shardInfo   ShardInfoProvider
-	storageInfo StorageInfoProvider
-	clusterInfo ClusterInfoProvider
-	rateLimiter *common.RateLimiter
-}
-
-func NewMetricsManager(cfg *config.Config) *MetricsManager {
-	return &MetricsManager{
-		cfg:            cfg,
-		lastReportTime: time.Now(),
-		startTime:      time.Now(),
-	}
-}
-
-func (m *MetricsManager) SetPeerID(peerID string) {
-	m.mu.Lock()
-	defer m.mu.Unlock()
-	m.peerID = peerID
-}
-
-// RegisterProviders registers components that provide metrics.
-func (m *MetricsManager) RegisterProviders(s ShardInfoProvider, st StorageInfoProvider, rl *common.RateLimiter) {
-	m.mu.Lock()
-	defer m.mu.Unlock()
-	m.shardInfo = s
-	m.storageInfo = st
-	m.rateLimiter = rl
-}
-
-// RegisterClusterProvider registers the cluster metrics provider (pins/peers/allocations per shard).
-func (m *MetricsManager) RegisterClusterProvider(c ClusterInfoProvider) {
-	m.mu.Lock()
-	defer m.mu.Unlock()
-	m.clusterInfo = c
-}
-
-func (m *MetricsManager) IncrementMessagesReceived() {
-	m.mu.Lock()
-	m.messagesReceived++
-	m.cumulativeMessagesReceived++
-	m.mu.Unlock()
-	promMessagesReceived.Inc()
-}
-
-func (m *MetricsManager) IncrementMessagesDropped() {
-	m.mu.Lock()
-	m.messagesDropped++
-	m.cumulativeMessagesDropped++
-	m.mu.Unlock()
-	promMessagesDropped.Inc()
-}
-
-func (m *MetricsManager) IncrementReplicationChecks() {
-	m.mu.Lock()
-	m.replicationChecks++
-	m.cumulativeReplicationChecks++
-	m.mu.Unlock()
-	promReplicationChecks.Inc()
-}
-
-func (m *MetricsManager) IncrementReplicationSuccess() {
-	m.mu.Lock()
-	m.replicationSuccess++
-	m.cumulativeReplicationSuccess++
-	m.mu.Unlock()
-	promReplicationSuccess.Inc()
-}
-
-func (m *MetricsManager) IncrementReplicationFailures() {
-	m.mu.Lock()
-	m.replicationFailures++
-	m.cumulativeReplicationFailures++
-	m.mu.Unlock()
-	promReplicationFailures.Inc()
-}
-
-// IncrementDHTQueries increments the number of DHT queries.
-func (m *MetricsManager) IncrementDHTQueries() {
-	m.mu.Lock()
-	m.dhtQueries++
-	m.cumulativeDhtQueries++
-	m.mu.Unlock()
-	promDHTQueries.Inc()
-}
-
-// IncrementDHTQueryTimeouts increments the number of DHT query timeouts.
-func (m *MetricsManager) IncrementDHTQueryTimeouts() {
-	m.mu.Lock()
-	m.dhtQueryTimeouts++
-	m.cumulativeDhtQueryTimeouts++
-	m.mu.Unlock()
-	promDHTTimeouts.Inc()
-}
-
-// IncrementShardSplits increments the number of shard splits.
-func (m *MetricsManager) IncrementShardSplits() {
-	m.mu.Lock()
-	m.shardSplits++
-	m.cumulativeShardSplits++
-	m.mu.Unlock()
-	promShardSplits.Inc()
-}
-
-func (m *MetricsManager) IncrementFilesConverged() {
-	m.mu.Lock()
-	m.filesConvergedTotal++
-	m.filesConvergedThisPeriod++
-	m.mu.Unlock()
-}
-
-func (m *MetricsManager) SetPinnedFilesCount(count int) {
-	m.mu.Lock()
-	m.pinnedFilesCount = count
-	m.mu.Unlock()
-	promPinnedFiles.Set(float64(count))
-}
-
-func (m *MetricsManager) SetKnownFilesCount(count int) {
-	m.mu.Lock()
-	m.knownFilesCount = count
-	m.mu.Unlock()
-	promKnownFiles.Set(float64(count))
-}
-
-func (m *MetricsManager) RunMetricsReporter(ctx context.Context) {
-	ticker := time.NewTicker(m.cfg.MetricsReportInterval)
-	defer ticker.Stop()
-
-	for {
-		select {
-		case <-ctx.Done():
-			return
-		case <-ticker.C:
-			m.UpdateGauges()
-			m.ReportMetrics()
-		}
-	}
-}
-
-func (m *MetricsManager) UpdateGauges() {
-	m.mu.Lock()
-	defer m.mu.Unlock()
-
-	// Use stored values for pinned/known since they are pushed by storage
-	promPinnedFiles.Set(float64(m.pinnedFilesCount))
-	promKnownFiles.Set(float64(m.knownFilesCount))
-
-	if m.shardInfo != nil {
-		_, activePeers := m.shardInfo.GetShardInfo()
-		promActivePeers.Set(float64(activePeers))
-	}
-
-	// Cluster-style metrics (pins/peers/allocations per shard)
-	if m.clusterInfo != nil {
-		pins, peers, allocs, err := m.clusterInfo.GetClusterMetrics(context.Background())
-		if err == nil {
-			for shard, count := range pins {
-				promClusterPinsTotal.WithLabelValues(shard).Set(float64(count))
-			}
-			for shard, count := range peers {
-				promClusterPeersTotal.WithLabelValues(shard).Set(float64(count))
-			}
-			for shard, count := range allocs {
-				promClusterAllocationsTotal.WithLabelValues(shard).Set(float64(count))
-			}
-		}
-	}
-}
-
-func (m *MetricsManager) ReportMetrics() {
-	m.mu.RLock()
-	now := time.Now()
-	elapsed := now.Sub(m.lastReportTime)
-	minutes := elapsed.Minutes()
-	if minutes < 0.1 {
-		minutes = 0.1
-	}
-
-	msgRate := float64(m.messagesReceived) / minutes
-	dropRate := float64(m.messagesDropped) / minutes
-
-	shardID := ""
-	activePeers := 0
-	if m.shardInfo != nil {
-		shardID, activePeers = m.shardInfo.GetShardInfo()
-	}
-
-	rateLimitedPeers := 0
-	if m.rateLimiter != nil {
-		rateLimitedPeers = m.rateLimiter.Size()
-	}
-
-	backoffCount := 0
-	levelsMap := make(map[string]int)
-	if m.storageInfo != nil {
-		backoffCount = m.storageInfo.GetStorageStatus().BackoffCount
-		levelsMap = m.storageInfo.GetReplicationLevels()
-	}
-
-	m.mu.RUnlock() // Unlock for calculation
-
-	distribution := [11]int{}
-	totalFiles := 0
-	totalReplication := 0
-	for _, count := range levelsMap {
-		if count >= 10 {
-			distribution[10]++
-		} else {
-			distribution[count]++
-		}
-		totalFiles++
-		totalReplication += count
-	}
-
-	avgReplication := 0.0
-	if totalFiles > 0 {
-		avgReplication = float64(totalReplication) / float64(totalFiles)
-	}
-
-	filesAtTarget := 0
-	for _, count := range levelsMap {
-		if count >= m.cfg.MinReplication && count <= m.cfg.MaxReplication {
-			filesAtTarget++
-		}
-	}
-
-	lowReplication := 0
-	highReplication := 0
-	for _, count := range levelsMap {
-		if count < m.cfg.MinReplication {
-			lowReplication++
-		} else if count > m.cfg.MaxReplication {
-			highReplication++
-		}
-	}
-
-	m.mu.Lock()
-	m.replicationDistribution = distribution
-	m.avgReplicationLevel = avgReplication
-	m.filesAtTargetReplication = filesAtTarget
-	m.lowReplicationFiles = lowReplication
-	m.highReplicationFiles = highReplication
-	m.mu.Unlock()
-
-	slog.Debug("metrics report: storage", "pinned", m.pinnedFilesCount, "known", m.knownFilesCount)
-	slog.Debug("metrics report: replication",
-		"checks", m.replicationChecks, "success", m.replicationSuccess, "failures", m.replicationFailures,
-		"low", lowReplication, "high", highReplication, "at_target", filesAtTarget)
-	slog.Debug("metrics report: replication distribution",
-		"r0", distribution[0], "r1", distribution[1], "r2", distribution[2],
-		"r3", distribution[3], "r4", distribution[4], "r5", distribution[5],
-		"r6", distribution[6], "r7", distribution[7], "r8", distribution[8],
-		"r9", distribution[9], "r10_plus", distribution[10])
-	slog.Debug("metrics report: convergence",
-		"avg_replication", avgReplication, "converged_total", m.filesConvergedTotal, "converged_this_period", m.filesConvergedThisPeriod)
-	slog.Debug("metrics report: network",
-		"msg_rate_per_min", msgRate, "drop_rate_per_min", dropRate, "active_peers", activePeers)
-	slog.Debug("metrics report: system",
-		"shard_splits", m.shardSplits, "current_shard", shardID, "rate_limited_peers", rateLimitedPeers, "files_in_backoff", backoffCount)
-	if m.clusterInfo != nil {
-		pins, peers, allocs, err := m.clusterInfo.GetClusterMetrics(context.Background())
-		if err == nil {
-			for shard := range pins {
-				slog.Debug("metrics report: cluster shard",
-					"shard", shard, "pins", pins[shard], "peers", peers[shard], "allocations_total", allocs[shard])
-			}
-		}
-	}
-	uptime := now.Sub(m.startTime)
-	slog.Debug("metrics report: cumulative",
-		"uptime", uptime.Round(time.Second),
-		"msgs", m.cumulativeMessagesReceived, "dropped", m.cumulativeMessagesDropped,
-		"checks", m.cumulativeReplicationChecks, "success", m.cumulativeReplicationSuccess,
-		"failures", m.cumulativeReplicationFailures, "shard_splits", m.cumulativeShardSplits)
-
-	if m.cfg.MetricsExportPath != "" {
-		m.ExportMetricsToFile(now)
-	}
-
-	m.mu.Lock()
-	m.lastReportTime = now
-	m.messagesReceived = 0
-	m.messagesDropped = 0
-	m.replicationChecks = 0
-	m.replicationSuccess = 0
-	m.replicationFailures = 0
-	m.filesConvergedThisPeriod = 0
-	m.mu.Unlock()
-}
-
-func (m *MetricsManager) GetStatus() common.StatusResponse {
-	m.mu.RLock()
-	pinned := m.pinnedFilesCount
-	known := m.knownFilesCount
-	startTime := m.startTime
-	avgRepl := m.avgReplicationLevel
-	atTarget := m.filesAtTargetReplication
-	dist := m.replicationDistribution
-	m.mu.RUnlock()
-
-	shardID := ""
-	peers := 0
-	if m.shardInfo != nil {
-		shardID, peers = m.shardInfo.GetShardInfo()
-	}
-
-	activeWorkers := 0
-	queueDepth := 0
-
-	var knownCIDs []string
-	if m.storageInfo != nil && m.cfg.TelemetryIncludeCIDs {
-		knownCIDs = m.storageInfo.GetStorageStatus().KnownCIDs
-	}
-
-	m.mu.RLock()
-	peerID := m.peerID
-	m.mu.RUnlock()
-
-	return common.StatusResponse{
-		PeerID:       peerID,
-		Version:      "1.0.0",
-		CurrentShard: shardID,
-		PeersInShard: peers,
-		Storage: common.StorageStatus{
-			PinnedFiles: pinned,
-			KnownFiles:  known,
-			KnownCIDs:   knownCIDs,
-		},
-		Replication: common.ReplicationStatus{
-			QueueDepth:              queueDepth,
-			ActiveWorkers:           activeWorkers,
-			AvgReplicationLevel:     avgRepl,
-			FilesAtTarget:           atTarget,
-			ReplicationDistribution: dist,
-		},
-		UptimeSeconds: time.Since(startTime).Seconds(),
-	}
-}
-
-func (m *MetricsManager) ExportMetricsToFile(timestamp time.Time) {
-	path := m.cfg.MetricsExportPath
-	if path == "" {
-		return
-	}
-
-	dir := filepath.Dir(path)
-	if err := os.MkdirAll(dir, 0755); err != nil {
-		slog.Error("failed to create metrics export directory", "error", err)
-		return
-	}
-
-	file, err := os.OpenFile(path, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
-	if err != nil {
-		slog.Error("failed to open metrics export file", "error", err)
-		return
-	}
-	defer file.Close()
-
-	writer := csv.NewWriter(file)
-	defer writer.Flush()
-
-	m.mu.RLock()
-	defer m.mu.RUnlock()
-
-	uptime := timestamp.Sub(m.startTime).Seconds()
-	record := []string{
-		timestamp.Format(time.RFC3339),
-		fmt.Sprintf("%.2f", uptime),
-		strconv.Itoa(m.pinnedFilesCount),
-		strconv.Itoa(m.knownFilesCount),
-	}
-	writer.Write(record)
-}
diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go
deleted file mode 100644
index 25183fd..0000000
--- a/internal/telemetry/telemetry.go
+++ /dev/null
@@ -1,88 +0,0 @@
-package telemetry
-
-import (
-	"context"
-	"log/slog"
-	"time"
-
-	"dlockss/internal/config"
-
-	pubsub "github.com/libp2p/go-libp2p-pubsub"
-	"github.com/libp2p/go-libp2p/core/host"
-)
-
-// No dedicated telemetry topic - nodes broadcast on their shard topics
-
-type TelemetryClient struct {
-	cfg       *config.Config
-	host      host.Host
-	ps        *pubsub.PubSub
-	metrics   *MetricsManager
-	shardInfo ShardInfoProvider
-	publisher ShardPublisher // Interface to publish to shard
-}
-
-type ShardPublisher interface {
-	PublishToShardCBOR(data []byte, shardID string)
-}
-
-func NewTelemetryClient(cfg *config.Config, h host.Host, ps *pubsub.PubSub, metrics *MetricsManager) *TelemetryClient {
-	if ps == nil {
-		slog.Warn("pubsub not available, telemetry disabled")
-		return nil
-	}
-
-	tc := &TelemetryClient{
-		cfg:     cfg,
-		host:    h,
-		ps:      ps,
-		metrics: metrics,
-	}
-
-	slog.Info("telemetry client initialized")
-
-	return tc
-}
-
-// SetShardPublisher sets the shard publisher (called after initialization to break cycle)
-func (tc *TelemetryClient) SetShardPublisher(sp ShardPublisher, sip ShardInfoProvider) {
-	tc.publisher = sp
-	tc.shardInfo = sip
-}
-
-func (tc *TelemetryClient) Start(ctx context.Context) {
-	slog.Info("starting telemetry client")
-	go tc.runLoop(ctx)
-}
-
-func (tc *TelemetryClient) runLoop(ctx context.Context) {
-	ticker := time.NewTicker(tc.cfg.TelemetryInterval)
-	defer ticker.Stop()
-
-	// Try to discover monitor immediately on startup
-	tc.pushTelemetry()
-
-	for {
-		select {
-		case <-ctx.Done():
-			return
-		case <-ticker.C:
-			tc.pushTelemetry()
-		}
-	}
-}
-
-// pushTelemetry gathers and logs local node status.
-// Publishing over pubsub is disabled: JSON telemetry causes CBOR decoding
-// errors on other nodes and is not currently parsed by the monitor.
-func (tc *TelemetryClient) pushTelemetry() {
-	if tc.metrics == nil {
-		return
-	}
-
-	status := tc.metrics.GetStatus()
-
-	slog.Debug("sending telemetry status",
-		"pinned", status.Storage.PinnedFiles, "known", status.Storage.KnownFiles,
-		"shard", status.CurrentShard, "peers", status.PeersInShard)
-}
diff --git a/internal/testutil/mocks.go b/internal/testutil/mocks.go
index e3c13ba..4b83c83 100644
--- a/internal/testutil/mocks.go
+++ b/internal/testutil/mocks.go
@@ -4,13 +4,11 @@ package testutil
 
 import (
 	"context"
-	"io"
 	"testing"
 
 	"github.com/ipfs/go-cid"
 	"github.com/libp2p/go-libp2p/core/peer"
 	"github.com/libp2p/go-libp2p/core/routing"
-	"github.com/multiformats/go-multiaddr"
 	"github.com/multiformats/go-multihash"
 )
 
@@ -20,21 +18,15 @@ type MockIPFSClient struct{}
 func (m *MockIPFSClient) ImportFile(ctx context.Context, filePath string) (cid.Cid, error) {
 	return cid.Cid{}, nil
 }
-func (m *MockIPFSClient) ImportReader(ctx context.Context, reader io.Reader) (cid.Cid, error) {
-	return cid.Cid{}, nil
-}
 func (m *MockIPFSClient) PutDagCBOR(ctx context.Context, block []byte) (cid.Cid, error) {
 	return cid.Cid{}, nil
 }
 func (m *MockIPFSClient) GetBlock(ctx context.Context, blockCID cid.Cid) ([]byte, error) {
 	return nil, nil
 }
-func (m *MockIPFSClient) PinRecursive(ctx context.Context, c cid.Cid) error          { return nil }
-func (m *MockIPFSClient) UnpinRecursive(ctx context.Context, c cid.Cid) error        { return nil }
-func (m *MockIPFSClient) IsPinned(ctx context.Context, c cid.Cid) (bool, error)      { return false, nil }
-func (m *MockIPFSClient) GetFileSize(ctx context.Context, c cid.Cid) (uint64, error) { return 0, nil }
-func (m *MockIPFSClient) GetPeerID(ctx context.Context) (string, error)              { return "mock-peer-id", nil }
-func (m *MockIPFSClient) SwarmConnect(ctx context.Context, addrs []string) error     { return nil }
+func (m *MockIPFSClient) PinRecursive(ctx context.Context, c cid.Cid) error     { return nil }
+func (m *MockIPFSClient) UnpinRecursive(ctx context.Context, c cid.Cid) error   { return nil }
+func (m *MockIPFSClient) IsPinned(ctx context.Context, c cid.Cid) (bool, error) { return false, nil }
 
 // MockDHTProvider is a no-op implementation of common.DHTProvider that also
 // satisfies routing.Routing.
@@ -63,7 +55,7 @@ var _ routing.Routing = (*MockDHTProvider)(nil)
 // MockClusterManager is a no-op implementation of clusters.ClusterManagerInterface.
 type MockClusterManager struct{}
 
-func (m *MockClusterManager) JoinShard(ctx context.Context, shardID string, bootstrapPeers []multiaddr.Multiaddr) error {
+func (m *MockClusterManager) JoinShard(ctx context.Context, shardID string) error {
 	return nil
 }
 func (m *MockClusterManager) LeaveShard(shardID string) error { return nil }
diff --git a/internal/trust/trust.go b/internal/trust/trust.go
index ef96cc6..2fa150d 100644
--- a/internal/trust/trust.go
+++ b/internal/trust/trust.go
@@ -57,10 +57,6 @@ func (tm *TrustManager) LoadTrustedPeers(path string) error {
 	return nil
 }
 
-func (tm *TrustManager) IsPeerTrusted(peerID peer.ID) bool {
-	return tm.trustedPeers.Has(peerID)
-}
-
 func (tm *TrustManager) GetTrustedPeers() []peer.ID {
 	return tm.trustedPeers.All()
 }
@@ -75,19 +71,7 @@ func (tm *TrustManager) AuthorizeIncomingSender(receivedFrom peer.ID, peerID pee
 	if receivedFrom != "" && peerID != receivedFrom {
 		return fmt.Errorf("sender_id mismatch: sender_id=%s received_from=%s", peerID.String(), receivedFrom.String())
 	}
-	if tm.trustMode == "allowlist" && !tm.IsPeerTrusted(peerID) {
-		return fmt.Errorf("sender not trusted: %s", peerID.String())
-	}
-	return nil
-}
-
-// AuthorizePeer enforces allowlist-only mode for contexts where there is no ReceivedFrom
-// (e.g., verifying a stored ResearchObject manifest).
-func (tm *TrustManager) AuthorizePeer(peerID peer.ID) error {
-	if peerID == "" {
-		return fmt.Errorf("missing sender_id")
-	}
-	if tm.trustMode == "allowlist" && !tm.IsPeerTrusted(peerID) {
+	if tm.trustMode == "allowlist" && !tm.trustedPeers.Has(peerID) {
 		return fmt.Errorf("sender not trusted: %s", peerID.String())
 	}
 	return nil
diff --git a/pkg/ipfs/client.go b/pkg/ipfs/client.go
index 7abf675..00cfd91 100644
--- a/pkg/ipfs/client.go
+++ b/pkg/ipfs/client.go
@@ -3,7 +3,6 @@ package ipfs
 import (
 	"context"
 	"fmt"
-	"io"
 	"os"
 	"strings"
 
@@ -14,15 +13,11 @@ import (
 // IPFSClient is the interface for IPFS node operations.
 type IPFSClient interface {
 	ImportFile(ctx context.Context, filePath string) (cid.Cid, error)
-	ImportReader(ctx context.Context, reader io.Reader) (cid.Cid, error)
 	PutDagCBOR(ctx context.Context, block []byte) (cid.Cid, error)
 	GetBlock(ctx context.Context, blockCID cid.Cid) ([]byte, error)
 	PinRecursive(ctx context.Context, cid cid.Cid) error
 	UnpinRecursive(ctx context.Context, cid cid.Cid) error
 	IsPinned(ctx context.Context, cid cid.Cid) (bool, error)
-	GetFileSize(ctx context.Context, cid cid.Cid) (uint64, error)
-	GetPeerID(ctx context.Context) (string, error)
-	SwarmConnect(ctx context.Context, addrs []string) error
 }
 
 // Client wraps IPFS API operations for importing files and managing pins.
@@ -75,21 +70,6 @@ func (c *Client) ImportFile(ctx context.Context, filePath string) (cid.Cid, erro
 	return parsedCID, nil
 }
 
-// ImportReader imports from io.Reader into IPFS as UnixFS. ctx unused (go-ipfs-api Add has no cancel).
-func (c *Client) ImportReader(ctx context.Context, reader io.Reader) (cid.Cid, error) {
-	ipfsPath, err := c.api.Add(reader, ipfsapi.Pin(true))
-	if err != nil {
-		return cid.Cid{}, fmt.Errorf("failed to import data to IPFS: %w", err)
-	}
-
-	parsedCID, err := cid.Decode(ipfsPath)
-	if err != nil {
-		return cid.Cid{}, fmt.Errorf("failed to parse CID from IPFS path: %w", err)
-	}
-
-	return parsedCID, nil
-}
-
 // PutDagCBOR stores a dag-cbor block (e.g. ResearchObject). ctx unused (BlockPut has no cancel).
 func (c *Client) PutDagCBOR(ctx context.Context, block []byte) (cid.Cid, error) {
 	_ = ctx
@@ -136,24 +116,3 @@ func (c *Client) IsPinned(ctx context.Context, cid cid.Cid) (bool, error) {
 	_, ok := raw.Keys[cid.String()]
 	return ok, nil
 }
-
-func (c *Client) GetFileSize(ctx context.Context, cid cid.Cid) (uint64, error) {
-	stat, err := c.api.FilesStat(ctx, "/ipfs/"+cid.String())
-	if err != nil {
-		return 0, fmt.Errorf("failed to stat file: %w", err)
-	}
-
-	return uint64(stat.Size), nil
-}
-
-func (c *Client) GetPeerID(ctx context.Context) (string, error) {
-	id, err := c.api.ID()
-	if err != nil {
-		return "", fmt.Errorf("failed to get IPFS peer ID: %w", err)
-	}
-	return id.ID, nil
-}
-
-func (c *Client) SwarmConnect(ctx context.Context, addrs []string) error {
-	return c.api.SwarmConnect(ctx, addrs...)
-}
diff --git a/pkg/ipfs/dht_adapter.go b/pkg/ipfs/dht_adapter.go
index 8e3c621..6aba291 100644
--- a/pkg/ipfs/dht_adapter.go
+++ b/pkg/ipfs/dht_adapter.go
@@ -27,17 +27,15 @@ type provideRequest struct {
 // IPFSDHTAdapter implements DHTProvider using IPFS's DHT via HTTP API
 // It also implements routing.Routing to be compatible with libp2p/ipfs-cluster.
 type IPFSDHTAdapter struct {
-	api             *ipfsapi.Shell
-	retryAttempts   int
-	retryDelay      time.Duration
-	provideTimeout  time.Duration // Timeout for provide operations
-	provideInterval time.Duration // Min delay between provide ops (0 = no delay) to avoid overwhelming the DHT
-	provideQueue    chan *provideRequest
-	workerCtx       context.Context
-	workerCancel    context.CancelFunc
-	workerStarted   bool
-	workerMu        sync.Mutex
-	intervalMu      sync.RWMutex
+	api            *ipfsapi.Shell
+	retryAttempts  int
+	retryDelay     time.Duration
+	provideTimeout time.Duration // Timeout for provide operations
+	provideQueue   chan *provideRequest
+	workerCtx      context.Context
+	workerCancel   context.CancelFunc
+	workerStarted  bool
+	workerMu       sync.Mutex
 }
 
 var _ routing.Routing = (*IPFSDHTAdapter)(nil)
@@ -45,39 +43,13 @@ var _ routing.Routing = (*IPFSDHTAdapter)(nil)
 // NewIPFSDHTAdapterFromClient creates a DHT adapter from an ipfs.Client,
 // avoiding the need for callers to access the raw Shell.
 func NewIPFSDHTAdapterFromClient(c *Client) *IPFSDHTAdapter {
-	return NewIPFSDHTAdapter(c.shell())
-}
-
-// NewIPFSDHTAdapter creates a new DHT adapter that uses IPFS's DHT
-func NewIPFSDHTAdapter(api *ipfsapi.Shell) *IPFSDHTAdapter {
-	ctx, cancel := context.WithCancel(context.Background())
-	adapter := &IPFSDHTAdapter{
-		api:            api,
-		retryAttempts:  3,                               // Default retry attempts
-		retryDelay:     500 * time.Millisecond,          // Default retry delay
-		provideTimeout: 60 * time.Second,                // Default timeout
-		provideQueue:   make(chan *provideRequest, 100), // Buffer up to 100 queued operations
-		workerCtx:      ctx,
-		workerCancel:   cancel,
-	}
-	adapter.startWorker()
-	return adapter
-}
-
-// NewIPFSDHTAdapterWithRetry creates a new DHT adapter with custom retry configuration
-func NewIPFSDHTAdapterWithRetry(api *ipfsapi.Shell, retryAttempts int, retryDelay time.Duration) *IPFSDHTAdapter {
-	return NewIPFSDHTAdapterWithTimeout(api, retryAttempts, retryDelay, 60*time.Second)
-}
-
-// NewIPFSDHTAdapterWithTimeout creates a new DHT adapter with custom retry and timeout configuration
-func NewIPFSDHTAdapterWithTimeout(api *ipfsapi.Shell, retryAttempts int, retryDelay time.Duration, provideTimeout time.Duration) *IPFSDHTAdapter {
 	ctx, cancel := context.WithCancel(context.Background())
 	adapter := &IPFSDHTAdapter{
-		api:            api,
-		retryAttempts:  retryAttempts,
-		retryDelay:     retryDelay,
-		provideTimeout: provideTimeout,
-		provideQueue:   make(chan *provideRequest, 100), // Buffer up to 100 queued operations
+		api:            c.shell(),
+		retryAttempts:  3,
+		retryDelay:     500 * time.Millisecond,
+		provideTimeout: 60 * time.Second,
+		provideQueue:   make(chan *provideRequest, 100),
 		workerCtx:      ctx,
 		workerCancel:   cancel,
 	}
@@ -85,14 +57,6 @@ func NewIPFSDHTAdapterWithTimeout(api *ipfsapi.Shell, retryAttempts int, retryDe
 	return adapter
 }
 
-// SetProvideInterval sets the minimum delay between processing provide operations.
-// Use > 0 (e.g. 3s) to avoid overwhelming the DHT when many files are announced.
-func (a *IPFSDHTAdapter) SetProvideInterval(d time.Duration) {
-	a.intervalMu.Lock()
-	defer a.intervalMu.Unlock()
-	a.provideInterval = d
-}
-
 // startWorker starts the worker goroutine that processes provide operations one at a time
 func (a *IPFSDHTAdapter) startWorker() {
 	a.workerMu.Lock()
@@ -114,17 +78,6 @@ func (a *IPFSDHTAdapter) worker() {
 			return
 		case req := <-a.provideQueue:
 			a.processProvideRequest(req)
-
-			a.intervalMu.RLock()
-			interval := a.provideInterval
-			a.intervalMu.RUnlock()
-			if interval > 0 {
-				select {
-				case <-a.workerCtx.Done():
-					return
-				case <-time.After(interval):
-				}
-			}
 		}
 	}
 }
@@ -152,11 +105,6 @@ func (a *IPFSDHTAdapter) processProvideRequest(req *provideRequest) {
 	}
 }
 
-// Close shuts down the worker goroutine
-func (a *IPFSDHTAdapter) Close() {
-	a.workerCancel()
-}
-
 // FindProvidersAsync finds providers of a CID using IPFS DHT
 func (a *IPFSDHTAdapter) FindProvidersAsync(ctx context.Context, key cid.Cid, count int) <-chan peer.AddrInfo {
 	ch := make(chan peer.AddrInfo, 10)
diff --git a/pkg/schema/messages.go b/pkg/schema/messages.go
index f464038..548238b 100644
--- a/pkg/schema/messages.go
+++ b/pkg/schema/messages.go
@@ -12,7 +12,6 @@ type MessageType uint8
 const (
 	MessageTypeIngest MessageType = iota + 1
 	MessageTypeReplicationRequest
-	MessageTypeUnreplicateRequest
 )
 
 // SignedEnvelope holds the fields common to every signed protocol message:
@@ -88,8 +87,7 @@ type Signable interface {
 // IngestMessage announces a new ResearchObject for ingestion.
 type IngestMessage struct {
 	SignedEnvelope
-	ShardID  string `cbor:"shard_id"`  // Target shard prefix
-	HintSize uint64 `cbor:"hint_size"` // Total size in bytes
+	ShardID string `cbor:"shard_id"` // Target shard prefix
 }
 
 func (m *IngestMessage) GetEnvelope() *SignedEnvelope { return &m.SignedEnvelope }
@@ -97,23 +95,10 @@ func (m *IngestMessage) GetEnvelope() *SignedEnvelope { return &m.SignedEnvelope
 // ReplicationRequest asks peers to replicate a ResearchObject.
 type ReplicationRequest struct {
 	SignedEnvelope
-	Priority uint8 `cbor:"priority"` // 0=Low, 1=High
-	Deadline int64 `cbor:"deadline"` // Unix timestamp deadline (0 = no deadline)
 }
 
 func (m *ReplicationRequest) GetEnvelope() *SignedEnvelope { return &m.SignedEnvelope }
 
-// UnreplicateRequest asks peers to drop over-replicated files.
-// Peers use deterministic selection (hash of ManifestCID + PeerID) to decide
-// whether to drop, ensuring distributed consensus without coordination.
-type UnreplicateRequest struct {
-	SignedEnvelope
-	ExcessCount  int `cbor:"excess_count"`  // How many replicas to drop
-	CurrentCount int `cbor:"current_count"` // Current replication count
-}
-
-func (m *UnreplicateRequest) GetEnvelope() *SignedEnvelope { return &m.SignedEnvelope }
-
 // marshalFields builds a CBOR map from envelope prefix + message-specific + envelope suffix fields.
 func marshalFields(env *SignedEnvelope, specific []cborKV, includeSig bool) ([]byte, error) {
 	fields := env.prefixFields()
@@ -135,7 +120,6 @@ func (m *IngestMessage) MarshalCBORForSigning() ([]byte, error) {
 func (m *IngestMessage) specificFields() []cborKV {
 	return []cborKV{
 		{"shard_id", m.ShardID},
-		{"hint_size", int64(m.HintSize)},
 	}
 }
 
@@ -151,11 +135,6 @@ func (m *IngestMessage) UnmarshalCBOR(data []byte) error {
 	if err != nil {
 		return err
 	}
-	sizeInt, err := readInt(node, "hint_size")
-	if err != nil {
-		return err
-	}
-	m.HintSize = uint64(sizeInt)
 	return nil
 }
 
@@ -170,10 +149,7 @@ func (m *ReplicationRequest) MarshalCBORForSigning() ([]byte, error) {
 }
 
 func (m *ReplicationRequest) specificFields() []cborKV {
-	return []cborKV{
-		{"priority", int64(m.Priority)},
-		{"deadline", m.Deadline},
-	}
+	return nil
 }
 
 func (m *ReplicationRequest) UnmarshalCBOR(data []byte) error {
@@ -184,14 +160,5 @@ func (m *ReplicationRequest) UnmarshalCBOR(data []byte) error {
 	if err := m.unmarshalEnvelope(node); err != nil {
 		return err
 	}
-	priorityInt, err := readInt(node, "priority")
-	if err != nil {
-		return err
-	}
-	m.Priority = uint8(priorityInt)
-	m.Deadline, err = readInt(node, "deadline")
-	if err != nil {
-		return err
-	}
 	return nil
 }
diff --git a/pkg/schema/messages_unreplicate.go b/pkg/schema/messages_unreplicate.go
deleted file mode 100644
index df0a1b4..0000000
--- a/pkg/schema/messages_unreplicate.go
+++ /dev/null
@@ -1,39 +0,0 @@
-package schema
-
-// --- UnreplicateRequest CBOR ---
-
-func (m *UnreplicateRequest) MarshalCBOR() ([]byte, error) {
-	return marshalFields(&m.SignedEnvelope, m.specificFields(), true)
-}
-
-func (m *UnreplicateRequest) MarshalCBORForSigning() ([]byte, error) {
-	return marshalFields(&m.SignedEnvelope, m.specificFields(), false)
-}
-
-func (m *UnreplicateRequest) specificFields() []cborKV {
-	return []cborKV{
-		{"excess_count", int64(m.ExcessCount)},
-		{"current_count", int64(m.CurrentCount)},
-	}
-}
-
-func (m *UnreplicateRequest) UnmarshalCBOR(data []byte) error {
-	node, err := decodeCBORMap(data)
-	if err != nil {
-		return err
-	}
-	if err := m.unmarshalEnvelope(node); err != nil {
-		return err
-	}
-	excessInt, err := readInt(node, "excess_count")
-	if err != nil {
-		return err
-	}
-	m.ExcessCount = int(excessInt)
-	currentInt, err := readInt(node, "current_count")
-	if err != nil {
-		return err
-	}
-	m.CurrentCount = int(currentInt)
-	return nil
-}
diff --git a/testnet/run_testnet.sh b/testnet/run_testnet.sh
index 0194891..8a00e65 100755
--- a/testnet/run_testnet.sh
+++ b/testnet/run_testnet.sh
@@ -199,7 +199,6 @@ if [ ! -f "$IPFS_REPO/config" ]; then
     DLOCKSS_IPFS_CONFIG="$IPFS_REPO_ABS/config" \
     DLOCKSS_NODE_NAME="testnet_$i" \
     DLOCKSS_TOPIC_NAME="cc" \
-    DLOCKSS_METRICS_EXPORT="metrics.csv" \
     DLOCKSS_IPFS_NODE="/ip4/127.0.0.1/tcp/$IPFS_API_PORT" \
     DLOCKSS_API_PORT=$DLOCKSS_API_PORT \
     DLOCKSS_MAX_PEERS_PER_SHARD=12 \