diff --git a/docs/specs/grpc/context.json b/docs/specs/grpc/context.json new file mode 100644 index 0000000..5e84367 --- /dev/null +++ b/docs/specs/grpc/context.json @@ -0,0 +1,92 @@ +{ + "product_vision": { + "phase_1_goal": "Build a 3-node distributed Redis-compatible KV store with Raft consensus", + "networking_requirements": { + "protocol": "gRPC on port 7379 for internal Raft RPC between nodes", + "success_criteria": [ + "Leader election <2s", + "11 chaos tests passing", + ">5,000 ops/sec performance" + ] + } + }, + "existing_features": [ + { + "name": "openraft-migration", + "status": "In Design", + "relevance": "Phase 4 includes Network Stub Implementation - gRPC will replace StubNetwork" + }, + { + "name": "resp-protocol", + "status": "Complete (100%, 487 tests)", + "relevance": "Handles client-facing Redis protocol, separate from internal gRPC" + }, + { + "name": "kv-service", + "status": "Specified, not started", + "relevance": "Will use raft crate for consensus operations" + } + ], + "architecture": { + "raft_crate_role": "Consensus + Transport Layer - wraps openraft with gRPC server/client", + "grpc_integration": { + "description": "Implements RaftNetwork trait for inter-node communication", + "location": "crates/raft/, protos in crates/raft/proto/", + "replaces": "StubNetwork placeholder with OpenRaftNetwork implementation", + "trait": "openraft::RaftNetwork", + "server_port": 7379, + "rpc_types": ["RequestVote", "AppendEntries", "InstallSnapshot"] + }, + "dependencies": { + "upstream": ["storage crate", "common crate"], + "downstream": ["kv service", "seshat binary"], + "external": [ + "openraft 0.10+", + "tonic 0.11+", + "prost 0.14+ (MUST match tonic version)" + ] + } + }, + "standards": { + "tech_stack": { + "rust_version": "1.90+", + "async_runtime": "tokio", + "grpc": "tonic 0.11+ / prost 0.14+", + "serialization": "Protocol Buffers", + "errors": "thiserror", + "logging": "tracing" + }, + "key_requirements": [ + "5s connection/request timeout", + "TDD workflow mandatory", + "Single prost version (0.14) - CRITICAL for compatibility", + "Async-first design" + ], + "testing_approach": { + "unit": "protobuf serialization, client/server logic", + "integration": "2-node message exchange", + "chaos": "11 scenarios (network partitions, failures)" + } + }, + "implementation_guidance": { + "relationship_to_openraft": "OpenRaft migration defines StubNetwork placeholder, gRPC spec replaces stub with full tonic/prost implementation", + "key_design_decisions": [ + "Connection pooling with HashMap", + "Retry logic with exponential backoff (max 3 attempts)", + "DNS-based peer discovery (hostname:port)", + "Comprehensive error mapping (tonic::Status → RaftError)" + ], + "file_organization": { + "proto_files": "crates/raft/proto/*.proto", + "server": "crates/raft/src/grpc_server.rs", + "client": "crates/raft/src/grpc_client.rs", + "network": "crates/raft/src/network.rs (replaces network_stub.rs)" + } + }, + "reference_documents": [ + "docs/specs/openraft/design.md - OpenRaft migration design", + "docs/standards/tech.md - gRPC standards", + "docs/architecture/crates.md - Raft crate architecture" + ], + "next_steps": "Ready for /spec:design grpc to generate technical design" +} diff --git a/docs/specs/grpc/design.json b/docs/specs/grpc/design.json new file mode 100644 index 0000000..e1a59e6 --- /dev/null +++ b/docs/specs/grpc/design.json @@ -0,0 +1,458 @@ +{ + "feature": "grpc", + "overview": { + "description": "gRPC network layer that replaces StubNetwork placeholder for OpenRaft consensus", + "current_state": "StubNetwork placeholder exists (from OpenRaft Phase 4)", + "target_state": "Production-ready gRPC transport with connection pooling and retry logic", + "priority": "HIGH (blocks 3-node cluster testing)", + "estimated_effort_hours": "15-22" + }, + "design": { + "domain_model": { + "entities": [ + { + "name": "OpenRaftNetwork", + "file": "crates/raft/src/network.rs", + "implements": "openraft::RaftNetwork", + "responsibilities": [ + "Implements OpenRaft's RaftNetwork trait", + "Manages connection pool to peer nodes", + "Coordinates retry logic for transient failures", + "Routes Raft RPC calls to appropriate gRPC clients", + "Converts gRPC errors to OpenRaft errors" + ], + "methods": [ + "send_append_entries(target: u64, req: AppendEntriesRequest) -> Result", + "send_vote(target: u64, req: VoteRequest) -> Result", + "send_install_snapshot(target: u64, req: InstallSnapshotRequest) -> Result", + "get_or_create_client(node_id: u64) -> Result" + ], + "dependencies": ["ConnectionPool", "RaftGrpcClient", "common::NodeId"] + }, + { + "name": "RaftGrpcServer", + "file": "crates/raft/src/grpc_server.rs", + "implements": "raft_service_server::RaftService (generated from protobuf)", + "responsibilities": [ + "Receives incoming Raft RPCs from peer nodes", + "Deserializes protobuf messages to OpenRaft types", + "Delegates to local RaftNode for processing", + "Serializes responses back to protobuf", + "Handles streaming for large snapshot transfers" + ], + "methods": [ + "request_vote(&self, request: tonic::Request) -> Result>", + "append_entries(&self, request: tonic::Request) -> Result>", + "install_snapshot(&self, request: tonic::Request>) -> Result>" + ], + "dependencies": ["Arc", "protobuf conversion utilities"] + }, + { + "name": "RaftGrpcClient", + "file": "crates/raft/src/grpc_client.rs", + "wraps": "tonic-generated gRPC client for a single peer", + "responsibilities": [ + "Implements retry logic with exponential backoff", + "Handles timeout enforcement per RPC type", + "Converts protobuf types to/from OpenRaft types", + "Manages streaming for snapshot installation" + ], + "methods": [ + "call_vote(&mut self, req: VoteRequest) -> Result", + "call_append_entries(&mut self, req: AppendEntriesRequest) -> Result", + "call_install_snapshot(&mut self, req: InstallSnapshotRequest) -> Result", + "is_connected(&self) -> bool", + "reconnect(&mut self) -> Result<()>" + ], + "dependencies": ["tonic::transport::Channel", "RaftServiceClient"] + }, + { + "name": "ConnectionPool", + "file": "crates/raft/src/connection_pool.rs", + "data_structure": "HashMap wrapped in RwLock", + "responsibilities": [ + "Manages lazy connection establishment to peer nodes", + "Caches active gRPC channels per node_id", + "Handles connection health monitoring", + "Provides thread-safe access via RwLock", + "Evicts stale/failed connections" + ], + "methods": [ + "get_or_connect(node_id: u64, addr: String) -> Result", + "remove(node_id: u64)", + "health_check_all() -> Vec<(u64, bool)>" + ], + "dependencies": ["tonic::transport::Channel", "common::NodeId"] + } + ], + "services": { + "connection_management": { + "description": "ConnectionPool manages peer connections with lazy initialization", + "strategy": "Connections established on first RPC call (not at startup)", + "failure_handling": "Failed connections removed from pool and retried on next call" + }, + "retry_coordination": { + "description": "RaftGrpcClient handles retry logic per RPC type", + "vote_rpcs": "No retry (election timeouts handle failures)", + "append_entries_rpcs": "Retry 3 times with exponential backoff (50ms, 100ms, 200ms)", + "install_snapshot_rpcs": "Retry 5 times with backoff (network instability tolerance)", + "transient_errors": "Trigger retries (Unavailable, DeadlineExceeded)", + "permanent_errors": "Fail immediately (InvalidArgument, NotFound)" + }, + "error_conversion": { + "description": "NetworkError enum provides bidirectional conversions", + "chain": "tonic::Status → NetworkError → openraft::RPCError", + "context_preservation": "Preserves node_id, RPC type, attempt count for debugging" + } + } + }, + "database": { + "persistence": "In-memory only (no database)", + "storage_type": "In-memory connection pool only", + "justification": "All persistent state (log, snapshots) managed by storage crate via OpenRaft", + "data_volume": "Small - connection metadata per peer (max 10 connections in Phase 1)", + "access_patterns": { + "read_heavy": "Lookup peer connection from pool on every RPC", + "write_rare": "Connection establishment only on first call or after failure" + }, + "connection_metadata": { + "ephemeral": true, + "fields": ["node_id", "addr", "channel", "client", "established_at", "last_used"] + } + }, + "endpoints": { + "grpc_service": { + "protocol": "gRPC", + "port": 7379, + "transport": "HTTP/2 (gRPC requirement)", + "tls": "Optional (Phase 4)", + "service_definition": "crates/raft/proto/raft.proto", + "rpcs": [ + { + "name": "RequestVote", + "type": "unary", + "timeout_ms": 500, + "retry": false, + "purpose": "Leader election vote request" + }, + { + "name": "AppendEntries", + "type": "unary", + "timeout_ms": 1000, + "retry": true, + "max_retries": 3, + "purpose": "Log replication and heartbeat" + }, + { + "name": "InstallSnapshot", + "type": "streaming", + "timeout_ms": 30000, + "retry": true, + "max_retries": 5, + "chunk_size_kb": 64, + "purpose": "Snapshot transfer for log compaction" + } + ] + }, + "server_lifecycle": [ + "seshat binary starts RaftGrpcServer on port 7379", + "Server binds to 0.0.0.0:7379 (all interfaces)", + "Server registers RaftService implementation", + "Incoming RPCs handled concurrently by tokio", + "Each RPC delegates to local RaftNode", + "Graceful shutdown on SIGTERM/SIGINT" + ] + }, + "events": { + "event_bus": "No event bus in Phase 1 (Raft is request-response)", + "async_patterns": [ + "All gRPC methods are async (tonic requirement)", + "ConnectionPool uses async RwLock (tokio::sync::RwLock)", + "Retry logic uses tokio::time::sleep for backoff", + "Streaming snapshots use tonic::Streaming" + ], + "future_events": "Phase 4 may add metrics events for observability" + }, + "dependencies": { + "upstream_crates": [ + { + "name": "openraft", + "purpose": "Provides RaftNetwork trait" + }, + { + "name": "common", + "purpose": "Provides NodeId, Error, configuration types" + } + ], + "downstream_crates": [ + { + "name": "seshat binary", + "purpose": "Starts gRPC server on port 7379" + }, + { + "name": "RaftNode", + "purpose": "Uses OpenRaftNetwork for consensus operations" + } + ], + "external_libraries": [ + { + "name": "tonic", + "version": "0.11+", + "purpose": "gRPC framework for Rust" + }, + { + "name": "prost", + "version": "0.14+", + "purpose": "Protobuf serialization (matches OpenRaft requirement)" + }, + { + "name": "tokio", + "version": "1.x", + "purpose": "Async runtime" + }, + { + "name": "tower", + "version": "0.4+", + "purpose": "Middleware for gRPC (timeouts, retries)" + }, + { + "name": "async-trait", + "version": "0.1", + "purpose": "Async trait support" + } + ] + }, + "error_handling": { + "error_types": [ + { + "name": "NetworkError", + "file": "crates/raft/src/network/error.rs", + "variants": [ + "ConnectionFailed { node_id, source: tonic::transport::Error }", + "Timeout { node_id, timeout_ms }", + "RpcFailed { node_id, status: tonic::Status }", + "Serialization(String)", + "Deserialization(String)", + "NodeNotFound { node_id }", + "MaxRetriesExceeded { node_id, max_retries }" + ] + } + ], + "error_propagation_chain": [ + "tonic::Status (gRPC error)", + "NetworkError (raft crate)", + "openraft::RPCError", + "RaftError (raft crate)", + "KvServiceError / SqlServiceError", + "RespValue::Error / SQL Error Response" + ], + "retry_semantics": { + "transient_errors": [ + "tonic::Code::Unavailable → Retry with backoff", + "tonic::Code::DeadlineExceeded → Retry with increased timeout", + "tonic::Code::ResourceExhausted → Retry with backoff", + "tonic::Code::Unknown → Retry once (may be transient)" + ], + "permanent_errors": [ + "tonic::Code::InvalidArgument → Fail immediately (bad request)", + "tonic::Code::NotFound → Fail immediately (wrong endpoint)", + "tonic::Code::PermissionDenied → Fail immediately (auth failure)", + "tonic::Code::Unimplemented → Fail immediately (version mismatch)" + ] + }, + "retry_config": { + "max_retries": { + "append_entries": 3, + "install_snapshot": 5 + }, + "initial_backoff_ms": 50, + "max_backoff_ms": 5000, + "backoff_multiplier": 2.0 + } + }, + "integration_points": { + "initialization_flow": { + "step_1": "Load configuration from config.toml", + "step_2": "Initialize storage (OpenRaftMemStorage)", + "step_3": "Create network layer (OpenRaftNetwork with cluster membership)", + "step_4": "Create Raft node (wraps openraft::Raft with network and storage)", + "step_5": "Start gRPC server (RaftGrpcServer on port 7379)", + "step_6": "Start RESP server (client-facing KV service on port 6379)" + }, + "raft_node_integration": { + "description": "RaftNode uses OpenRaftNetwork via OpenRaft's Raft instance", + "automatic_usage": "OpenRaft automatically uses network for RPCs when consensus operations occur" + } + }, + "performance": { + "connection_pooling": { + "strategy": "Lazy initialization (connections created on first RPC)", + "keep_alive": "HTTP/2 connection reuse (built-in with tonic)", + "timeouts": { + "request_vote_ms": 500, + "append_entries_ms": 1000, + "install_snapshot_ms": 30000 + } + }, + "streaming": { + "purpose": "InstallSnapshot uses streaming to avoid loading entire snapshot in memory", + "chunk_size_kb": 64, + "flow_control": "gRPC backpressure prevents buffer overflow", + "receiver_behavior": "Writes chunks directly to storage" + }, + "memory_management": { + "connection_pool_bound": "Bounded to cluster size (3-5 nodes in Phase 1)", + "no_unbounded_buffers": "No unbounded message buffers", + "streaming_advantage": "Prevents large snapshot memory spikes" + } + }, + "observability": { + "tracing": { + "library": "tracing crate for structured logging", + "instrumentation": [ + "All RaftNetwork methods (#[instrument] macro)", + "Connection establishment and failures", + "Retry attempts with backoff timings", + "Request/response latencies", + "Error occurrences with full context" + ], + "structured_fields": ["node_id", "operation", "term", "latency_ms", "retry_count"] + }, + "metrics_future_phase_4": [ + "raft_rpc_requests_total{type, status, target_node}", + "raft_rpc_duration_seconds{type, target_node}", + "raft_network_errors_total{type, error_code, target_node}", + "raft_connection_pool_size" + ] + }, + "testing_strategy": { + "unit_tests": [ + "Connection pool: concurrent get_or_connect", + "Retry logic: transient vs permanent error handling", + "Error conversions: tonic::Status → NetworkError → RPCError", + "Timeout enforcement per RPC type" + ], + "integration_tests": [ + "2-node cluster: leader election via gRPC", + "3-node cluster: log replication via AppendEntries", + "Snapshot transfer: streaming InstallSnapshot", + "Network partition simulation: connection failures" + ], + "chaos_tests": [ + "Network partition during election", + "Slow network (inject latency)", + "Connection failures mid-RPC", + "Snapshot transfer interruption" + ] + } + }, + "implementation_phases": [ + { + "phase": 1, + "name": "Protobuf Definitions", + "duration_hours": "1-2", + "tasks": [ + "Define raft.proto with RequestVote, AppendEntries, InstallSnapshot messages", + "Generate Rust code with prost-build", + "Create type conversions: OpenRaft types ↔ protobuf types" + ] + }, + { + "phase": 2, + "name": "RaftGrpcClient", + "duration_hours": "3-4", + "tasks": [ + "Implement client wrapper around generated tonic client", + "Add retry logic with exponential backoff", + "Implement timeout enforcement per RPC type", + "Unit tests for retry behavior" + ] + }, + { + "phase": 3, + "name": "ConnectionPool", + "duration_hours": "2-3", + "tasks": [ + "Implement connection pool with lazy initialization", + "Thread-safe access with RwLock or DashMap", + "Connection health tracking", + "Unit tests for concurrent access" + ] + }, + { + "phase": 4, + "name": "OpenRaftNetwork", + "duration_hours": "2-3", + "tasks": [ + "Implement RaftNetwork trait", + "Integrate ConnectionPool", + "Route RPCs to appropriate clients", + "Error conversion to OpenRaft types" + ] + }, + { + "phase": 5, + "name": "RaftGrpcServer", + "duration_hours": "3-4", + "tasks": [ + "Implement gRPC service trait", + "Delegate to local RaftNode", + "Handle streaming for snapshots", + "Integration tests with 2-node cluster" + ] + }, + { + "phase": 6, + "name": "Integration & Testing", + "duration_hours": "4-6", + "tasks": [ + "Wire into seshat binary", + "3-node cluster integration tests", + "Leader election via gRPC", + "Log replication validation", + "Snapshot transfer test" + ] + } + ], + "success_criteria": [ + "3-node cluster performs leader election via gRPC", + "Log replication works via AppendEntries RPCs", + "Snapshot transfer completes via streaming InstallSnapshot", + "Retry logic handles transient connection failures", + "Connection pool reuses connections efficiently", + "No prost version conflicts (all use 0.14)", + "All unit tests pass (coverage >80%)", + "Integration tests pass (2-node and 3-node clusters)" + ], + "future_enhancements": { + "phase_3": [ + "Dynamic cluster membership (add/remove nodes via Raft configuration changes)", + "Service discovery integration (replace static hostname:port with DNS-SD or Consul)", + "Connection health checks and automatic reconnection" + ], + "phase_4": [ + "TLS/mTLS for secure node-to-node communication", + "OpenTelemetry metrics (request latency, retry counts, connection pool size)", + "Prometheus endpoint for network metrics" + ], + "optimizations": [ + "gRPC streaming for large snapshot transfers (replace chunked InstallSnapshot)", + "Connection pooling with circuit breaker pattern for failing nodes", + "Adaptive timeout adjustment based on measured latencies", + "Compression for AppendEntries messages (reduce network bandwidth)" + ] + }, + "references": [ + "docs/specs/openraft/design.md - OpenRaft migration design (StubNetwork replacement)", + "docs/standards/tech.md - gRPC standards (tonic 0.11+, prost 0.14+)", + "docs/architecture/crates.md - Raft crate architecture (consensus + transport layer)", + "docs/standards/practices.md - TDD workflow and chaos testing requirements" + ], + "metadata": { + "created": "2025-10-26", + "feature": "grpc-network-layer", + "phase": "1B (Post-OpenRaft Migration)", + "priority": "HIGH", + "blocks": "3-node cluster testing" + } +} diff --git a/docs/specs/grpc/design.md b/docs/specs/grpc/design.md new file mode 100644 index 0000000..573ce62 --- /dev/null +++ b/docs/specs/grpc/design.md @@ -0,0 +1,540 @@ +# Technical Design: gRPC Network Layer for OpenRaft + +## Overview + +This design describes the gRPC network layer that replaces the `StubNetwork` placeholder created during the OpenRaft migration. The gRPC layer provides inter-node communication for Raft consensus, implementing the `RaftNetwork` trait with connection pooling, retry logic, and streaming support for snapshots. + +**Current State**: StubNetwork placeholder exists (from OpenRaft Phase 4) +**Target State**: Production-ready gRPC transport with connection pooling and retry logic + +## Technical Needs Analysis + +### 1. Domain Model + +The gRPC network layer's "domain" is distributed communication between Raft nodes. Core entities manage connections, message serialization, and error handling. + +#### Core Entities + +**OpenRaftNetwork (RaftNetwork Implementation)** +- **Responsibilities**: + - Implements OpenRaft's `RaftNetwork` trait + - Manages connection pool to peer nodes + - Coordinates retry logic for transient failures + - Routes Raft RPC calls to appropriate gRPC clients + - Converts gRPC errors to OpenRaft errors +- **Methods**: + - `send_append_entries(target: u64, req: AppendEntriesRequest) -> Result` + - `send_vote(target: u64, req: VoteRequest) -> Result` + - `send_install_snapshot(target: u64, req: InstallSnapshotRequest) -> Result` + - `get_or_create_client(node_id: u64) -> Result` (internal) +- **Dependencies**: ConnectionPool, RaftGrpcClient, common::NodeId + +**RaftGrpcServer** +- **Responsibilities**: + - Implements generated `raft_service_server::RaftService` trait from protobuf + - Receives incoming Raft RPCs from peer nodes + - Deserializes protobuf messages to OpenRaft types + - Delegates to local RaftNode for processing + - Serializes responses back to protobuf + - Handles streaming for large snapshot transfers +- **Methods**: + - `request_vote(&self, request: tonic::Request) -> Result>` + - `append_entries(&self, request: tonic::Request) -> Result>` + - `install_snapshot(&self, request: tonic::Request>) -> Result>` +- **Dependencies**: Arc, protobuf conversion utilities + +**RaftGrpcClient** +- **Responsibilities**: + - Wraps tonic-generated gRPC client for a single peer + - Implements retry logic with exponential backoff + - Handles timeout enforcement per RPC type + - Converts protobuf types to/from OpenRaft types + - Manages streaming for snapshot installation +- **Methods**: + - `call_vote(&mut self, req: VoteRequest) -> Result` (with retry) + - `call_append_entries(&mut self, req: AppendEntriesRequest) -> Result` (with retry) + - `call_install_snapshot(&mut self, req: InstallSnapshotRequest) -> Result` (streaming) + - `is_connected(&self) -> bool` + - `reconnect(&mut self) -> Result<()>` +- **Dependencies**: tonic::transport::Channel, RaftServiceClient (generated) + +**ConnectionPool** +- **Responsibilities**: + - Manages lazy connection establishment to peer nodes + - Caches active gRPC channels per node_id + - Handles connection health monitoring + - Provides thread-safe access via RwLock or DashMap + - Evicts stale/failed connections +- **Methods**: + - `get_or_connect(node_id: u64, addr: String) -> Result` + - `remove(node_id: u64)` (for connection failures) + - `health_check_all() -> Vec<(u64, bool)>` +- **Data Structure**: `HashMap` wrapped in `RwLock` +- **Dependencies**: tonic::transport::Channel, common::NodeId + +#### Service Responsibilities + +**Connection Management Service** +- ConnectionPool manages peer connections with lazy initialization +- Connections established on first RPC call (not at startup) +- Failed connections removed from pool and retried on next call +- Health checks run periodically (future enhancement) + +**Retry Coordination Service** +- RaftGrpcClient handles retry logic per RPC type +- Vote RPCs: No retry (election timeouts handle failures) +- AppendEntries RPCs: Retry 3 times with exponential backoff (50ms, 100ms, 200ms) +- InstallSnapshot RPCs: Retry 5 times with backoff (network instability tolerance) +- Transient errors (Unavailable, DeadlineExceeded) trigger retries +- Permanent errors (InvalidArgument, NotFound) fail immediately + +**Error Conversion Service** +- NetworkError enum provides bidirectional conversions +- tonic::Status → NetworkError (maps gRPC codes to domain errors) +- NetworkError → openraft::RPCError (integrates with OpenRaft) +- Preserves error context for debugging (node_id, RPC type, attempt count) + +### 2. Data Requirements + +#### Persistence: In-Memory Only +- **Storage Type**: In-memory connection pool only +- **No Database**: gRPC layer is stateless - connection pool recreated on restart +- **Justification**: All persistent state (log, snapshots) managed by storage crate via OpenRaft +- **Data Volume**: Small - connection metadata per peer (max 10 connections in Phase 1) +- **Access Patterns**: + - Read-heavy: Lookup peer connection from pool on every RPC + - Write-rare: Connection establishment only on first call or after failure + +#### Connection Metadata (Ephemeral) +```rust +struct ConnectionMetadata { + node_id: u64, + addr: String, + channel: tonic::transport::Channel, + client: RaftServiceClient, + established_at: Instant, + last_used: Instant, +} +``` + +### 3. Router Layer (gRPC Service) + +#### No Traditional HTTP Router +The gRPC server replaces the traditional HTTP router pattern used in service layers. + +**RaftService gRPC Definition** (`crates/raft/proto/raft.proto`): +```protobuf +service RaftService { + rpc RequestVote(VoteRequest) returns (VoteResponse); + rpc AppendEntries(AppendEntriesRequest) returns (AppendEntriesResponse); + rpc InstallSnapshot(stream InstallSnapshotRequest) returns (InstallSnapshotResponse); +} +``` + +**Server Configuration**: +- **Port**: 7379 (configurable) +- **Protocol**: HTTP/2 (gRPC requirement) +- **TLS**: Optional (Phase 4 - production readiness) +- **Concurrency**: Tokio runtime handles concurrent RPCs +- **Backpressure**: gRPC flow control built-in + +**Server Lifecycle**: +1. seshat binary starts RaftGrpcServer on port 7379 +2. Server binds to `0.0.0.0:7379` (all interfaces) +3. Server registers RaftService implementation +4. Incoming RPCs handled concurrently by tokio +5. Each RPC delegates to local RaftNode +6. Graceful shutdown on SIGTERM/SIGINT + +### 4. Events & Async Patterns + +#### No Event Bus in Phase 1 +- Raft is inherently request-response (synchronous RPC semantics) +- No async event bus needed for core consensus +- Future Phase 4 may add metrics events for observability + +#### Async Patterns Used +- All gRPC methods are async (tonic requirement) +- ConnectionPool uses async RwLock (tokio::sync::RwLock) +- Retry logic uses tokio::time::sleep for backoff +- Streaming snapshots use tonic::Streaming + +### 5. Dependencies + +#### Upstream Crates (Seshat) +- **openraft**: Provides `RaftNetwork` trait +- **common**: Provides `NodeId`, `Error`, configuration types + +#### Downstream Crates (Seshat) +- **seshat binary**: Starts gRPC server on port 7379 +- **RaftNode**: Uses OpenRaftNetwork for consensus operations + +#### External Libraries +- **tonic 0.11+**: gRPC framework for Rust +- **prost 0.14+**: Protobuf serialization (matches OpenRaft requirement) +- **tokio 1.x**: Async runtime +- **tower 0.4+**: Middleware for gRPC (timeouts, retries) +- **async-trait 0.1**: Async trait support + +### 6. Error Handling Strategy + +#### Error Types + +**NetworkError** (crates/raft/src/network/error.rs): +```rust +#[derive(Debug, thiserror::Error)] +pub enum NetworkError { + #[error("Connection failed to node {node_id}: {source}")] + ConnectionFailed { + node_id: u64, + #[source] + source: tonic::transport::Error, + }, + + #[error("RPC timeout after {timeout_ms}ms to node {node_id}")] + Timeout { + node_id: u64, + timeout_ms: u64, + }, + + #[error("RPC failed to node {node_id}: {status}")] + RpcFailed { + node_id: u64, + status: tonic::Status, + }, + + #[error("Serialization error: {0}")] + Serialization(String), + + #[error("Deserialization error: {0}")] + Deserialization(String), + + #[error("Node {node_id} not found in cluster membership")] + NodeNotFound { node_id: u64 }, + + #[error("Max retries ({max_retries}) exceeded for node {node_id}")] + MaxRetriesExceeded { + node_id: u64, + max_retries: u32, + }, +} +``` + +#### Error Propagation Chain + +``` +tonic::Status (gRPC error) + ↓ (convert in RaftGrpcClient) +NetworkError (raft crate) + ↓ (convert in OpenRaftNetwork) +openraft::RPCError (openraft crate) + ↓ (handled by OpenRaft) +RaftError (raft crate - if OpenRaft propagates up) + ↓ (convert in service layer) +KvServiceError / SqlServiceError + ↓ (format for client) +RespValue::Error / SQL Error Response +``` + +#### Retry Semantics + +**Transient Errors (Retry)**: +- `tonic::Code::Unavailable` → Retry with backoff +- `tonic::Code::DeadlineExceeded` → Retry with increased timeout +- `tonic::Code::ResourceExhausted` → Retry with backoff +- `tonic::Code::Unknown` → Retry once (may be transient) + +**Permanent Errors (Fail Fast)**: +- `tonic::Code::InvalidArgument` → Fail immediately (bad request) +- `tonic::Code::NotFound` → Fail immediately (wrong endpoint) +- `tonic::Code::PermissionDenied` → Fail immediately (auth failure) +- `tonic::Code::Unimplemented` → Fail immediately (version mismatch) + +**Retry Configuration**: +```rust +struct RetryConfig { + max_retries: u32, // 3 for AppendEntries, 5 for snapshots + initial_backoff_ms: u64, // 50ms + max_backoff_ms: u64, // 5000ms (5 seconds) + backoff_multiplier: f64, // 2.0 (exponential) +} +``` + +### 7. Integration Points + +#### Initialization Flow (seshat binary) +```rust +// crates/seshat/src/main.rs +async fn main() -> Result<()> { + // 1. Load configuration + let config = Config::load()?; + + // 2. Initialize storage + let storage = OpenRaftMemStorage::new()?; + + // 3. Create network layer + let network = OpenRaftNetwork::new( + config.node_id, + config.cluster_membership.clone(), + ); + + // 4. Create Raft node + let raft_node = RaftNode::new( + config.node_id, + Arc::new(config.raft_config), + Arc::new(network), + Arc::new(storage), + ).await?; + + // 5. Start gRPC server + let grpc_server = RaftGrpcServer::new(Arc::clone(&raft_node)); + let grpc_addr = format!("0.0.0.0:{}", config.raft_port); + + tokio::spawn(async move { + tonic::transport::Server::builder() + .add_service(RaftServiceServer::new(grpc_server)) + .serve(grpc_addr.parse().unwrap()) + .await + }); + + // 6. Start RESP server (client-facing) + // ... KV service initialization ... + + Ok(()) +} +``` + +#### RaftNode Integration +```rust +// RaftNode uses OpenRaftNetwork via OpenRaft's Raft instance +impl RaftNode { + async fn new( + id: u64, + config: Arc, + network: Arc, + storage: Arc, + ) -> Result { + // OpenRaft automatically uses network for RPCs + let raft = Raft::new(id, config, network, storage).await?; + Ok(Self { raft, storage }) + } +} +``` + +### 8. Performance Considerations + +#### Connection Pooling Strategy +- **Lazy Initialization**: Connections created on first RPC (not at startup) +- **Keep-Alive**: HTTP/2 connection reuse (built-in with tonic) +- **Timeout per RPC Type**: + - RequestVote: 500ms (election timeout sensitive) + - AppendEntries: 1000ms (heartbeat/replication) + - InstallSnapshot: 30000ms (large data transfer) + +#### Streaming for Large Snapshots +- InstallSnapshot uses streaming to avoid loading entire snapshot in memory +- Chunk size: 64KB per message +- Flow control via gRPC backpressure +- Receiver writes chunks directly to storage + +#### Memory Management +- Connection pool bounded to cluster size (small in Phase 1: 3-5 nodes) +- No unbounded buffers for messages +- Streaming prevents large snapshot memory spikes + +### 9. Observability + +#### Tracing Instrumentation +```rust +#[tracing::instrument(skip(self, req), fields(target = %target, term = %req.term))] +async fn send_append_entries( + &self, + target: u64, + req: AppendEntriesRequest, +) -> Result { + tracing::debug!("Sending AppendEntries to node {}", target); + // ... implementation ... +} +``` + +#### Metrics (Future - Phase 4) +- `raft_rpc_requests_total{type, status, target_node}`: Counter +- `raft_rpc_duration_seconds{type, target_node}`: Histogram +- `raft_network_errors_total{type, error_code, target_node}`: Counter +- `raft_connection_pool_size`: Gauge + +### 10. Testing Strategy + +#### Unit Tests +- Connection pool: concurrent get_or_connect +- Retry logic: transient vs permanent error handling +- Error conversions: tonic::Status → NetworkError → RPCError +- Timeout enforcement per RPC type + +#### Integration Tests +- 2-node cluster: leader election via gRPC +- 3-node cluster: log replication via AppendEntries +- Snapshot transfer: streaming InstallSnapshot +- Network partition simulation: connection failures + +#### Chaos Tests (Future - Phase 1D) +- Network partition during election +- Slow network (inject latency) +- Connection failures mid-RPC +- Snapshot transfer interruption + +## Architecture Diagrams + +### Component Interaction + +``` +┌─────────────────────────────────────────────────────────┐ +│ Seshat Binary │ +│ ┌────────────────────────────────────────────────────┐ │ +│ │ RaftGrpcServer (port 7379) │ │ +│ │ - Receives RPCs from peers │ │ +│ │ - Delegates to local RaftNode │ │ +│ └─────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ┌─────────────────▼──────────────────────────────────┐ │ +│ │ RaftNode (OpenRaft) │ │ +│ │ - Consensus logic │ │ +│ │ - Uses OpenRaftNetwork for outbound RPCs │ │ +│ └─────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ┌─────────────────▼──────────────────────────────────┐ │ +│ │ OpenRaftNetwork (RaftNetwork trait) │ │ +│ │ - send_append_entries() │ │ +│ │ - send_vote() │ │ +│ │ - send_install_snapshot() │ │ +│ └─────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ┌─────────────────▼──────────────────────────────────┐ │ +│ │ ConnectionPool │ │ +│ │ - HashMap │ │ +│ │ - Lazy connection establishment │ │ +│ └─────────────────┬──────────────────────────────────┘ │ +│ │ │ +│ ┌─────────────────▼──────────────────────────────────┐ │ +│ │ RaftGrpcClient (per peer) │ │ +│ │ - tonic Channel │ │ +│ │ - Retry logic │ │ +│ │ - Timeout enforcement │ │ +│ └─────────────────┬──────────────────────────────────┘ │ +└────────────────────┼──────────────────────────────────────┘ + │ + │ gRPC (HTTP/2) + ▼ +┌─────────────────────────────────────────────────────────┐ +│ Peer Node (port 7379) │ +│ ┌────────────────────────────────────────────────────┐ │ +│ │ RaftGrpcServer │ │ +│ │ - Receives RPC │ │ +│ │ - Processes via RaftNode │ │ +│ │ - Returns response │ │ +│ └────────────────────────────────────────────────────┘ │ +└─────────────────────────────────────────────────────────┘ +``` + +### RPC Flow: AppendEntries + +``` +Leader Node Follower Node + │ │ + │ 1. OpenRaft triggers replication │ + │ (internal) │ + │ │ + │ 2. Calls network.send_append_entries() + ├─────────────────────────────────────▶│ + │ AppendEntriesRequest │ + │ │ + │ 3. ConnectionPool.get_or_connect() │ + │ (lookup or establish connection) │ + │ │ + │ 4. RaftGrpcClient.call_append_entries() + │ (with retry logic) │ + │ │ + │ 5. gRPC call over HTTP/2 │ + ├─────────────────────────────────────▶│ + │ Serialized protobuf │ + │ │ + │ │ 6. RaftGrpcServer receives + │ │ tonic::Request + │ │ + │ │ 7. Delegates to RaftNode + │ │ (OpenRaft processes) + │ │ + │ 8. gRPC response │ + │◀─────────────────────────────────────┤ + │ AppendEntriesResponse │ + │ │ + │ 9. Return to OpenRaft │ + │ │ +``` + +## Implementation Phases + +### Phase 1: Protobuf Definitions (1-2 hours) +- Define raft.proto with RequestVote, AppendEntries, InstallSnapshot messages +- Generate Rust code with prost-build +- Create type conversions: OpenRaft types ↔ protobuf types + +### Phase 2: RaftGrpcClient (3-4 hours) +- Implement client wrapper around generated tonic client +- Add retry logic with exponential backoff +- Implement timeout enforcement per RPC type +- Unit tests for retry behavior + +### Phase 3: ConnectionPool (2-3 hours) +- Implement connection pool with lazy initialization +- Thread-safe access with RwLock or DashMap +- Connection health tracking +- Unit tests for concurrent access + +### Phase 4: OpenRaftNetwork (2-3 hours) +- Implement RaftNetwork trait +- Integrate ConnectionPool +- Route RPCs to appropriate clients +- Error conversion to OpenRaft types + +### Phase 5: RaftGrpcServer (3-4 hours) +- Implement gRPC service trait +- Delegate to local RaftNode +- Handle streaming for snapshots +- Integration tests with 2-node cluster + +### Phase 6: Integration & Testing (4-6 hours) +- Wire into seshat binary +- 3-node cluster integration tests +- Leader election via gRPC +- Log replication validation +- Snapshot transfer test + +**Total Estimated Effort**: 15-22 hours + +## Success Criteria + +- [ ] 3-node cluster performs leader election via gRPC +- [ ] Log replication works via AppendEntries RPCs +- [ ] Snapshot transfer completes via streaming InstallSnapshot +- [ ] Retry logic handles transient connection failures +- [ ] Connection pool reuses connections efficiently +- [ ] No prost version conflicts (all use 0.14) +- [ ] All unit tests pass (coverage >80%) +- [ ] Integration tests pass (2-node and 3-node clusters) + +## Future Enhancements (Post-Phase 1) + +- **TLS Support** (Phase 4): Mutual TLS for inter-node authentication +- **Compression** (Phase 4): gRPC message compression (gzip) +- **Connection Health Checks** (Phase 3): Periodic health checks and connection eviction +- **Metrics** (Phase 4): OpenTelemetry instrumentation for observability +- **Dynamic Membership** (Phase 3): Update connection pool on membership changes + +--- + +**Created**: 2025-10-26 +**Feature**: grpc-network-layer +**Phase**: 1B (Post-OpenRaft Migration) +**Priority**: HIGH (blocks 3-node cluster testing) +**Estimated Effort**: 15-22 hours diff --git a/docs/specs/grpc/requirements.json b/docs/specs/grpc/requirements.json new file mode 100644 index 0000000..72863e8 --- /dev/null +++ b/docs/specs/grpc/requirements.json @@ -0,0 +1,32 @@ +{ + "raw_user_story": "As a Raft node, I want to communicate with other nodes over gRPC so that I can participate in leader election, log replication, and consensus operations.", + "raw_criteria": [ + "Nodes can send and receive Raft RPC messages (AppendEntries, RequestVote, InstallSnapshot) over gRPC", + "Network layer handles connection management, retries, and timeouts automatically", + "Messages are serialized using Protocol Buffers for efficient transport", + "Network errors are properly propagated to the Raft layer for handling", + "Multiple nodes can communicate concurrently without blocking" + ], + "raw_rules": [ + "gRPC service runs on port 50051 (configurable)", + "Connection timeout: 5 seconds", + "Request timeout: 10 seconds (longer for InstallSnapshot)", + "Maximum message size: 10MB (for snapshot chunks)", + "TLS optional in Phase 1, required in Phase 4" + ], + "raw_scope": { + "included": [ + "gRPC service definition for Raft RPC messages", + "Server implementation for receiving Raft messages", + "Client implementation for sending messages to peers", + "Connection pooling and lifecycle management", + "Error handling and retry logic" + ], + "excluded": [ + "TLS/mTLS authentication (Phase 4)", + "Load balancing across multiple endpoints", + "Streaming for large snapshots (optimization for later)", + "Metrics/observability (Phase 4)" + ] + } +} diff --git a/docs/specs/grpc/spec-lite.md b/docs/specs/grpc/spec-lite.md new file mode 100644 index 0000000..164fb26 --- /dev/null +++ b/docs/specs/grpc/spec-lite.md @@ -0,0 +1,50 @@ +# gRPC - Condensed Spec + +## User Story +As a Raft node, I want to communicate with other nodes over gRPC so that I can participate in leader election, log replication, and consensus operations + +## Key Criteria +1. RequestVote RPC completes within 10s with proper vote response +2. AppendEntries RPC persists log entries and acknowledges within 10s +3. InstallSnapshot RPC handles chunks up to 10MB successfully +4. Network failures propagate as RaftError types to openraft layer +5. Concurrent RPCs process without blocking +6. Connection timeout 5s, retry 3x with exponential backoff +7. Protobuf serialization using prost 0.14 + +## Critical Rules +- Port 7379 (internal Raft), distinct from 6379 (client RESP) +- Timeouts: 5s connection, 10s request, 30s InstallSnapshot +- Max message 10MB, retry 3x exponential backoff +- HashMap connection pool +- tonic::Status -> RaftError mapping required +- TLS optional Phase 1, required Phase 4 + +## Scope +**IN**: gRPC service (RequestVote/AppendEntries/InstallSnapshot), server on 7379, client with pooling, RaftNetwork trait impl, retry logic, protobuf schemas, integration tests, replaces StubNetwork + +**OUT**: TLS (Phase 4), load balancing, streaming snapshots, metrics (Phase 4), dynamic membership (Phase 3) + +## Dependencies +- openraft 0.10+ (RaftNetwork trait) +- storage crate (persistence) +- common crate (NodeId, ClusterId) +- tonic 0.11+ / prost 0.14+ (MUST match versions) +- tokio, thiserror, tracing + +## Files +``` +crates/raft/proto/raft.proto # Protobuf schema +crates/raft/src/grpc_server.rs # Server impl +crates/raft/src/grpc_client.rs # Client impl +crates/raft/src/network.rs # RaftNetwork trait (replaces network_stub.rs) +``` + +## Architecture +- **Trait**: openraft::RaftNetwork +- **Pool**: HashMap with DNS discovery +- **Error**: tonic::Status -> openraft::RaftError +- **Tests**: Unit (protobuf), Integration (2-node), Chaos (11 tests) + +## Alignment +Phase 1 MVP: 3-node cluster with Raft consensus. Enables internal node communication on 7379, supports leader election <2s and chaos tests. diff --git a/docs/specs/grpc/spec.json b/docs/specs/grpc/spec.json new file mode 100644 index 0000000..5bd88a0 --- /dev/null +++ b/docs/specs/grpc/spec.json @@ -0,0 +1,112 @@ +{ + "feature": "grpc", + "user_story": "As a Raft node, I want to communicate with other nodes over gRPC so that I can participate in leader election, log replication, and consensus operations", + "acceptance_criteria": [ + "GIVEN a 3-node cluster, WHEN node 1 sends RequestVote RPC to node 2, THEN node 2 receives the message, processes it, and returns a vote response within 10 seconds", + "GIVEN a leader node, WHEN it sends AppendEntries RPC to a follower, THEN the follower receives log entries, persists them, and acknowledges within 10 seconds", + "GIVEN a new node joining the cluster, WHEN the leader sends InstallSnapshot RPC, THEN the node receives snapshot chunks up to 10MB and applies them successfully", + "GIVEN network failure between nodes, WHEN gRPC connection fails, THEN the error is propagated to openraft layer with proper RaftError type", + "GIVEN multiple concurrent RPCs, WHEN nodes send messages simultaneously, THEN all messages are processed without blocking each other", + "GIVEN a connection timeout of 5 seconds, WHEN a peer is unreachable, THEN the client fails the request within 5 seconds and retries up to 3 times with exponential backoff", + "GIVEN protobuf serialization, WHEN messages are sent over the wire, THEN they are efficiently encoded/decoded using prost 0.14" + ], + "business_rules": [ + "gRPC server MUST run on port 7379 (internal Raft RPC, distinct from port 6379 for client RESP protocol)", + "Connection timeout MUST be 5 seconds", + "Request timeout MUST be 10 seconds for RequestVote and AppendEntries", + "Request timeout MUST be 30 seconds for InstallSnapshot (larger snapshots need more time)", + "Maximum message size MUST be 10MB to support snapshot chunks", + "Retry logic MUST use exponential backoff with maximum 3 attempts", + "TLS/mTLS is OPTIONAL in Phase 1, will become REQUIRED in Phase 4", + "All version dependencies MUST use prost 0.14 (matches tonic 0.11 requirement)", + "Connection pool MUST maintain HashMap for peer connections", + "Peer discovery MUST use DNS-based hostname:port format", + "Error handling MUST map tonic::Status to openraft RaftError types" + ], + "scope": { + "included": [ + "gRPC service definition for Raft RPC messages (RequestVote, AppendEntries, InstallSnapshot)", + "Server implementation for receiving Raft messages on port 7379", + "Client implementation for sending messages to peers", + "Implementation of openraft::RaftNetwork trait", + "Connection pooling and lifecycle management (HashMap)", + "Error handling with proper tonic::Status to RaftError mapping", + "Retry logic with exponential backoff (max 3 attempts)", + "Protocol Buffers schema definitions in crates/raft/proto/", + "Integration with storage crate for state persistence", + "Unit tests for protobuf serialization/deserialization", + "Integration tests for 2-node message exchange", + "Replacement of StubNetwork placeholder with OpenRaftNetwork" + ], + "excluded": [ + "TLS/mTLS authentication (deferred to Phase 4)", + "Load balancing across multiple endpoints (not needed for fixed 3-node cluster)", + "Streaming for large snapshots (optimization for later phases)", + "Metrics/observability (deferred to Phase 4)", + "Dynamic cluster membership (Phase 3)", + "Client-facing RESP protocol (handled by protocol-resp crate)" + ] + }, + "aligns_with": "Phase 1 goal of building a 3-node distributed Redis-compatible KV store with Raft consensus. The gRPC implementation enables internal node-to-node communication on port 7379, separate from client-facing RESP on port 6379. This directly supports the success criteria of leader election <2s and passing 11 chaos tests.", + "dependencies": [ + "openraft crate 0.10+ for RaftNetwork trait and type config", + "storage crate for state persistence (log entries, snapshots)", + "common crate for shared types (NodeId, ClusterId)", + "tonic 0.11+ for gRPC server/client implementation", + "prost 0.14+ for Protocol Buffers serialization (MUST match tonic version)", + "tokio async runtime for connection management", + "thiserror for error type definitions", + "tracing for structured logging" + ], + "conflicts": [ + { + "issue": "Port number mismatch", + "details": "Raw requirements specified port 50051, but project context requires port 7379 for internal Raft RPC", + "resolution": "Use port 7379 per project architecture standards" + }, + { + "issue": "Timeout value clarity", + "details": "InstallSnapshot needs longer timeout than standard 10s due to large message size", + "resolution": "Added specific 30s timeout rule for InstallSnapshot operations" + } + ], + "technical_details": { + "port": 7379, + "protocol": "gRPC with tonic 0.11+ / prost 0.14+", + "rpc_types": [ + "RequestVote (leader election)", + "AppendEntries (log replication)", + "InstallSnapshot (snapshot transfer)" + ], + "replaces": "StubNetwork placeholder from openraft migration spec", + "implements_trait": "openraft::RaftNetwork", + "file_structure": { + "proto_definitions": "crates/raft/proto/raft.proto", + "server_impl": "crates/raft/src/grpc_server.rs", + "client_impl": "crates/raft/src/grpc_client.rs", + "network_layer": "crates/raft/src/network.rs (replaces network_stub.rs)" + }, + "connection_strategy": { + "pooling": "HashMap", + "discovery": "DNS-based hostname:port", + "retry": "Exponential backoff, max 3 attempts" + }, + "error_mapping": "tonic::Status -> openraft::error::RaftError", + "async_runtime": "tokio", + "testing_strategy": { + "unit_tests": "Protobuf serialization, client/server initialization", + "integration_tests": "2-node message exchange scenarios", + "chaos_tests": "Included in 11 chaos test suite (network partitions, failures)" + } + }, + "validation_notes": [ + "User story properly formatted with actor (Raft node), action (communicate over gRPC), and benefit (participate in consensus)", + "All acceptance criteria converted to testable GIVEN/WHEN/THEN format", + "Business rules aligned with project standards (port 7379, prost 0.14, 5s timeout)", + "Port conflict resolved (50051 -> 7379 per architecture)", + "Scope clarified to exclude Phase 4 features (TLS, metrics)", + "Dependencies explicitly list version constraints to avoid compatibility issues", + "Technical details reference specific file locations and trait implementations", + "Ready for /spec:design phase to generate detailed technical design" + ] +} diff --git a/docs/specs/grpc/spec.md b/docs/specs/grpc/spec.md new file mode 100644 index 0000000..49d2480 --- /dev/null +++ b/docs/specs/grpc/spec.md @@ -0,0 +1,132 @@ +# gRPC Specification + +## User Story + +As a Raft node, I want to communicate with other nodes over gRPC so that I can participate in leader election, log replication, and consensus operations + +## Acceptance Criteria + +1. GIVEN a 3-node cluster, WHEN node 1 sends RequestVote RPC to node 2, THEN node 2 receives the message, processes it, and returns a vote response within 10 seconds +2. GIVEN a leader node, WHEN it sends AppendEntries RPC to a follower, THEN the follower receives log entries, persists them, and acknowledges within 10 seconds +3. GIVEN a new node joining the cluster, WHEN the leader sends InstallSnapshot RPC, THEN the node receives snapshot chunks up to 10MB and applies them successfully +4. GIVEN network failure between nodes, WHEN gRPC connection fails, THEN the error is propagated to openraft layer with proper RaftError type +5. GIVEN multiple concurrent RPCs, WHEN nodes send messages simultaneously, THEN all messages are processed without blocking each other +6. GIVEN a connection timeout of 5 seconds, WHEN a peer is unreachable, THEN the client fails the request within 5 seconds and retries up to 3 times with exponential backoff +7. GIVEN protobuf serialization, WHEN messages are sent over the wire, THEN they are efficiently encoded/decoded using prost 0.14 + +## Business Rules + +- gRPC server MUST run on port 7379 (internal Raft RPC, distinct from port 6379 for client RESP protocol) +- Connection timeout MUST be 5 seconds +- Request timeout MUST be 10 seconds for RequestVote and AppendEntries +- Request timeout MUST be 30 seconds for InstallSnapshot (larger snapshots need more time) +- Maximum message size MUST be 10MB to support snapshot chunks +- Retry logic MUST use exponential backoff with maximum 3 attempts +- TLS/mTLS is OPTIONAL in Phase 1, will become REQUIRED in Phase 4 +- All version dependencies MUST use prost 0.14 (matches tonic 0.11 requirement) +- Connection pool MUST maintain HashMap for peer connections +- Peer discovery MUST use DNS-based hostname:port format +- Error handling MUST map tonic::Status to openraft RaftError types + +## Scope + +### Included +- gRPC service definition for Raft RPC messages (RequestVote, AppendEntries, InstallSnapshot) +- Server implementation for receiving Raft messages on port 7379 +- Client implementation for sending messages to peers +- Implementation of openraft::RaftNetwork trait +- Connection pooling and lifecycle management (HashMap) +- Error handling with proper tonic::Status to RaftError mapping +- Retry logic with exponential backoff (max 3 attempts) +- Protocol Buffers schema definitions in crates/raft/proto/ +- Integration with storage crate for state persistence +- Unit tests for protobuf serialization/deserialization +- Integration tests for 2-node message exchange +- Replacement of StubNetwork placeholder with OpenRaftNetwork + +### Excluded +- TLS/mTLS authentication (deferred to Phase 4) +- Load balancing across multiple endpoints (not needed for fixed 3-node cluster) +- Streaming for large snapshots (optimization for later phases) +- Metrics/observability (deferred to Phase 4) +- Dynamic cluster membership (Phase 3) +- Client-facing RESP protocol (handled by protocol-resp crate) + +## Dependencies + +- openraft crate 0.10+ for RaftNetwork trait and type config +- storage crate for state persistence (log entries, snapshots) +- common crate for shared types (NodeId, ClusterId) +- tonic 0.11+ for gRPC server/client implementation +- prost 0.14+ for Protocol Buffers serialization (MUST match tonic version) +- tokio async runtime for connection management +- thiserror for error type definitions +- tracing for structured logging + +## Technical Details + +**Port**: 7379 (internal Raft RPC) +**Protocol**: gRPC with tonic 0.11+ / prost 0.14+ +**Async Runtime**: tokio + +### RPC Types +- RequestVote (leader election) +- AppendEntries (log replication) +- InstallSnapshot (snapshot transfer) + +### Trait Implementation +Implements `openraft::RaftNetwork` trait + +Replaces `StubNetwork` placeholder from openraft migration spec + +### File Structure +``` +crates/raft/ +├── proto/ +│ └── raft.proto # Protocol Buffers schema +├── src/ +│ ├── grpc_server.rs # Server implementation +│ ├── grpc_client.rs # Client implementation +│ └── network.rs # RaftNetwork trait (replaces network_stub.rs) +``` + +### Connection Strategy +- **Pooling**: HashMap +- **Discovery**: DNS-based hostname:port +- **Retry**: Exponential backoff, max 3 attempts + +### Error Mapping +tonic::Status -> openraft::error::RaftError + +### Testing Strategy +- **Unit tests**: Protobuf serialization, client/server initialization +- **Integration tests**: 2-node message exchange scenarios +- **Chaos tests**: Included in 11 chaos test suite (network partitions, failures) + +## Conflicts Resolved + +### Port Number Mismatch +- **Issue**: Raw requirements specified port 50051, but project context requires port 7379 for internal Raft RPC +- **Resolution**: Use port 7379 per project architecture standards + +### Timeout Value Clarity +- **Issue**: InstallSnapshot needs longer timeout than standard 10s due to large message size +- **Resolution**: Added specific 30s timeout rule for InstallSnapshot operations + +## Diagrams + + + +## Alignment + +This feature aligns with: Phase 1 goal of building a 3-node distributed Redis-compatible KV store with Raft consensus. The gRPC implementation enables internal node-to-node communication on port 7379, separate from client-facing RESP on port 6379. This directly supports the success criteria of leader election <2s and passing 11 chaos tests. + +## Implementation Guidance + +This specification is ready for the `/spec:design` phase to generate: +1. Detailed protobuf schema with all message types +2. Server implementation architecture (async handlers, connection lifecycle) +3. Client implementation architecture (connection pooling, retry logic) +4. RaftNetwork trait implementation mapping +5. Error type definitions and conversion logic +6. Test scenarios for unit and integration testing