diff --git a/Cargo.lock b/Cargo.lock index 6f98aba..f3934f1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1238,6 +1238,7 @@ dependencies = [ "serde_json", "seshat-common", "seshat-kv", + "seshat-storage", "slog", "thiserror", "tokio", @@ -1250,6 +1251,10 @@ dependencies = [ [[package]] name = "seshat-storage" version = "0.1.0" +dependencies = [ + "prost 0.11.9", + "raft", +] [[package]] name = "signal-hook-registry" diff --git a/crates/raft/Cargo.toml b/crates/raft/Cargo.toml index 0ede79f..a89e3d0 100644 --- a/crates/raft/Cargo.toml +++ b/crates/raft/Cargo.toml @@ -11,6 +11,7 @@ keywords.workspace = true [dependencies] seshat-common = { path = "../common" } seshat-kv = { path = "../kv" } +seshat-storage = { path = "../storage" } raft = { version = "0.7", default-features = false, features = ["prost-codec"] } tokio = { version = "1", features = ["full"] } serde = { version = "1", features = ["derive"] } diff --git a/crates/raft/src/lib.rs b/crates/raft/src/lib.rs index 9c78bb9..16a2731 100644 --- a/crates/raft/src/lib.rs +++ b/crates/raft/src/lib.rs @@ -44,14 +44,15 @@ pub mod config; pub mod node; pub mod state_machine; -pub mod storage; pub mod transport; // Re-export main types for convenience pub use config::{ClusterConfig, InitialMember, NodeConfig, RaftConfig}; pub use node::RaftNode; pub use state_machine::StateMachine; -pub use storage::MemStorage; + +// Re-export storage from seshat-storage crate +pub use seshat_storage::MemStorage; // Re-export raft-rs message types pub use raft::prelude::{Entry, Message, MessageType, Snapshot}; diff --git a/crates/raft/src/node.rs b/crates/raft/src/node.rs index 8914aea..c1845b2 100644 --- a/crates/raft/src/node.rs +++ b/crates/raft/src/node.rs @@ -3,8 +3,9 @@ //! The RaftNode integrates MemStorage, StateMachine, and raft-rs RawNode //! to provide a complete Raft consensus implementation. 
-use crate::{state_machine::StateMachine, storage::MemStorage}; +use crate::state_machine::StateMachine; use raft::RawNode; +use seshat_storage::MemStorage; /// Raft node that orchestrates consensus using raft-rs. /// diff --git a/crates/raft/src/storage.rs b/crates/raft/src/storage.rs deleted file mode 100644 index 9d6f1b3..0000000 --- a/crates/raft/src/storage.rs +++ /dev/null @@ -1,3600 +0,0 @@ -//! In-memory storage implementation for Raft consensus. -//! -//! This module provides `MemStorage`, an in-memory implementation suitable for -//! testing and development. For production use, a persistent storage backend -//! (e.g., RocksDB) should be used instead. -//! -//! # Protobuf Version Bridging -//! -//! This module uses `prost_old` (prost 0.11) to maintain compatibility with `raft-rs`, -//! which depends on prost 0.11. Our transport layer uses the latest prost 0.14 for -//! gRPC communication with tonic 0.14. The bridging happens in the transport layer -//! via binary serialization/deserialization. -//! -//! - `prost_old` (0.11): Used here for raft-rs `eraftpb` types (Entry, HardState, etc.) -//! - `prost` (0.14): Used in transport layer for gRPC wire protocol -//! -//! # Thread Safety -//! -//! All fields are wrapped in `RwLock` to provide thread-safe concurrent access. -//! Multiple readers can access the data simultaneously, but writers have exclusive access. -//! -//! ## Lock Poisoning Philosophy -//! -//! This implementation uses `.expect()` instead of `.unwrap()` for lock acquisition -//! to provide clear error messages when lock poisoning occurs. Lock poisoning indicates -//! that a thread panicked while holding the lock, leaving the data in a potentially -//! inconsistent state. -//! -//! **For Phase 1 (MemStorage)**: Lock poisoning is considered a serious bug that should -//! cause the application to panic immediately with a descriptive message. This approach -//! is acceptable because: -//! 1. 
MemStorage is used for testing and single-node scenarios -//! 2. Lock poisoning indicates a critical bug in the concurrent access logic -//! 3. Continuing with poisoned state would lead to data corruption -//! -//! **For Future Production Storage (RocksDB)**: Lock poisoning should be handled gracefully -//! by returning a proper error through the Raft error system, allowing the node to -//! potentially recover or fail safely without cascading panics. -//! -//! The `.expect()` messages clearly identify which lock failed, making debugging easier -//! during development and testing. - -use prost_old::Message; -use raft::eraftpb::{ConfState, Entry, HardState, Snapshot}; -use raft::{RaftState, StorageError}; -use std::sync::RwLock; - -/// In-memory storage for Raft state. -/// -/// `MemStorage` stores all Raft consensus state in memory: -/// - `hard_state`: Persistent voting state (term, vote, commit) -/// - `conf_state`: Cluster membership configuration -/// - `entries`: Log entries for replication -/// - `snapshot`: Snapshot data for log compaction -/// -/// # Examples -/// -/// ``` -/// use seshat_raft::MemStorage; -/// -/// let storage = MemStorage::new(); -/// // Storage is ready to use with default values -/// ``` -#[derive(Debug)] -pub struct MemStorage { - /// Persistent state that must survive crashes. - /// - /// Contains the current term, the candidate that received the vote - /// in the current term, and the highest log entry known to be committed. - hard_state: RwLock, - - /// Current cluster membership configuration. - /// - /// Tracks which nodes are voters, learners, and which nodes are - /// being added or removed from the cluster. - conf_state: RwLock, - - /// Log entries for state machine replication. - /// - /// Entries are indexed starting at 1. The vector may not start at index 1 - /// after log compaction (snapshot creation). - entries: RwLock>, - - /// Current snapshot for log compaction. 
- /// - /// Represents the state machine state at a particular point in time, - /// allowing truncation of old log entries. - snapshot: RwLock, -} - -impl MemStorage { - /// Creates a new `MemStorage` with default values. - /// - /// All fields are initialized to their default states: - /// - Empty hard state (term=0, vote=0, commit=0) - /// - Empty configuration state - /// - Empty log entries - /// - Empty snapshot - /// - /// # Examples - /// - /// ``` - /// use seshat_raft::MemStorage; - /// - /// let storage = MemStorage::new(); - /// // Storage is now ready to use - /// ``` - pub fn new() -> Self { - Self { - hard_state: RwLock::new(HardState::default()), - conf_state: RwLock::new(ConfState::default()), - entries: RwLock::new(Vec::new()), - snapshot: RwLock::new(Snapshot::default()), - } - } - - /// Returns the initial Raft state from storage. - /// - /// This method reads the current hard state and configuration state - /// from the storage and returns them as a `RaftState`. This is typically - /// called when initializing a Raft node to restore its persisted state. - /// - /// # Thread Safety - /// - /// This method acquires read locks on both `hard_state` and `conf_state`. - /// Multiple concurrent calls are safe and efficient. 
- /// - /// # Examples - /// - /// ``` - /// use seshat_raft::MemStorage; - /// - /// let storage = MemStorage::new(); - /// let state = storage.initial_state().unwrap(); - /// assert_eq!(state.hard_state.term, 0); - /// assert_eq!(state.hard_state.vote, 0); - /// assert_eq!(state.hard_state.commit, 0); - /// ``` - /// - /// # Errors - /// - /// Returns an error if: - /// - Lock acquisition fails (lock poisoning) - pub fn initial_state(&self) -> raft::Result { - let hard_state = self - .hard_state - .read() - .expect("Hard state lock poisoned - indicates bug in concurrent access"); - let conf_state = self - .conf_state - .read() - .expect("Conf state lock poisoned - indicates bug in concurrent access"); - - Ok(RaftState { - hard_state: hard_state.clone(), - conf_state: conf_state.clone(), - }) - } - - /// Sets the hard state of the storage. - /// - /// This is primarily used for testing and during Raft ready processing - /// to persist the updated hard state. - /// - /// # Examples - /// - /// ``` - /// use seshat_raft::MemStorage; - /// use raft::eraftpb::HardState; - /// - /// let storage = MemStorage::new(); - /// let mut hs = HardState::default(); - /// hs.term = 5; - /// hs.vote = 1; - /// hs.commit = 10; - /// storage.set_hard_state(hs); - /// - /// let state = storage.initial_state().unwrap(); - /// assert_eq!(state.hard_state.term, 5); - /// assert_eq!(state.hard_state.vote, 1); - /// assert_eq!(state.hard_state.commit, 10); - /// ``` - pub fn set_hard_state(&self, hs: HardState) { - *self - .hard_state - .write() - .expect("Hard state lock poisoned - indicates bug in concurrent access") = hs; - } - - /// Sets the configuration state of the storage. - /// - /// This is primarily used for testing and during Raft ready processing - /// to persist the updated configuration state. 
- /// - /// # Examples - /// - /// ``` - /// use seshat_raft::MemStorage; - /// use raft::eraftpb::ConfState; - /// - /// let storage = MemStorage::new(); - /// let mut cs = ConfState::default(); - /// cs.voters = vec![1, 2, 3]; - /// storage.set_conf_state(cs); - /// - /// let state = storage.initial_state().unwrap(); - /// assert_eq!(state.conf_state.voters, vec![1, 2, 3]); - /// ``` - pub fn set_conf_state(&self, cs: ConfState) { - *self - .conf_state - .write() - .expect("Conf state lock poisoned - indicates bug in concurrent access") = cs; - } - - /// Returns a range of log entries. - /// - /// Returns log entries in the range `[low, high)`, limiting the total size - /// to `max_size` bytes if specified. - /// - /// # Arguments - /// - /// * `low` - The inclusive lower bound of the range (first index to return) - /// * `high` - The exclusive upper bound of the range (one past the last index) - /// * `max_size` - Optional maximum total size in bytes of returned entries - /// - /// # Returns - /// - /// Returns a `Result` containing: - /// - `Ok(Vec)` - The requested entries (may be empty if low == high) - /// - `Err(StorageError::Compacted)` - If `low` is less than `first_index()` - /// - `Err(StorageError::Unavailable)` - If `high` is greater than `last_index() + 1` - /// - /// # Examples - /// - /// ``` - /// use seshat_raft::MemStorage; - /// use raft::eraftpb::Entry; - /// - /// let storage = MemStorage::new(); - /// // With empty storage, requesting any range returns empty or error - /// let result = storage.entries(1, 1, None); - /// assert!(result.is_ok()); - /// assert_eq!(result.unwrap().len(), 0); - /// ``` - pub fn entries(&self, low: u64, high: u64, max_size: Option) -> raft::Result> { - // Handle empty range first - if low >= high { - return Ok(Vec::new()); - } - - // Acquire all locks once for consistent state (fixes TOCTOU race) - let snapshot = self - .snapshot - .read() - .expect("Snapshot lock poisoned - indicates bug in concurrent access"); - 
let entries = self - .entries - .read() - .expect("Entries lock poisoned - indicates bug in concurrent access"); - - // Calculate first and last indices from locked state - let first = if snapshot.get_metadata().index > 0 { - snapshot.get_metadata().index + 1 - } else if !entries.is_empty() { - entries[0].index - } else { - 1 - }; - - let last = if let Some(last_entry) = entries.last() { - last_entry.index - } else { - snapshot.get_metadata().index - }; - - // Check if low is before first available entry (compacted) - if low < first { - return Err(raft::Error::Store(StorageError::Compacted)); - } - - // Check if high is beyond available entries - // Note: high can be last_index + 1 (to request all entries up to and including last_index) - if high > last + 1 { - return Err(raft::Error::Store(StorageError::Unavailable)); - } - - // Handle empty log - if entries.is_empty() { - return Ok(Vec::new()); - } - - // Calculate slice bounds - // entries vector may not start at index 1 after compaction - let offset = entries[0].index; - - // Convert logical indices to vector indices - let start_idx = (low.saturating_sub(offset)) as usize; - let end_idx = (high.saturating_sub(offset)) as usize; - - // Ensure we don't go out of bounds - let start_idx = start_idx.min(entries.len()); - let end_idx = end_idx.min(entries.len()); - - // If start >= end, return empty - if start_idx >= end_idx { - return Ok(Vec::new()); - } - - // Get the slice - let mut result = Vec::new(); - let mut total_size: u64 = 0; - - for entry in &entries[start_idx..end_idx] { - // Calculate entry size using prost's encoded_len - let entry_size = entry.encoded_len() as u64; - - // If we have a size limit and we've already added at least one entry - // and adding this entry would exceed the limit, stop - if let Some(max) = max_size { - if !result.is_empty() && total_size + entry_size > max { - break; - } - } - - result.push(entry.clone()); - total_size += entry_size; - } - - // Always return at least one entry 
if any are available - // (even if it exceeds max_size) - if result.is_empty() && start_idx < end_idx { - result.push(entries[start_idx].clone()); - } - - Ok(result) - } - - /// Returns the term of the entry at the given index. - /// - /// # Arguments - /// - /// * `index` - The log index to query - /// - /// # Returns - /// - /// Returns a `Result` containing: - /// - `Ok(term)` - The term of the entry at the given index - /// - `Err(StorageError::Compacted)` - If the index has been compacted - /// - `Err(StorageError::Unavailable)` - If the index is not yet available - /// - /// # Special Cases - /// - /// - `term(0)` always returns `0` (by Raft convention) - /// - If `index == snapshot.metadata.index`, returns `snapshot.metadata.term` - /// - /// # Examples - /// - /// ``` - /// use seshat_raft::MemStorage; - /// use raft::eraftpb::Entry; - /// - /// let storage = MemStorage::new(); - /// - /// // Index 0 always returns term 0 - /// assert_eq!(storage.term(0).unwrap(), 0); - /// - /// // Add entries and query their terms - /// let entries = vec![ - /// Entry { index: 1, term: 1, ..Default::default() }, - /// Entry { index: 2, term: 2, ..Default::default() }, - /// ]; - /// storage.append(&entries); - /// assert_eq!(storage.term(1).unwrap(), 1); - /// assert_eq!(storage.term(2).unwrap(), 2); - /// ``` - pub fn term(&self, index: u64) -> raft::Result { - // Special case: index 0 always has term 0 - if index == 0 { - return Ok(0); - } - - // Acquire locks once for consistent state - let snapshot = self - .snapshot - .read() - .expect("Snapshot lock poisoned - indicates bug in concurrent access"); - let entries = self - .entries - .read() - .expect("Entries lock poisoned - indicates bug in concurrent access"); - - // Calculate bounds from locked state - let first = if snapshot.get_metadata().index > 0 { - snapshot.get_metadata().index + 1 - } else if !entries.is_empty() { - entries[0].index - } else { - 1 - }; - - let last = if let Some(last_entry) = entries.last() 
{ - last_entry.index - } else { - snapshot.get_metadata().index - }; - - // Check if this is exactly the snapshot index - if index == snapshot.get_metadata().index { - return Ok(snapshot.get_metadata().term); - } - - // Check if index is before first available entry (compacted) - if index < first { - return Err(raft::Error::Store(StorageError::Compacted)); - } - - // Check if index is beyond available entries - if index > last { - return Err(raft::Error::Store(StorageError::Unavailable)); - } - - // Handle empty log (shouldn't happen given bounds checks, but be safe) - if entries.is_empty() { - return Err(raft::Error::Store(StorageError::Unavailable)); - } - - // Calculate offset - let offset = entries[0].index; - let vec_index = (index - offset) as usize; - - // Bounds check - if vec_index >= entries.len() { - return Err(raft::Error::Store(StorageError::Unavailable)); - } - - Ok(entries[vec_index].term) - } - - /// Returns the first index in the log. - /// - /// This is the index of the first entry available in the log. After log compaction, - /// this may be greater than 1 (the first entry that was ever appended). - /// - /// # Returns - /// - /// - If there's a snapshot, returns `snapshot.metadata.index + 1` - /// - Otherwise, returns 1 (the default first index) - /// - /// # Examples - /// - /// ``` - /// use seshat_raft::MemStorage; - /// - /// let storage = MemStorage::new(); - /// assert_eq!(storage.first_index().unwrap(), 1); - /// ``` - pub fn first_index(&self) -> raft::Result { - let snapshot = self - .snapshot - .read() - .expect("Snapshot lock poisoned - indicates bug in concurrent access"); - let entries = self - .entries - .read() - .expect("Entries lock poisoned - indicates bug in concurrent access"); - - if snapshot.get_metadata().index > 0 { - Ok(snapshot.get_metadata().index + 1) - } else if !entries.is_empty() { - Ok(entries[0].index) - } else { - Ok(1) - } - } - - /// Returns the last index in the log. 
- /// - /// This is the index of the last entry available in the log. - /// - /// # Returns - /// - /// - If there are entries, returns the index of the last entry - /// - If there's a snapshot but no entries, returns the snapshot index - /// - Otherwise, returns 0 (empty log) - /// - /// # Examples - /// - /// ``` - /// use seshat_raft::MemStorage; - /// - /// let storage = MemStorage::new(); - /// assert_eq!(storage.last_index().unwrap(), 0); - /// ``` - pub fn last_index(&self) -> raft::Result { - let entries = self - .entries - .read() - .expect("Entries lock poisoned - indicates bug in concurrent access"); - let snapshot = self - .snapshot - .read() - .expect("Snapshot lock poisoned - indicates bug in concurrent access"); - - if let Some(last) = entries.last() { - Ok(last.index) - } else { - Ok(snapshot.get_metadata().index) - } - } - - /// Returns the current snapshot. - /// - /// In Phase 1, this is simplified to always return the stored snapshot - /// regardless of the `request_index` parameter. In later phases, this - /// would check if the snapshot is ready for the given index. - /// - /// # Arguments - /// - /// * `request_index` - The index for which a snapshot is requested (unused in Phase 1) - /// - /// # Returns - /// - /// Returns a `Result` containing: - /// - `Ok(Snapshot)` - A clone of the current snapshot - /// - /// # Phase 1 Simplification - /// - /// This implementation ignores `request_index` and always returns the current - /// snapshot. Future phases may return `StorageError::SnapshotTemporarilyUnavailable` - /// if a snapshot is being created for a specific index. - /// - /// # Thread Safety - /// - /// This method acquires a read lock on the snapshot field. Multiple concurrent - /// calls are safe and efficient. 
- /// - /// # Examples - /// - /// ``` - /// use seshat_raft::MemStorage; - /// use raft::eraftpb::Snapshot; - /// - /// let storage = MemStorage::new(); - /// - /// // Empty storage returns default snapshot - /// let snapshot = storage.snapshot(0).unwrap(); - /// assert_eq!(snapshot.get_metadata().index, 0); - /// assert_eq!(snapshot.get_metadata().term, 0); - /// assert!(snapshot.data.is_empty()); - /// ``` - pub fn snapshot(&self, _request_index: u64) -> raft::Result { - // Phase 1: Simplified implementation - // Just return the current snapshot, ignoring request_index - let snapshot = self - .snapshot - .read() - .expect("Snapshot lock poisoned - indicates bug in concurrent access"); - Ok(snapshot.clone()) - } - - /// Appends entries to the log. - /// - /// This is a helper method for testing. In production use, entries are - /// typically appended through the Raft ready processing. - /// - /// # Arguments - /// - /// * `ents` - Slice of entries to append - /// - /// # Examples - /// - /// ``` - /// use seshat_raft::MemStorage; - /// use raft::eraftpb::Entry; - /// - /// let storage = MemStorage::new(); - /// let entries = vec![ - /// Entry { index: 1, term: 1, ..Default::default() }, - /// Entry { index: 2, term: 1, ..Default::default() }, - /// ]; - /// storage.append(&entries); - /// ``` - pub fn append(&self, ents: &[Entry]) { - let mut entries = self - .entries - .write() - .expect("Entries lock poisoned - indicates bug in concurrent access"); - entries.extend_from_slice(ents); - } - - /// Applies a snapshot to the storage. - /// - /// This method replaces the entire storage state with the given snapshot. - /// All log entries covered by the snapshot (entries with index <= snapshot.metadata.index) - /// are removed. The hard state and configuration state are updated from the snapshot metadata. 
- /// - /// # Arguments - /// - /// * `snapshot` - The snapshot to apply - /// - /// # Thread Safety - /// - /// This method acquires write locks on all storage fields. It is safe to call - /// concurrently with other methods, but write operations are serialized. - /// - /// # Examples - /// - /// ``` - /// use seshat_raft::MemStorage; - /// use raft::eraftpb::{Snapshot, ConfState}; - /// - /// let storage = MemStorage::new(); - /// - /// // Create a snapshot - /// let mut snapshot = Snapshot::default(); - /// snapshot.mut_metadata().index = 10; - /// snapshot.mut_metadata().term = 3; - /// snapshot.mut_metadata().conf_state = Some(ConfState { - /// voters: vec![1, 2, 3], - /// ..Default::default() - /// }); - /// snapshot.data = vec![1, 2, 3, 4, 5]; - /// - /// // Apply snapshot - /// storage.apply_snapshot(snapshot.clone()).unwrap(); - /// - /// // Verify snapshot was applied - /// let retrieved = storage.snapshot(0).unwrap(); - /// assert_eq!(retrieved.get_metadata().index, 10); - /// assert_eq!(retrieved.get_metadata().term, 3); - /// ``` - /// - /// # Errors - /// - /// Returns an error if: - /// - Lock acquisition fails (lock poisoning) - pub fn apply_snapshot(&self, snapshot: Snapshot) -> raft::Result<()> { - // Get snapshot index and term for updating hard_state - let snap_index = snapshot.get_metadata().index; - let snap_term = snapshot.get_metadata().term; - - // Acquire write locks in consistent order to prevent deadlocks - // Lock ordering: snapshot → entries → hard_state → conf_state (documented to prevent deadlocks) - let mut storage_snapshot = self - .snapshot - .write() - .expect("Snapshot lock poisoned - indicates bug in concurrent access"); - let mut entries = self - .entries - .write() - .expect("Entries lock poisoned - indicates bug in concurrent access"); - let mut hard_state = self - .hard_state - .write() - .expect("Hard state lock poisoned - indicates bug in concurrent access"); - let mut conf_state = self - .conf_state - .write() - 
.expect("Conf state lock poisoned - indicates bug in concurrent access"); - - // Replace snapshot - *storage_snapshot = snapshot.clone(); - - // Remove entries covered by the snapshot - // Keep only entries with index > snapshot.metadata.index - entries.retain(|entry| entry.index > snap_index); - - // Update hard_state commit to at least snapshot index - if hard_state.commit < snap_index { - hard_state.commit = snap_index; - } - // Update term if snapshot term is higher - if hard_state.term < snap_term { - hard_state.term = snap_term; - } - - // Update conf_state from snapshot metadata - if let Some(ref cs) = snapshot.get_metadata().conf_state { - *conf_state = cs.clone(); - } - - Ok(()) - } - - /// Appends entries to the log with proper conflict resolution. - /// - /// This method implements the Raft log append logic with truncation of conflicting - /// entries. If an incoming entry has the same index as an existing entry but a - /// different term, all entries from that point onwards are removed before appending - /// the new entries. - /// - /// # Arguments - /// - /// * `entries` - Slice of entries to append - /// - /// # Thread Safety - /// - /// This method acquires a write lock on the entries field. Multiple concurrent - /// calls are serialized. 
- /// - /// # Examples - /// - /// ``` - /// use seshat_raft::MemStorage; - /// use raft::eraftpb::Entry; - /// - /// let storage = MemStorage::new(); - /// - /// // Append initial entries - /// let entries1 = vec![ - /// Entry { index: 1, term: 1, ..Default::default() }, - /// Entry { index: 2, term: 1, ..Default::default() }, - /// Entry { index: 3, term: 1, ..Default::default() }, - /// ]; - /// storage.wl_append_entries(&entries1).unwrap(); - /// assert_eq!(storage.last_index().unwrap(), 3); - /// - /// // Append conflicting entries (will truncate from index 2) - /// let entries2 = vec![ - /// Entry { index: 2, term: 2, ..Default::default() }, - /// Entry { index: 3, term: 2, ..Default::default() }, - /// ]; - /// storage.wl_append_entries(&entries2).unwrap(); - /// assert_eq!(storage.last_index().unwrap(), 3); - /// assert_eq!(storage.term(2).unwrap(), 2); - /// assert_eq!(storage.term(3).unwrap(), 2); - /// ``` - /// - /// # Errors - /// - /// Returns an error if: - /// - Lock acquisition fails (lock poisoning) - pub fn wl_append_entries(&self, entries: &[Entry]) -> raft::Result<()> { - // Empty entries slice is valid - just return - if entries.is_empty() { - return Ok(()); - } - - // Acquire write lock on entries - let mut storage_entries = self - .entries - .write() - .expect("Entries lock poisoned - indicates bug in concurrent access"); - - // If storage is empty, just append all entries - if storage_entries.is_empty() { - storage_entries.extend_from_slice(entries); - return Ok(()); - } - - // Find the first conflicting entry - let first_new_index = entries[0].index; - let storage_offset = storage_entries[0].index; - - // If new entries start after our log, just append - // Note: storage_entries is guaranteed non-empty by check above - if first_new_index - > storage_entries - .last() - .expect("Storage entries non-empty - checked above") - .index - { - storage_entries.extend_from_slice(entries); - return Ok(()); - } - - // If new entries start before our 
log, we need to handle overlap - if first_new_index < storage_offset { - // New entries start before our log - this shouldn't happen normally - // but we'll handle it by clearing everything and appending - storage_entries.clear(); - storage_entries.extend_from_slice(entries); - return Ok(()); - } - - // Find conflict point - for (i, entry) in entries.iter().enumerate() { - let storage_idx = (entry.index - storage_offset) as usize; - - // If this entry is beyond our current log, append remaining entries - if storage_idx >= storage_entries.len() { - storage_entries.extend_from_slice(&entries[i..]); - return Ok(()); - } - - // Check for conflict - if storage_entries[storage_idx].term != entry.term { - // Found conflict - truncate from this point and append new entries - storage_entries.truncate(storage_idx); - storage_entries.extend_from_slice(&entries[i..]); - return Ok(()); - } - - // Terms match - this entry is already in the log, continue checking - } - - Ok(()) - } -} - -impl Default for MemStorage { - fn default() -> Self { - Self::new() - } -} - -impl raft::Storage for MemStorage { - fn initial_state(&self) -> raft::Result { - self.initial_state() - } - - fn entries( - &self, - low: u64, - high: u64, - max_size: impl Into>, - _context: raft::GetEntriesContext, - ) -> raft::Result> { - self.entries(low, high, max_size.into()) - } - - fn term(&self, idx: u64) -> raft::Result { - self.term(idx) - } - - fn first_index(&self) -> raft::Result { - self.first_index() - } - - fn last_index(&self) -> raft::Result { - self.last_index() - } - - fn snapshot(&self, request_index: u64, _to: u64) -> raft::Result { - self.snapshot(request_index) - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::sync::Arc; - use std::thread; - - #[test] - fn test_mem_storage_new_creates_successfully() { - let storage = MemStorage::new(); - - // Verify storage was created without panicking - // We can't directly access the fields since they're private, - // but we can verify the 
storage exists - let _debug_output = format!("{storage:?}"); - } - - #[test] - fn test_mem_storage_default_creates_successfully() { - let storage = MemStorage::default(); - - // Verify default() works the same as new() - let _debug_output = format!("{storage:?}"); - } - - #[test] - fn test_mem_storage_has_default_hard_state() { - let storage = MemStorage::new(); - - // Access hard_state to verify it's initialized - let hard_state = storage.hard_state.read().unwrap(); - assert_eq!(hard_state.term, 0, "Initial term should be 0"); - assert_eq!(hard_state.vote, 0, "Initial vote should be 0"); - assert_eq!(hard_state.commit, 0, "Initial commit should be 0"); - } - - #[test] - fn test_mem_storage_has_default_conf_state() { - let storage = MemStorage::new(); - - // Access conf_state to verify it's initialized - let conf_state = storage.conf_state.read().unwrap(); - assert!( - conf_state.voters.is_empty(), - "Initial voters should be empty" - ); - assert!( - conf_state.learners.is_empty(), - "Initial learners should be empty" - ); - } - - #[test] - fn test_mem_storage_has_empty_entries() { - let storage = MemStorage::new(); - - // Access entries to verify it's an empty vector - let entries = storage.entries.read().unwrap(); - assert!(entries.is_empty(), "Initial entries should be empty"); - assert_eq!(entries.len(), 0, "Initial entries length should be 0"); - } - - #[test] - fn test_mem_storage_has_default_snapshot() { - let storage = MemStorage::new(); - - // Access snapshot to verify it's initialized - let snapshot = storage.snapshot.read().unwrap(); - assert!( - snapshot.data.is_empty(), - "Initial snapshot data should be empty" - ); - } - - #[test] - fn test_mem_storage_fields_are_thread_safe() { - let storage = MemStorage::new(); - - // Verify we can get read locks on all fields - let _hard_state = storage.hard_state.read().unwrap(); - let _conf_state = storage.conf_state.read().unwrap(); - let _entries = storage.entries.read().unwrap(); - let _snapshot = 
storage.snapshot.read().unwrap(); - - // All locks should be released when the guards go out of scope - } - - #[test] - fn test_mem_storage_multiple_readers() { - let storage = MemStorage::new(); - - // Verify multiple readers can access simultaneously - let _lock1 = storage.hard_state.read().unwrap(); - let _lock2 = storage.hard_state.read().unwrap(); - let _lock3 = storage.hard_state.read().unwrap(); - - // All read locks should coexist - } - - #[test] - fn test_mem_storage_write_lock() { - let storage = MemStorage::new(); - - // Verify we can get write locks - { - let mut hard_state = storage.hard_state.write().unwrap(); - hard_state.term = 1; - } - - // Verify the write persisted - let hard_state = storage.hard_state.read().unwrap(); - assert_eq!(hard_state.term, 1); - } - - #[test] - fn test_mem_storage_is_send() { - fn assert_send() {} - assert_send::(); - } - - #[test] - fn test_mem_storage_is_sync() { - fn assert_sync() {} - assert_sync::(); - } - - #[test] - fn test_mem_storage_can_be_used_across_threads() { - let storage = Arc::new(MemStorage::new()); - let storage_clone = Arc::clone(&storage); - - let handle = thread::spawn(move || { - let hard_state = storage_clone.hard_state.read().unwrap(); - assert_eq!(hard_state.term, 0); - }); - - handle.join().unwrap(); - } - - #[test] - fn test_mem_storage_independent_instances() { - let storage1 = MemStorage::new(); - let storage2 = MemStorage::new(); - - // Modify storage1 - { - let mut hard_state = storage1.hard_state.write().unwrap(); - hard_state.term = 5; - } - - // Verify storage2 is unaffected - let hard_state2 = storage2.hard_state.read().unwrap(); - assert_eq!(hard_state2.term, 0); - } - - // ============================================================================ - // Tests for initial_state() method - // ============================================================================ - - #[test] - fn test_initial_state_returns_defaults() { - let storage = MemStorage::new(); - - let state = storage - 
.initial_state() - .expect("initial_state should succeed"); - - // Verify default HardState - assert_eq!(state.hard_state.term, 0, "Default term should be 0"); - assert_eq!(state.hard_state.vote, 0, "Default vote should be 0"); - assert_eq!(state.hard_state.commit, 0, "Default commit should be 0"); - - // Verify default ConfState - assert!( - state.conf_state.voters.is_empty(), - "Default voters should be empty" - ); - assert!( - state.conf_state.learners.is_empty(), - "Default learners should be empty" - ); - } - - #[test] - fn test_initial_state_reflects_hard_state_changes() { - let storage = MemStorage::new(); - - // Modify hard_state - let new_hard_state = HardState { - term: 10, - vote: 3, - commit: 25, - }; - storage.set_hard_state(new_hard_state); - - // Verify initial_state reflects the change - let state = storage - .initial_state() - .expect("initial_state should succeed"); - assert_eq!(state.hard_state.term, 10, "Term should be updated to 10"); - assert_eq!(state.hard_state.vote, 3, "Vote should be updated to 3"); - assert_eq!( - state.hard_state.commit, 25, - "Commit should be updated to 25" - ); - } - - #[test] - fn test_initial_state_reflects_conf_state_changes() { - let storage = MemStorage::new(); - - // Modify conf_state - let new_conf_state = ConfState { - voters: vec![1, 2, 3], - learners: vec![4, 5], - ..Default::default() - }; - storage.set_conf_state(new_conf_state); - - // Verify initial_state reflects the change - let state = storage - .initial_state() - .expect("initial_state should succeed"); - assert_eq!( - state.conf_state.voters, - vec![1, 2, 3], - "Voters should be updated" - ); - assert_eq!( - state.conf_state.learners, - vec![4, 5], - "Learners should be updated" - ); - } - - #[test] - fn test_initial_state_is_thread_safe() { - let storage = Arc::new(MemStorage::new()); - - // Set initial values - let hs = HardState { - term: 5, - vote: 2, - commit: 10, - }; - storage.set_hard_state(hs); - - let cs = ConfState { - voters: vec![1, 2, 
3], - ..Default::default() - }; - storage.set_conf_state(cs); - - // Spawn multiple threads calling initial_state - let handles: Vec<_> = (0..10) - .map(|_| { - let storage_clone = Arc::clone(&storage); - thread::spawn(move || { - let state = storage_clone - .initial_state() - .expect("initial_state should succeed"); - assert_eq!(state.hard_state.term, 5); - assert_eq!(state.hard_state.vote, 2); - assert_eq!(state.hard_state.commit, 10); - assert_eq!(state.conf_state.voters, vec![1, 2, 3]); - }) - }) - .collect(); - - // Wait for all threads to complete - for handle in handles { - handle.join().expect("Thread should not panic"); - } - } - - #[test] - fn test_initial_state_returns_cloned_data() { - let storage = MemStorage::new(); - - // Get initial state - let state1 = storage - .initial_state() - .expect("initial_state should succeed"); - - // Modify storage - let new_hard_state = HardState { - term: 100, - ..Default::default() - }; - storage.set_hard_state(new_hard_state); - - // Get initial state again - let state2 = storage - .initial_state() - .expect("initial_state should succeed"); - - // Verify state1 is independent of the change - assert_eq!( - state1.hard_state.term, 0, - "First state should not be affected by later changes" - ); - assert_eq!( - state2.hard_state.term, 100, - "Second state should reflect the change" - ); - } - - #[test] - fn test_initial_state_multiple_calls_are_consistent() { - let storage = MemStorage::new(); - - // Set specific values - let hs = HardState { - term: 42, - vote: 7, - commit: 99, - }; - storage.set_hard_state(hs); - - // Call initial_state multiple times - for _ in 0..100 { - let state = storage - .initial_state() - .expect("initial_state should succeed"); - assert_eq!(state.hard_state.term, 42); - assert_eq!(state.hard_state.vote, 7); - assert_eq!(state.hard_state.commit, 99); - } - } - - #[test] - fn test_set_hard_state_updates_storage() { - let storage = MemStorage::new(); - - // Create and set a new hard state - let 
hs = HardState { - term: 15, - vote: 8, - commit: 50, - }; - storage.set_hard_state(hs); - - // Verify the update by reading directly - let stored_hs = storage.hard_state.read().unwrap(); - assert_eq!(stored_hs.term, 15); - assert_eq!(stored_hs.vote, 8); - assert_eq!(stored_hs.commit, 50); - } - - #[test] - fn test_set_conf_state_updates_storage() { - let storage = MemStorage::new(); - - // Create and set a new conf state - let cs = ConfState { - voters: vec![10, 20, 30], - learners: vec![40], - ..Default::default() - }; - storage.set_conf_state(cs); - - // Verify the update by reading directly - let stored_cs = storage.conf_state.read().unwrap(); - assert_eq!(stored_cs.voters, vec![10, 20, 30]); - assert_eq!(stored_cs.learners, vec![40]); - } - - #[test] - fn test_initial_state_with_empty_conf_state() { - let storage = MemStorage::new(); - - // Set only hard state, leave conf state empty - let hs = HardState { - term: 1, - ..Default::default() - }; - storage.set_hard_state(hs); - - let state = storage - .initial_state() - .expect("initial_state should succeed"); - assert_eq!(state.hard_state.term, 1); - assert!(state.conf_state.voters.is_empty()); - assert!(state.conf_state.learners.is_empty()); - } - - #[test] - fn test_initial_state_with_complex_conf_state() { - let storage = MemStorage::new(); - - // Create a complex configuration - let cs = ConfState { - voters: vec![1, 2, 3, 4, 5], - learners: vec![6, 7], - voters_outgoing: vec![1, 2, 3], // During configuration change - learners_next: vec![8], // Learners being added - auto_leave: true, - }; - storage.set_conf_state(cs.clone()); - - let state = storage - .initial_state() - .expect("initial_state should succeed"); - assert_eq!(state.conf_state.voters, cs.voters); - assert_eq!(state.conf_state.learners, cs.learners); - assert_eq!(state.conf_state.voters_outgoing, cs.voters_outgoing); - assert_eq!(state.conf_state.learners_next, cs.learners_next); - assert_eq!(state.conf_state.auto_leave, cs.auto_leave); - } - 
- // ============================================================================ - // Tests for entries() method - // ============================================================================ - - #[test] - fn test_entries_empty_range_returns_empty_vec() { - let storage = MemStorage::new(); - - // Query with low == high should return empty vector - let result = storage.entries(1, 1, None); - assert!(result.is_ok(), "Empty range should succeed"); - assert_eq!( - result.unwrap().len(), - 0, - "Empty range should return no entries" - ); - } - - #[test] - fn test_entries_empty_range_on_populated_storage() { - let storage = MemStorage::new(); - - // Add some entries - let entries = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 1, - ..Default::default() - }, - ]; - storage.append(&entries); - - // Query with low == high should still return empty - let result = storage.entries(2, 2, None); - assert!(result.is_ok(), "Empty range should succeed"); - assert_eq!( - result.unwrap().len(), - 0, - "Empty range should return no entries" - ); - } - - #[test] - fn test_entries_normal_range_returns_correct_entries() { - let storage = MemStorage::new(); - - // Add entries with indices 1, 2, 3, 4, 5 - let entries = vec![ - Entry { - index: 1, - term: 1, - data: vec![1], - ..Default::default() - }, - Entry { - index: 2, - term: 1, - data: vec![2], - ..Default::default() - }, - Entry { - index: 3, - term: 2, - data: vec![3], - ..Default::default() - }, - Entry { - index: 4, - term: 2, - data: vec![4], - ..Default::default() - }, - Entry { - index: 5, - term: 3, - data: vec![5], - ..Default::default() - }, - ]; - storage.append(&entries); - - // Query range [2, 5) should return entries 2, 3, 4 - let result = storage.entries(2, 5, None); - assert!(result.is_ok(), "Valid range should succeed"); - - let returned = result.unwrap(); - assert_eq!(returned.len(), 3, "Should return 
3 entries"); - assert_eq!(returned[0].index, 2, "First entry should have index 2"); - assert_eq!(returned[1].index, 3, "Second entry should have index 3"); - assert_eq!(returned[2].index, 4, "Third entry should have index 4"); - assert_eq!(returned[0].data, vec![2], "First entry data should match"); - assert_eq!(returned[1].data, vec![3], "Second entry data should match"); - assert_eq!(returned[2].data, vec![4], "Third entry data should match"); - } - - #[test] - fn test_entries_single_entry_range() { - let storage = MemStorage::new(); - - let entries = vec![ - Entry { - index: 1, - term: 1, - data: vec![1], - ..Default::default() - }, - Entry { - index: 2, - term: 1, - data: vec![2], - ..Default::default() - }, - Entry { - index: 3, - term: 2, - data: vec![3], - ..Default::default() - }, - ]; - storage.append(&entries); - - // Query single entry [2, 3) - let result = storage.entries(2, 3, None); - assert!(result.is_ok(), "Single entry range should succeed"); - - let returned = result.unwrap(); - assert_eq!(returned.len(), 1, "Should return 1 entry"); - assert_eq!(returned[0].index, 2, "Entry should have index 2"); - assert_eq!(returned[0].data, vec![2], "Entry data should match"); - } - - #[test] - fn test_entries_full_range() { - let storage = MemStorage::new(); - - let entries = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 2, - ..Default::default() - }, - ]; - storage.append(&entries); - - // Query all entries [1, 4) - let result = storage.entries(1, 4, None); - assert!(result.is_ok(), "Full range should succeed"); - - let returned = result.unwrap(); - assert_eq!(returned.len(), 3, "Should return all 3 entries"); - assert_eq!(returned[0].index, 1); - assert_eq!(returned[1].index, 2); - assert_eq!(returned[2].index, 3); - } - - #[test] - fn test_entries_with_max_size_returns_partial_results() { - let storage = MemStorage::new(); - - // Create entries 
with specific sizes - // Each entry has some overhead, so we'll use data to control size - let entries = vec![ - Entry { - index: 1, - term: 1, - data: vec![0; 100], - ..Default::default() - }, - Entry { - index: 2, - term: 1, - data: vec![0; 100], - ..Default::default() - }, - Entry { - index: 3, - term: 2, - data: vec![0; 100], - ..Default::default() - }, - Entry { - index: 4, - term: 2, - data: vec![0; 100], - ..Default::default() - }, - ]; - storage.append(&entries); - - // Request range [1, 5) with size limit that fits only first 2 entries - // Each entry is roughly 100+ bytes, so max_size of 250 should get us 2 entries - let result = storage.entries(1, 5, Some(250)); - assert!(result.is_ok(), "Size-limited query should succeed"); - - let returned = result.unwrap(); - assert!( - !returned.is_empty() && returned.len() < 4, - "Should return partial results (got {} entries)", - returned.len() - ); - assert_eq!(returned[0].index, 1, "First entry should have index 1"); - } - - #[test] - fn test_entries_with_max_size_returns_at_least_one_entry() { - let storage = MemStorage::new(); - - // Create entry larger than max_size - let entries = vec![ - Entry { - index: 1, - term: 1, - data: vec![0; 1000], - ..Default::default() - }, - Entry { - index: 2, - term: 1, - data: vec![0; 1000], - ..Default::default() - }, - ]; - storage.append(&entries); - - // Request with very small max_size - should still return at least first entry - let result = storage.entries(1, 3, Some(10)); - assert!(result.is_ok(), "Should succeed even with small max_size"); - - let returned = result.unwrap(); - assert_eq!(returned.len(), 1, "Should return at least one entry"); - assert_eq!(returned[0].index, 1, "Should return first entry"); - } - - #[test] - fn test_entries_error_when_low_less_than_first_index() { - let storage = MemStorage::new(); - - // Create a snapshot at index 5 - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 5; - snapshot.mut_metadata().term = 2; - 
*storage.snapshot.write().unwrap() = snapshot; - - // Add entries starting from index 6 - let entries = vec![ - Entry { - index: 6, - term: 2, - ..Default::default() - }, - Entry { - index: 7, - term: 3, - ..Default::default() - }, - ]; - storage.append(&entries); - - // first_index() should be 6 (snapshot.index + 1) - // Requesting entries before that should fail - let result = storage.entries(4, 7, None); - assert!(result.is_err(), "Should error when low < first_index"); - - match result.unwrap_err() { - raft::Error::Store(StorageError::Compacted) => { - // Expected error - } - other => panic!("Expected StorageError::Compacted, got {other:?}"), - } - } - - #[test] - fn test_entries_error_when_high_greater_than_last_index_plus_one() { - let storage = MemStorage::new(); - - let entries = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 2, - ..Default::default() - }, - ]; - storage.append(&entries); - - // last_index() is 3, so high can be at most 4 (last_index + 1) - // Requesting high > 4 should fail - let result = storage.entries(1, 5, None); - assert!(result.is_err(), "Should error when high > last_index + 1"); - - match result.unwrap_err() { - raft::Error::Store(StorageError::Unavailable) => { - // Expected error - } - other => panic!("Expected StorageError::Unavailable, got {other:?}"), - } - } - - #[test] - fn test_entries_boundary_at_last_index_plus_one() { - let storage = MemStorage::new(); - - let entries = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 2, - ..Default::default() - }, - ]; - storage.append(&entries); - - // last_index() is 3, so high = 4 (last_index + 1) should be valid - let result = storage.entries(1, 4, None); - assert!(result.is_ok(), "high = last_index + 1 should be valid"); - - let returned = result.unwrap(); - 
assert_eq!(returned.len(), 3, "Should return all entries"); - } - - #[test] - fn test_entries_on_empty_storage() { - let storage = MemStorage::new(); - - // Empty storage: first_index = 1, last_index = 0 - // Valid range should be [1, 1) which returns empty - let result = storage.entries(1, 1, None); - assert!( - result.is_ok(), - "Empty range on empty storage should succeed" - ); - assert_eq!(result.unwrap().len(), 0); - - // Any request with high > 1 should fail (unavailable) - let result = storage.entries(1, 2, None); - assert!( - result.is_err(), - "Should error when requesting unavailable entries" - ); - - match result.unwrap_err() { - raft::Error::Store(StorageError::Unavailable) => { - // Expected - } - other => panic!("Expected StorageError::Unavailable, got {other:?}"), - } - } - - #[test] - fn test_entries_thread_safe() { - let storage = Arc::new(MemStorage::new()); - - // Populate storage - let entries = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 2, - ..Default::default() - }, - Entry { - index: 4, - term: 2, - ..Default::default() - }, - Entry { - index: 5, - term: 3, - ..Default::default() - }, - ]; - storage.append(&entries); - - // Spawn multiple threads reading concurrently - let handles: Vec<_> = (0..10) - .map(|_| { - let storage_clone = Arc::clone(&storage); - thread::spawn(move || { - let result = storage_clone.entries(2, 4, None); - assert!(result.is_ok()); - let returned = result.unwrap(); - assert_eq!(returned.len(), 2); - assert_eq!(returned[0].index, 2); - assert_eq!(returned[1].index, 3); - }) - }) - .collect(); - - for handle in handles { - handle.join().expect("Thread should not panic"); - } - } - - // ============================================================================ - // Tests for term() method - // ============================================================================ - - #[test] - fn 
test_term_index_zero_returns_zero() { - let storage = MemStorage::new(); - - // Index 0 should always return term 0 - let result = storage.term(0); - assert!(result.is_ok(), "term(0) should succeed"); - assert_eq!(result.unwrap(), 0, "term(0) should return 0"); - } - - #[test] - fn test_term_for_valid_indices_in_log() { - let storage = MemStorage::new(); - - // Add entries with different terms - let entries = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 2, - ..Default::default() - }, - Entry { - index: 4, - term: 3, - ..Default::default() - }, - Entry { - index: 5, - term: 3, - ..Default::default() - }, - ]; - storage.append(&entries); - - // Test term for each entry - assert_eq!(storage.term(1).unwrap(), 1, "Entry 1 should have term 1"); - assert_eq!(storage.term(2).unwrap(), 1, "Entry 2 should have term 1"); - assert_eq!(storage.term(3).unwrap(), 2, "Entry 3 should have term 2"); - assert_eq!(storage.term(4).unwrap(), 3, "Entry 4 should have term 3"); - assert_eq!(storage.term(5).unwrap(), 3, "Entry 5 should have term 3"); - } - - #[test] - fn test_term_for_snapshot_index() { - let storage = MemStorage::new(); - - // Create a snapshot at index 5 with term 2 - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 5; - snapshot.mut_metadata().term = 2; - *storage.snapshot.write().unwrap() = snapshot; - - // Add entries starting from index 6 - let entries = vec![ - Entry { - index: 6, - term: 2, - ..Default::default() - }, - Entry { - index: 7, - term: 3, - ..Default::default() - }, - ]; - storage.append(&entries); - - // Query term for snapshot index should return snapshot term - let result = storage.term(5); - assert!(result.is_ok(), "term(snapshot_index) should succeed"); - assert_eq!(result.unwrap(), 2, "Should return snapshot term"); - } - - #[test] - fn test_term_error_for_compacted_index() { - let storage = MemStorage::new(); - 
- // Create a snapshot at index 5 - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 5; - snapshot.mut_metadata().term = 2; - *storage.snapshot.write().unwrap() = snapshot; - - // Add entries starting from index 6 - let entries = vec![ - Entry { - index: 6, - term: 2, - ..Default::default() - }, - Entry { - index: 7, - term: 3, - ..Default::default() - }, - ]; - storage.append(&entries); - - // first_index() should be 6 (snapshot.index + 1) - // Requesting term for index before that should fail - let result = storage.term(4); - assert!(result.is_err(), "Should error for compacted index"); - - match result.unwrap_err() { - raft::Error::Store(StorageError::Compacted) => { - // Expected error - } - other => panic!("Expected StorageError::Compacted, got {other:?}"), - } - } - - #[test] - fn test_term_error_for_unavailable_index() { - let storage = MemStorage::new(); - - let entries = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 2, - ..Default::default() - }, - ]; - storage.append(&entries); - - // last_index() is 3 - // Requesting term for index > 3 should fail - let result = storage.term(4); - assert!(result.is_err(), "Should error for unavailable index"); - - match result.unwrap_err() { - raft::Error::Store(StorageError::Unavailable) => { - // Expected error - } - other => panic!("Expected StorageError::Unavailable, got {other:?}"), - } - } - - #[test] - fn test_term_on_empty_storage() { - let storage = MemStorage::new(); - - // Index 0 should work - assert_eq!(storage.term(0).unwrap(), 0, "term(0) should return 0"); - - // Any positive index should fail with Unavailable - let result = storage.term(1); - assert!(result.is_err(), "Should error for index beyond empty log"); - - match result.unwrap_err() { - raft::Error::Store(StorageError::Unavailable) => { - // Expected - } - other => panic!("Expected StorageError::Unavailable, got 
{other:?}"), - } - } - - #[test] - fn test_term_thread_safety() { - let storage = Arc::new(MemStorage::new()); - - // Populate storage - let entries = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 2, - ..Default::default() - }, - Entry { - index: 3, - term: 2, - ..Default::default() - }, - Entry { - index: 4, - term: 3, - ..Default::default() - }, - Entry { - index: 5, - term: 3, - ..Default::default() - }, - ]; - storage.append(&entries); - - // Spawn multiple threads reading terms concurrently - let handles: Vec<_> = (0..10) - .map(|_| { - let storage_clone = Arc::clone(&storage); - thread::spawn(move || { - assert_eq!(storage_clone.term(0).unwrap(), 0); - assert_eq!(storage_clone.term(1).unwrap(), 1); - assert_eq!(storage_clone.term(2).unwrap(), 2); - assert_eq!(storage_clone.term(3).unwrap(), 2); - assert_eq!(storage_clone.term(4).unwrap(), 3); - assert_eq!(storage_clone.term(5).unwrap(), 3); - }) - }) - .collect(); - - for handle in handles { - handle.join().expect("Thread should not panic"); - } - } - - #[test] - fn test_term_boundary_conditions() { - let storage = MemStorage::new(); - - // Add a single entry - let entries = vec![Entry { - index: 1, - term: 5, - ..Default::default() - }]; - storage.append(&entries); - - // Test boundaries - assert_eq!(storage.term(0).unwrap(), 0, "Index 0 returns 0"); - assert_eq!(storage.term(1).unwrap(), 5, "Index 1 returns correct term"); - - // Index 2 should be unavailable - let result = storage.term(2); - assert!(result.is_err(), "Index beyond last should error"); - match result.unwrap_err() { - raft::Error::Store(StorageError::Unavailable) => { - // Expected - } - other => panic!("Expected StorageError::Unavailable, got {other:?}"), - } - } - - #[test] - fn test_term_with_snapshot_but_no_entries() { - let storage = MemStorage::new(); - - // Create a snapshot at index 10 with term 5 - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 10; - 
snapshot.mut_metadata().term = 5; - *storage.snapshot.write().unwrap() = snapshot; - - // No entries added, only snapshot exists - - // Index 0 should work - assert_eq!(storage.term(0).unwrap(), 0, "Index 0 returns 0"); - - // Snapshot index should return snapshot term - assert_eq!( - storage.term(10).unwrap(), - 5, - "Snapshot index returns snapshot term" - ); - - // Indices before snapshot should be compacted - let result = storage.term(9); - assert!(result.is_err(), "Index before snapshot should be compacted"); - match result.unwrap_err() { - raft::Error::Store(StorageError::Compacted) => { - // Expected - } - other => panic!("Expected StorageError::Compacted, got {other:?}"), - } - - // Indices after snapshot should be unavailable - let result = storage.term(11); - assert!( - result.is_err(), - "Index after snapshot should be unavailable" - ); - match result.unwrap_err() { - raft::Error::Store(StorageError::Unavailable) => { - // Expected - } - other => panic!("Expected StorageError::Unavailable, got {other:?}"), - } - } - - // ============================================================================ - // Tests for first_index() method - // ============================================================================ - - #[test] - fn test_first_index_empty_log() { - let storage = MemStorage::new(); - - // Empty log should return 1 as the default first index - let result = storage.first_index(); - assert!(result.is_ok(), "first_index should succeed on empty log"); - assert_eq!(result.unwrap(), 1, "Empty log should have first_index = 1"); - } - - #[test] - fn test_first_index_after_append() { - let storage = MemStorage::new(); - - // Append entries starting at index 1 - let entries = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 2, - ..Default::default() - }, - ]; - storage.append(&entries); - - let result = storage.first_index(); - 
assert!(result.is_ok(), "first_index should succeed"); - assert_eq!( - result.unwrap(), - 1, - "first_index should be 1 when entries start at 1" - ); - } - - #[test] - fn test_first_index_with_snapshot() { - let storage = MemStorage::new(); - - // Create a snapshot at index 10 - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 10; - snapshot.mut_metadata().term = 3; - *storage.snapshot.write().unwrap() = snapshot; - - // No entries yet, first_index should be snapshot.index + 1 - let result = storage.first_index(); - assert!(result.is_ok(), "first_index should succeed with snapshot"); - assert_eq!( - result.unwrap(), - 11, - "first_index should be snapshot.index + 1" - ); - } - - #[test] - fn test_first_index_with_snapshot_and_entries() { - let storage = MemStorage::new(); - - // Create a snapshot at index 10 - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 10; - snapshot.mut_metadata().term = 3; - *storage.snapshot.write().unwrap() = snapshot; - - // Add entries starting from index 11 - let entries = vec![ - Entry { - index: 11, - term: 3, - ..Default::default() - }, - Entry { - index: 12, - term: 3, - ..Default::default() - }, - Entry { - index: 13, - term: 4, - ..Default::default() - }, - ]; - storage.append(&entries); - - // first_index should still be snapshot.index + 1 - let result = storage.first_index(); - assert!(result.is_ok(), "first_index should succeed"); - assert_eq!( - result.unwrap(), - 11, - "first_index should be snapshot.index + 1 even with entries" - ); - } - - #[test] - fn test_first_index_after_compaction() { - let storage = MemStorage::new(); - - // Simulate log compaction by: - // 1. Creating a snapshot at index 50 - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 50; - snapshot.mut_metadata().term = 10; - *storage.snapshot.write().unwrap() = snapshot; - - // 2. 
Adding new entries after the snapshot - let entries = vec![ - Entry { - index: 51, - term: 10, - ..Default::default() - }, - Entry { - index: 52, - term: 11, - ..Default::default() - }, - ]; - storage.append(&entries); - - let result = storage.first_index(); - assert!( - result.is_ok(), - "first_index should succeed after compaction" - ); - assert_eq!( - result.unwrap(), - 51, - "first_index should be 51 after compaction at index 50" - ); - } - - #[test] - fn test_first_index_with_entries_not_starting_at_one() { - let storage = MemStorage::new(); - - // Directly append entries that don't start at index 1 - // (simulating entries after compaction) - let entries = vec![ - Entry { - index: 20, - term: 5, - ..Default::default() - }, - Entry { - index: 21, - term: 5, - ..Default::default() - }, - ]; - storage.append(&entries); - - // Without a snapshot, first_index should return the first entry's index - let result = storage.first_index(); - assert!(result.is_ok(), "first_index should succeed"); - assert_eq!( - result.unwrap(), - 20, - "first_index should match first entry index" - ); - } - - // ============================================================================ - // Tests for last_index() method - // ============================================================================ - - #[test] - fn test_last_index_empty_log() { - let storage = MemStorage::new(); - - // Empty log should return 0 as the last index - let result = storage.last_index(); - assert!(result.is_ok(), "last_index should succeed on empty log"); - assert_eq!(result.unwrap(), 0, "Empty log should have last_index = 0"); - } - - #[test] - fn test_last_index_after_append() { - let storage = MemStorage::new(); - - // Append entries - let entries = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 2, - ..Default::default() - }, - ]; - storage.append(&entries); - - let result = 
storage.last_index(); - assert!(result.is_ok(), "last_index should succeed"); - assert_eq!( - result.unwrap(), - 3, - "last_index should be the index of the last entry" - ); - } - - #[test] - fn test_last_index_snapshot_only() { - let storage = MemStorage::new(); - - // Create a snapshot at index 10, no entries - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 10; - snapshot.mut_metadata().term = 3; - *storage.snapshot.write().unwrap() = snapshot; - - // With no entries, last_index should return snapshot.index - let result = storage.last_index(); - assert!( - result.is_ok(), - "last_index should succeed with snapshot only" - ); - assert_eq!( - result.unwrap(), - 10, - "last_index should be snapshot.index when no entries exist" - ); - } - - #[test] - fn test_last_index_with_snapshot_and_entries() { - let storage = MemStorage::new(); - - // Create a snapshot at index 10 - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 10; - snapshot.mut_metadata().term = 3; - *storage.snapshot.write().unwrap() = snapshot; - - // Add entries after the snapshot - let entries = vec![ - Entry { - index: 11, - term: 3, - ..Default::default() - }, - Entry { - index: 12, - term: 3, - ..Default::default() - }, - Entry { - index: 13, - term: 4, - ..Default::default() - }, - ]; - storage.append(&entries); - - // last_index should return the last entry's index, not the snapshot - let result = storage.last_index(); - assert!(result.is_ok(), "last_index should succeed"); - assert_eq!( - result.unwrap(), - 13, - "last_index should be the last entry index, not snapshot index" - ); - } - - #[test] - fn test_last_index_after_multiple_appends() { - let storage = MemStorage::new(); - - // First append - let entries1 = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - ]; - storage.append(&entries1); - - assert_eq!( - storage.last_index().unwrap(), - 2, - "After first 
append, last_index should be 2" - ); - - // Second append - let entries2 = vec![ - Entry { - index: 3, - term: 2, - ..Default::default() - }, - Entry { - index: 4, - term: 2, - ..Default::default() - }, - Entry { - index: 5, - term: 3, - ..Default::default() - }, - ]; - storage.append(&entries2); - - assert_eq!( - storage.last_index().unwrap(), - 5, - "After second append, last_index should be 5" - ); - } - - #[test] - fn test_last_index_single_entry() { - let storage = MemStorage::new(); - - // Append a single entry - let entries = vec![Entry { - index: 1, - term: 1, - ..Default::default() - }]; - storage.append(&entries); - - let result = storage.last_index(); - assert!( - result.is_ok(), - "last_index should succeed with single entry" - ); - assert_eq!( - result.unwrap(), - 1, - "last_index should be 1 for single entry" - ); - } - - // ============================================================================ - // Tests for first_index() and last_index() invariants - // ============================================================================ - - #[test] - fn test_first_last_index_invariant() { - // Test the invariant: first_index <= last_index + 1 - // This should hold in all valid states - - let storage = MemStorage::new(); - - // Case 1: Empty log - let first = storage.first_index().unwrap(); - let last = storage.last_index().unwrap(); - assert!( - first <= last + 1, - "Empty log: first_index ({first}) <= last_index ({last}) + 1" - ); - - // Case 2: After appending entries - let entries = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 2, - ..Default::default() - }, - ]; - storage.append(&entries); - - let first = storage.first_index().unwrap(); - let last = storage.last_index().unwrap(); - assert!( - first <= last + 1, - "With entries: first_index ({first}) <= last_index ({last}) + 1" - ); - - // Case 3: With snapshot (need to clear old entries 
to simulate proper compaction) - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 10; - snapshot.mut_metadata().term = 3; - *storage.snapshot.write().unwrap() = snapshot; - // Clear old entries that are covered by the snapshot - storage.entries.write().unwrap().clear(); - - let first = storage.first_index().unwrap(); - let last = storage.last_index().unwrap(); - assert!( - first <= last + 1, - "With snapshot: first_index ({first}) <= last_index ({last}) + 1" - ); - - // Case 4: With snapshot and new entries - let entries = vec![ - Entry { - index: 11, - term: 3, - ..Default::default() - }, - Entry { - index: 12, - term: 4, - ..Default::default() - }, - ]; - storage.append(&entries); - - let first = storage.first_index().unwrap(); - let last = storage.last_index().unwrap(); - assert!( - first <= last + 1, - "With snapshot and entries: first_index ({first}) <= last_index ({last}) + 1" - ); - } - - #[test] - fn test_first_last_index_boundaries() { - let storage = MemStorage::new(); - - // Empty log special case - assert_eq!(storage.first_index().unwrap(), 1); - assert_eq!(storage.last_index().unwrap(), 0); - // This is the one case where first > last, but first <= last + 1 still holds - - // Single entry - storage.append(&[Entry { - index: 1, - term: 1, - ..Default::default() - }]); - assert_eq!(storage.first_index().unwrap(), 1); - assert_eq!(storage.last_index().unwrap(), 1); - - // Multiple entries - storage.append(&[ - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 1, - ..Default::default() - }, - ]); - assert_eq!(storage.first_index().unwrap(), 1); - assert_eq!(storage.last_index().unwrap(), 3); - } - - #[test] - fn test_first_last_index_thread_safety() { - let storage = Arc::new(MemStorage::new()); - - // Populate storage - let entries = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 2, 
- ..Default::default() - }, - ]; - storage.append(&entries); - - // Spawn multiple threads reading first_index and last_index concurrently - let handles: Vec<_> = (0..10) - .map(|_| { - let storage_clone = Arc::clone(&storage); - thread::spawn(move || { - let first = storage_clone.first_index().unwrap(); - let last = storage_clone.last_index().unwrap(); - assert_eq!(first, 1, "first_index should be 1"); - assert_eq!(last, 3, "last_index should be 3"); - assert!( - first <= last + 1, - "Invariant should hold: first_index <= last_index + 1" - ); - }) - }) - .collect(); - - for handle in handles { - handle.join().expect("Thread should not panic"); - } - } - - #[test] - fn test_first_last_index_consistency() { - let storage = MemStorage::new(); - - // Test that multiple consecutive calls return the same values - for _ in 0..100 { - let first1 = storage.first_index().unwrap(); - let last1 = storage.last_index().unwrap(); - let first2 = storage.first_index().unwrap(); - let last2 = storage.last_index().unwrap(); - - assert_eq!(first1, first2, "Consecutive first_index calls should match"); - assert_eq!(last1, last2, "Consecutive last_index calls should match"); - } - - // Add entries and test again - let entries = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - ]; - storage.append(&entries); - - for _ in 0..100 { - let first1 = storage.first_index().unwrap(); - let last1 = storage.last_index().unwrap(); - let first2 = storage.first_index().unwrap(); - let last2 = storage.last_index().unwrap(); - - assert_eq!(first1, first2, "Consecutive first_index calls should match"); - assert_eq!(last1, last2, "Consecutive last_index calls should match"); - } - } - - #[test] - fn test_first_last_index_with_large_snapshot() { - let storage = MemStorage::new(); - - // Create a snapshot at a large index - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 1_000_000; - 
snapshot.mut_metadata().term = 100; - *storage.snapshot.write().unwrap() = snapshot; - - let first = storage.first_index().unwrap(); - let last = storage.last_index().unwrap(); - - assert_eq!(first, 1_000_001, "first_index should be snapshot.index + 1"); - assert_eq!(last, 1_000_000, "last_index should be snapshot.index"); - assert!( - first <= last + 1, - "Invariant should hold even with large indices" - ); - } - - #[test] - fn test_first_last_index_multiple_scenarios() { - let storage = MemStorage::new(); - - // Scenario 1: Empty - assert_eq!(storage.first_index().unwrap(), 1); - assert_eq!(storage.last_index().unwrap(), 0); - - // Scenario 2: Add entries - storage.append(&[ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - ]); - assert_eq!(storage.first_index().unwrap(), 1); - assert_eq!(storage.last_index().unwrap(), 2); - - // Scenario 3: Add more entries - storage.append(&[ - Entry { - index: 3, - term: 2, - ..Default::default() - }, - Entry { - index: 4, - term: 2, - ..Default::default() - }, - Entry { - index: 5, - term: 3, - ..Default::default() - }, - ]); - assert_eq!(storage.first_index().unwrap(), 1); - assert_eq!(storage.last_index().unwrap(), 5); - - // Scenario 4: Add snapshot (simulate compaction) - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 3; - snapshot.mut_metadata().term = 2; - *storage.snapshot.write().unwrap() = snapshot; - assert_eq!(storage.first_index().unwrap(), 4); - assert_eq!(storage.last_index().unwrap(), 5); - - // Scenario 5: Add more entries after snapshot - storage.append(&[ - Entry { - index: 6, - term: 3, - ..Default::default() - }, - Entry { - index: 7, - term: 4, - ..Default::default() - }, - ]); - assert_eq!(storage.first_index().unwrap(), 4); - assert_eq!(storage.last_index().unwrap(), 7); - } - - // ============================================================================ - // Tests for snapshot() method - // 
============================================================================ - - #[test] - fn test_snapshot_returns_default_on_new_storage() { - let storage = MemStorage::new(); - - // Empty storage should return default snapshot - let result = storage.snapshot(0); - assert!(result.is_ok(), "snapshot() should succeed on new storage"); - - let snapshot = result.unwrap(); - assert_eq!( - snapshot.get_metadata().index, - 0, - "Default snapshot should have index 0" - ); - assert_eq!( - snapshot.get_metadata().term, - 0, - "Default snapshot should have term 0" - ); - assert!( - snapshot.data.is_empty(), - "Default snapshot should have empty data" - ); - } - - #[test] - fn test_snapshot_returns_stored_snapshot() { - let storage = MemStorage::new(); - - // Create and store a snapshot - let mut snap = Snapshot::default(); - snap.mut_metadata().index = 10; - snap.mut_metadata().term = 3; - snap.data = vec![1, 2, 3, 4, 5]; - *storage.snapshot.write().unwrap() = snap; - - // Retrieve snapshot - let result = storage.snapshot(0); - assert!(result.is_ok(), "snapshot() should succeed"); - - let retrieved = result.unwrap(); - assert_eq!( - retrieved.get_metadata().index, - 10, - "Should return stored snapshot index" - ); - assert_eq!( - retrieved.get_metadata().term, - 3, - "Should return stored snapshot term" - ); - assert_eq!( - retrieved.data, - vec![1, 2, 3, 4, 5], - "Should return stored snapshot data" - ); - } - - #[test] - fn test_snapshot_ignores_request_index_in_phase_1() { - let storage = MemStorage::new(); - - // Store a snapshot at index 10 - let mut snap = Snapshot::default(); - snap.mut_metadata().index = 10; - snap.mut_metadata().term = 3; - *storage.snapshot.write().unwrap() = snap; - - // Request snapshot with different request_index values - // In Phase 1, all should return the same snapshot - let snap0 = storage.snapshot(0).unwrap(); - let snap5 = storage.snapshot(5).unwrap(); - let snap10 = storage.snapshot(10).unwrap(); - let snap100 = 
storage.snapshot(100).unwrap(); - - // All should be identical - assert_eq!(snap0.get_metadata().index, 10); - assert_eq!(snap5.get_metadata().index, 10); - assert_eq!(snap10.get_metadata().index, 10); - assert_eq!(snap100.get_metadata().index, 10); - } - - #[test] - fn test_snapshot_with_metadata() { - let storage = MemStorage::new(); - - // Create snapshot with complex metadata - let mut snap = Snapshot::default(); - snap.mut_metadata().index = 42; - snap.mut_metadata().term = 7; - - // Set configuration in metadata - snap.mut_metadata().conf_state = Some(ConfState { - voters: vec![1, 2, 3], - learners: vec![4, 5], - ..Default::default() - }); - - *storage.snapshot.write().unwrap() = snap; - - // Retrieve and verify - let retrieved = storage.snapshot(0).unwrap(); - assert_eq!(retrieved.get_metadata().index, 42); - assert_eq!(retrieved.get_metadata().term, 7); - assert_eq!( - retrieved.get_metadata().conf_state.as_ref().unwrap().voters, - vec![1, 2, 3] - ); - assert_eq!( - retrieved - .get_metadata() - .conf_state - .as_ref() - .unwrap() - .learners, - vec![4, 5] - ); - } - - #[test] - fn test_snapshot_with_data() { - let storage = MemStorage::new(); - - // Create snapshot with substantial data - let mut snap = Snapshot::default(); - snap.mut_metadata().index = 100; - snap.mut_metadata().term = 10; - snap.data = vec![0; 10_000]; // 10KB of data - *storage.snapshot.write().unwrap() = snap; - - // Retrieve and verify - let retrieved = storage.snapshot(0).unwrap(); - assert_eq!(retrieved.get_metadata().index, 100); - assert_eq!(retrieved.get_metadata().term, 10); - assert_eq!(retrieved.data.len(), 10_000); - assert!(retrieved.data.iter().all(|&b| b == 0)); - } - - #[test] - fn test_snapshot_returns_cloned_data() { - let storage = MemStorage::new(); - - // Store initial snapshot - let mut snap = Snapshot::default(); - snap.mut_metadata().index = 5; - snap.mut_metadata().term = 2; - snap.data = vec![1, 2, 3]; - *storage.snapshot.write().unwrap() = snap; - - // Get 
first snapshot - let snap1 = storage.snapshot(0).unwrap(); - - // Modify storage snapshot - let mut new_snap = Snapshot::default(); - new_snap.mut_metadata().index = 10; - new_snap.mut_metadata().term = 5; - new_snap.data = vec![4, 5, 6]; - *storage.snapshot.write().unwrap() = new_snap; - - // Get second snapshot - let snap2 = storage.snapshot(0).unwrap(); - - // Verify snap1 is unaffected by later changes - assert_eq!( - snap1.get_metadata().index, - 5, - "First snapshot should be unaffected" - ); - assert_eq!( - snap1.get_metadata().term, - 2, - "First snapshot term should be unaffected" - ); - assert_eq!( - snap1.data, - vec![1, 2, 3], - "First snapshot data should be unaffected" - ); - - // Verify snap2 has new values - assert_eq!( - snap2.get_metadata().index, - 10, - "Second snapshot should have new values" - ); - assert_eq!( - snap2.get_metadata().term, - 5, - "Second snapshot should have new term" - ); - assert_eq!( - snap2.data, - vec![4, 5, 6], - "Second snapshot should have new data" - ); - } - - #[test] - fn test_snapshot_is_thread_safe() { - let storage = Arc::new(MemStorage::new()); - - // Store a snapshot - let mut snap = Snapshot::default(); - snap.mut_metadata().index = 20; - snap.mut_metadata().term = 4; - snap.data = vec![10, 20, 30, 40, 50]; - *storage.snapshot.write().unwrap() = snap; - - // Spawn multiple threads reading snapshot concurrently - let handles: Vec<_> = (0..10) - .map(|_| { - let storage_clone = Arc::clone(&storage); - thread::spawn(move || { - // Each thread reads the snapshot 100 times - for request_idx in 0..100 { - let result = storage_clone.snapshot(request_idx); - assert!(result.is_ok(), "snapshot() should succeed"); - - let snapshot = result.unwrap(); - assert_eq!( - snapshot.get_metadata().index, - 20, - "Snapshot index should be consistent" - ); - assert_eq!( - snapshot.get_metadata().term, - 4, - "Snapshot term should be consistent" - ); - assert_eq!( - snapshot.data, - vec![10, 20, 30, 40, 50], - "Snapshot data should 
be consistent" - ); - } - }) - }) - .collect(); - - // Wait for all threads to complete - for handle in handles { - handle.join().expect("Thread should not panic"); - } - } - - // ============================================================================ - // Tests for apply_snapshot() method - // ============================================================================ - - #[test] - fn test_apply_snapshot_replaces_all_state() { - let storage = MemStorage::new(); - - // Add some initial entries - let entries = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 2, - ..Default::default() - }, - ]; - storage.append(&entries); - - // Create a snapshot at index 5 - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 5; - snapshot.mut_metadata().term = 3; - snapshot.mut_metadata().conf_state = Some(ConfState { - voters: vec![1, 2, 3], - ..Default::default() - }); - snapshot.data = vec![10, 20, 30]; - - // Apply snapshot - let result = storage.apply_snapshot(snapshot.clone()); - assert!(result.is_ok(), "apply_snapshot should succeed"); - - // Verify snapshot was stored - let stored_snap = storage.snapshot(0).unwrap(); - assert_eq!(stored_snap.get_metadata().index, 5); - assert_eq!(stored_snap.get_metadata().term, 3); - assert_eq!(stored_snap.data, vec![10, 20, 30]); - - // Verify entries covered by snapshot were removed - let remaining_entries = storage.entries.read().unwrap(); - assert!( - remaining_entries.is_empty(), - "All entries should be removed as they are covered by snapshot" - ); - } - - #[test] - fn test_apply_snapshot_clears_entries_covered_by_snapshot() { - let storage = MemStorage::new(); - - // Add entries 1-10 - let entries: Vec = (1..=10) - .map(|i| Entry { - index: i, - term: 1, - ..Default::default() - }) - .collect(); - storage.append(&entries); - - // Apply snapshot at index 5 - let mut snapshot = Snapshot::default(); - 
snapshot.mut_metadata().index = 5; - snapshot.mut_metadata().term = 2; - - storage.apply_snapshot(snapshot).unwrap(); - - // Only entries 6-10 should remain - let remaining = storage.entries.read().unwrap(); - assert_eq!( - remaining.len(), - 5, - "Only entries after snapshot should remain" - ); - assert_eq!( - remaining[0].index, 6, - "First remaining entry should be index 6" - ); - assert_eq!( - remaining[4].index, 10, - "Last remaining entry should be index 10" - ); - } - - #[test] - fn test_apply_snapshot_updates_hard_state() { - let storage = MemStorage::new(); - - // Set initial hard state - storage.set_hard_state(HardState { - term: 1, - vote: 1, - commit: 2, - }); - - // Apply snapshot with higher term and commit - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 10; - snapshot.mut_metadata().term = 5; - - storage.apply_snapshot(snapshot).unwrap(); - - // Verify hard state was updated - let hard_state = storage.hard_state.read().unwrap(); - assert_eq!( - hard_state.term, 5, - "Term should be updated to snapshot term" - ); - assert_eq!( - hard_state.commit, 10, - "Commit should be updated to snapshot index" - ); - } - - #[test] - fn test_apply_snapshot_preserves_higher_hard_state_values() { - let storage = MemStorage::new(); - - // Set high commit - storage.set_hard_state(HardState { - term: 10, - vote: 1, - commit: 20, - }); - - // Apply snapshot with lower values - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 5; - snapshot.mut_metadata().term = 3; - - storage.apply_snapshot(snapshot).unwrap(); - - // Verify higher values were preserved - let hard_state = storage.hard_state.read().unwrap(); - assert_eq!(hard_state.term, 10, "Higher term should be preserved"); - assert_eq!(hard_state.commit, 20, "Higher commit should be preserved"); - } - - #[test] - fn test_apply_snapshot_updates_conf_state() { - let storage = MemStorage::new(); - - // Set initial conf state - storage.set_conf_state(ConfState { - voters: 
vec![1, 2], - learners: vec![3], - ..Default::default() - }); - - // Apply snapshot with different conf state - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 10; - snapshot.mut_metadata().term = 5; - snapshot.mut_metadata().conf_state = Some(ConfState { - voters: vec![4, 5, 6], - learners: vec![7, 8], - ..Default::default() - }); - - storage.apply_snapshot(snapshot).unwrap(); - - // Verify conf state was updated - let conf_state = storage.conf_state.read().unwrap(); - assert_eq!( - conf_state.voters, - vec![4, 5, 6], - "Voters should be updated from snapshot" - ); - assert_eq!( - conf_state.learners, - vec![7, 8], - "Learners should be updated from snapshot" - ); - } - - #[test] - fn test_apply_snapshot_with_no_conf_state_in_metadata() { - let storage = MemStorage::new(); - - // Set initial conf state - storage.set_conf_state(ConfState { - voters: vec![1, 2, 3], - ..Default::default() - }); - - // Apply snapshot without conf_state in metadata - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 10; - snapshot.mut_metadata().term = 5; - // Don't set conf_state - - storage.apply_snapshot(snapshot).unwrap(); - - // Verify conf state was not changed - let conf_state = storage.conf_state.read().unwrap(); - assert_eq!( - conf_state.voters, - vec![1, 2, 3], - "Conf state should remain unchanged when snapshot has no conf_state" - ); - } - - #[test] - fn test_apply_snapshot_thread_safety() { - let storage = Arc::new(MemStorage::new()); - - // Add initial entries - let entries: Vec = (1..=20) - .map(|i| Entry { - index: i, - term: 1, - ..Default::default() - }) - .collect(); - storage.append(&entries); - - // Create multiple snapshots - let snapshots: Vec = (1..=5) - .map(|i| { - let mut snap = Snapshot::default(); - snap.mut_metadata().index = i * 5; - snap.mut_metadata().term = i; - snap.data = vec![i as u8; 100]; - snap - }) - .collect(); - - // Apply snapshots concurrently (should be serialized by write locks) - let 
handles: Vec<_> = snapshots - .into_iter() - .map(|snap| { - let storage_clone = Arc::clone(&storage); - thread::spawn(move || { - storage_clone.apply_snapshot(snap).unwrap(); - }) - }) - .collect(); - - // Wait for all threads - for handle in handles { - handle.join().expect("Thread should not panic"); - } - - // Verify final state is consistent (one of the snapshots was applied) - let final_snap = storage.snapshot(0).unwrap(); - assert!( - final_snap.get_metadata().index > 0, - "A snapshot should have been applied" - ); - - // Verify entries are consistent with snapshot - let entries = storage.entries.read().unwrap(); - if !entries.is_empty() { - assert!( - entries[0].index > final_snap.get_metadata().index, - "Remaining entries should be after snapshot index" - ); - } - } - - #[test] - fn test_apply_snapshot_empty_log() { - let storage = MemStorage::new(); - - // Apply snapshot on empty log - let mut snapshot = Snapshot::default(); - snapshot.mut_metadata().index = 5; - snapshot.mut_metadata().term = 2; - snapshot.data = vec![1, 2, 3]; - - let result = storage.apply_snapshot(snapshot.clone()); - assert!(result.is_ok(), "apply_snapshot should succeed on empty log"); - - // Verify snapshot was stored - let stored = storage.snapshot(0).unwrap(); - assert_eq!(stored.get_metadata().index, 5); - assert_eq!(stored.get_metadata().term, 2); - assert_eq!(stored.data, vec![1, 2, 3]); - } - - // ============================================================================ - // Tests for wl_append_entries() method - // ============================================================================ - - #[test] - fn test_wl_append_entries_to_empty_log() { - let storage = MemStorage::new(); - - // Append to empty log - let entries = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 2, - ..Default::default() - }, - ]; - - let result = storage.wl_append_entries(&entries); - 
assert!(result.is_ok(), "wl_append_entries should succeed"); - - // Verify entries were appended - assert_eq!(storage.last_index().unwrap(), 3); - let stored = storage.entries.read().unwrap(); - assert_eq!(stored.len(), 3); - assert_eq!(stored[0].index, 1); - assert_eq!(stored[2].index, 3); - } - - #[test] - fn test_wl_append_entries_after_existing_entries() { - let storage = MemStorage::new(); - - // Add initial entries - let entries1 = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - ]; - storage.wl_append_entries(&entries1).unwrap(); - - // Append more entries after existing ones - let entries2 = vec![ - Entry { - index: 3, - term: 2, - ..Default::default() - }, - Entry { - index: 4, - term: 2, - ..Default::default() - }, - ]; - storage.wl_append_entries(&entries2).unwrap(); - - // Verify all entries are present - assert_eq!(storage.last_index().unwrap(), 4); - let stored = storage.entries.read().unwrap(); - assert_eq!(stored.len(), 4); - assert_eq!(stored[0].index, 1); - assert_eq!(stored[3].index, 4); - } - - #[test] - fn test_wl_append_entries_truncates_conflicting_entries() { - let storage = MemStorage::new(); - - // Add initial entries in term 1 - let entries1 = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 1, - ..Default::default() - }, - Entry { - index: 4, - term: 1, - ..Default::default() - }, - ]; - storage.wl_append_entries(&entries1).unwrap(); - - // Append conflicting entries (term 2 starting at index 2) - let entries2 = vec![ - Entry { - index: 2, - term: 2, - ..Default::default() - }, - Entry { - index: 3, - term: 2, - ..Default::default() - }, - ]; - storage.wl_append_entries(&entries2).unwrap(); - - // Verify old entries were truncated and new ones appended - assert_eq!(storage.last_index().unwrap(), 3); - let stored = storage.entries.read().unwrap(); - 
assert_eq!(stored.len(), 3); - assert_eq!(stored[0].index, 1); - assert_eq!(stored[0].term, 1); // First entry unchanged - assert_eq!(stored[1].index, 2); - assert_eq!(stored[1].term, 2); // Replaced with term 2 - assert_eq!(stored[2].index, 3); - assert_eq!(stored[2].term, 2); // Replaced with term 2 - } - - #[test] - fn test_wl_append_entries_no_conflict_when_terms_match() { - let storage = MemStorage::new(); - - // Add initial entries - let entries1 = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 2, - ..Default::default() - }, - ]; - storage.wl_append_entries(&entries1).unwrap(); - - // Append entries with matching terms (should not truncate) - let entries2 = vec![ - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 2, - ..Default::default() - }, - Entry { - index: 4, - term: 2, - ..Default::default() - }, - ]; - storage.wl_append_entries(&entries2).unwrap(); - - // Verify no truncation occurred, new entry was appended - assert_eq!(storage.last_index().unwrap(), 4); - let stored = storage.entries.read().unwrap(); - assert_eq!(stored.len(), 4); - assert_eq!(stored[0].term, 1); - assert_eq!(stored[1].term, 1); - assert_eq!(stored[2].term, 2); - assert_eq!(stored[3].term, 2); - } - - #[test] - fn test_wl_append_entries_empty_slice() { - let storage = MemStorage::new(); - - // Add initial entries - let entries1 = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - ]; - storage.wl_append_entries(&entries1).unwrap(); - - // Append empty slice (should be no-op) - let empty: Vec = vec![]; - let result = storage.wl_append_entries(&empty); - assert!(result.is_ok(), "Empty append should succeed"); - - // Verify nothing changed - assert_eq!(storage.last_index().unwrap(), 2); - let stored = storage.entries.read().unwrap(); - assert_eq!(stored.len(), 
2); - } - - #[test] - fn test_wl_append_entries_before_existing_log() { - let storage = MemStorage::new(); - - // Add entries starting at index 10 - let entries1 = vec![ - Entry { - index: 10, - term: 2, - ..Default::default() - }, - Entry { - index: 11, - term: 2, - ..Default::default() - }, - ]; - storage.wl_append_entries(&entries1).unwrap(); - - // Append entries starting at index 1 (before existing log) - let entries2 = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - ]; - storage.wl_append_entries(&entries2).unwrap(); - - // Should replace entire log - let stored = storage.entries.read().unwrap(); - assert_eq!(stored.len(), 2); - assert_eq!(stored[0].index, 1); - assert_eq!(stored[1].index, 2); - } - - #[test] - fn test_wl_append_entries_thread_safety() { - let storage = Arc::new(MemStorage::new()); - - // Start with some initial entries using the helper method - storage.append(&[ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 1, - ..Default::default() - }, - ]); - - // Spawn multiple threads all appending the same extension - // This tests that concurrent writes are properly serialized by the write lock - let handles: Vec<_> = (0..10) - .map(|_| { - let storage_clone = Arc::clone(&storage); - thread::spawn(move || { - // All threads try to append entries 4 and 5 - let entries = vec![ - Entry { - index: 4, - term: 2, - ..Default::default() - }, - Entry { - index: 5, - term: 2, - ..Default::default() - }, - ]; - storage_clone.wl_append_entries(&entries).unwrap(); - }) - }) - .collect(); - - // Wait for all threads - for handle in handles { - handle.join().expect("Thread should not panic"); - } - - // Verify final state is consistent - should have entries 1-5, no corruption - let stored = storage.entries.read().unwrap(); - assert_eq!(stored.len(), 5, "Should have exactly 5 
entries"); - assert_eq!(stored[0].index, 1); - assert_eq!(stored[3].index, 4); - assert_eq!(stored[4].index, 5); - assert_eq!(stored[3].term, 2); - assert_eq!(stored[4].term, 2); - - // Verify indices are contiguous - for i in 1..stored.len() { - assert_eq!( - stored[i].index, - stored[i - 1].index + 1, - "Indices should be contiguous" - ); - } - } - - #[test] - fn test_wl_append_entries_complex_conflict_resolution() { - let storage = MemStorage::new(); - - // Build log: [1:1, 2:1, 3:1, 4:2, 5:2] - let entries1 = vec![ - Entry { - index: 1, - term: 1, - ..Default::default() - }, - Entry { - index: 2, - term: 1, - ..Default::default() - }, - Entry { - index: 3, - term: 1, - ..Default::default() - }, - Entry { - index: 4, - term: 2, - ..Default::default() - }, - Entry { - index: 5, - term: 2, - ..Default::default() - }, - ]; - storage.wl_append_entries(&entries1).unwrap(); - - // Conflict at index 3: [3:3, 4:3, 5:3, 6:3] - let entries2 = vec![ - Entry { - index: 3, - term: 3, - ..Default::default() - }, - Entry { - index: 4, - term: 3, - ..Default::default() - }, - Entry { - index: 5, - term: 3, - ..Default::default() - }, - Entry { - index: 6, - term: 3, - ..Default::default() - }, - ]; - storage.wl_append_entries(&entries2).unwrap(); - - // Should have: [1:1, 2:1, 3:3, 4:3, 5:3, 6:3] - let stored = storage.entries.read().unwrap(); - assert_eq!(stored.len(), 6); - assert_eq!(stored[0].index, 1); - assert_eq!(stored[0].term, 1); - assert_eq!(stored[1].index, 2); - assert_eq!(stored[1].term, 1); - assert_eq!(stored[2].index, 3); - assert_eq!(stored[2].term, 3); - assert_eq!(stored[3].index, 4); - assert_eq!(stored[3].term, 3); - assert_eq!(stored[4].index, 5); - assert_eq!(stored[4].term, 3); - assert_eq!(stored[5].index, 6); - assert_eq!(stored[5].term, 3); - } -} diff --git a/crates/storage/Cargo.toml b/crates/storage/Cargo.toml index c6acbe7..9c91eb4 100644 --- a/crates/storage/Cargo.toml +++ b/crates/storage/Cargo.toml @@ -9,3 +9,10 @@ description.workspace = true 
keywords.workspace = true [dependencies] +# raft-rs 0.7 requires prost 0.11 for eraftpb types (Entry, HardState, etc.) +raft = { version = "0.7", default-features = false, features = ["prost-codec"] } + +# TODO: Remove after OpenRaft migration (see docs/specs/openraft/) +# This is temporary tech debt to support raft-rs 0.7 which requires prost 0.11 +# OpenRaft uses prost 0.14, eliminating this version conflict +prost-old = { package = "prost", version = "0.11" } diff --git a/crates/storage/src/lib.rs b/crates/storage/src/lib.rs index b93cf3f..94db3bb 100644 --- a/crates/storage/src/lib.rs +++ b/crates/storage/src/lib.rs @@ -1,14 +1,3600 @@ -pub fn add(left: u64, right: u64) -> u64 { - left + right +//! In-memory storage implementation for Raft consensus. +//! +//! This module provides `MemStorage`, an in-memory implementation suitable for +//! testing and development. For production use, a persistent storage backend +//! (e.g., RocksDB) should be used instead. +//! +//! # Protobuf Version Bridging +//! +//! This module uses `prost_old` (prost 0.11) to maintain compatibility with `raft-rs`, +//! which depends on prost 0.11. Our transport layer uses the latest prost 0.14 for +//! gRPC communication with tonic 0.14. The bridging happens in the transport layer +//! via binary serialization/deserialization. +//! +//! - `prost_old` (0.11): Used here for raft-rs `eraftpb` types (Entry, HardState, etc.) +//! - `prost` (0.14): Used in transport layer for gRPC wire protocol +//! +//! # Thread Safety +//! +//! All fields are wrapped in `RwLock` to provide thread-safe concurrent access. +//! Multiple readers can access the data simultaneously, but writers have exclusive access. +//! +//! ## Lock Poisoning Philosophy +//! +//! This implementation uses `.expect()` instead of `.unwrap()` for lock acquisition +//! to provide clear error messages when lock poisoning occurs. Lock poisoning indicates +//! 
that a thread panicked while holding the lock, leaving the data in a potentially +//! inconsistent state. +//! +//! **For Phase 1 (MemStorage)**: Lock poisoning is considered a serious bug that should +//! cause the application to panic immediately with a descriptive message. This approach +//! is acceptable because: +//! 1. MemStorage is used for testing and single-node scenarios +//! 2. Lock poisoning indicates a critical bug in the concurrent access logic +//! 3. Continuing with poisoned state would lead to data corruption +//! +//! **For Future Production Storage (RocksDB)**: Lock poisoning should be handled gracefully +//! by returning a proper error through the Raft error system, allowing the node to +//! potentially recover or fail safely without cascading panics. +//! +//! The `.expect()` messages clearly identify which lock failed, making debugging easier +//! during development and testing. + +use prost_old::Message; +use raft::eraftpb::{ConfState, Entry, HardState, Snapshot}; +use raft::{RaftState, StorageError}; +use std::sync::RwLock; + +/// In-memory storage for Raft state. +/// +/// `MemStorage` stores all Raft consensus state in memory: +/// - `hard_state`: Persistent voting state (term, vote, commit) +/// - `conf_state`: Cluster membership configuration +/// - `entries`: Log entries for replication +/// - `snapshot`: Snapshot data for log compaction +/// +/// # Examples +/// +/// ``` +/// use seshat_storage::MemStorage; +/// +/// let storage = MemStorage::new(); +/// // Storage is ready to use with default values +/// ``` +#[derive(Debug)] +pub struct MemStorage { + /// Persistent state that must survive crashes. + /// + /// Contains the current term, the candidate that received the vote + /// in the current term, and the highest log entry known to be committed. + hard_state: RwLock, + + /// Current cluster membership configuration. 
+ /// + /// Tracks which nodes are voters, learners, and which nodes are + /// being added or removed from the cluster. + conf_state: RwLock, + + /// Log entries for state machine replication. + /// + /// Entries are indexed starting at 1. The vector may not start at index 1 + /// after log compaction (snapshot creation). + entries: RwLock>, + + /// Current snapshot for log compaction. + /// + /// Represents the state machine state at a particular point in time, + /// allowing truncation of old log entries. + snapshot: RwLock, +} + +impl MemStorage { + /// Creates a new `MemStorage` with default values. + /// + /// All fields are initialized to their default states: + /// - Empty hard state (term=0, vote=0, commit=0) + /// - Empty configuration state + /// - Empty log entries + /// - Empty snapshot + /// + /// # Examples + /// + /// ``` + /// use seshat_storage::MemStorage; + /// + /// let storage = MemStorage::new(); + /// // Storage is now ready to use + /// ``` + pub fn new() -> Self { + Self { + hard_state: RwLock::new(HardState::default()), + conf_state: RwLock::new(ConfState::default()), + entries: RwLock::new(Vec::new()), + snapshot: RwLock::new(Snapshot::default()), + } + } + + /// Returns the initial Raft state from storage. + /// + /// This method reads the current hard state and configuration state + /// from the storage and returns them as a `RaftState`. This is typically + /// called when initializing a Raft node to restore its persisted state. + /// + /// # Thread Safety + /// + /// This method acquires read locks on both `hard_state` and `conf_state`. + /// Multiple concurrent calls are safe and efficient. 
+ /// + /// # Examples + /// + /// ``` + /// use seshat_storage::MemStorage; + /// + /// let storage = MemStorage::new(); + /// let state = storage.initial_state().unwrap(); + /// assert_eq!(state.hard_state.term, 0); + /// assert_eq!(state.hard_state.vote, 0); + /// assert_eq!(state.hard_state.commit, 0); + /// ``` + /// + /// # Errors + /// + /// Returns an error if: + /// - Lock acquisition fails (lock poisoning) + pub fn initial_state(&self) -> raft::Result { + let hard_state = self + .hard_state + .read() + .expect("Hard state lock poisoned - indicates bug in concurrent access"); + let conf_state = self + .conf_state + .read() + .expect("Conf state lock poisoned - indicates bug in concurrent access"); + + Ok(RaftState { + hard_state: hard_state.clone(), + conf_state: conf_state.clone(), + }) + } + + /// Sets the hard state of the storage. + /// + /// This is primarily used for testing and during Raft ready processing + /// to persist the updated hard state. + /// + /// # Examples + /// + /// ``` + /// use seshat_storage::MemStorage; + /// use raft::eraftpb::HardState; + /// + /// let storage = MemStorage::new(); + /// let mut hs = HardState::default(); + /// hs.term = 5; + /// hs.vote = 1; + /// hs.commit = 10; + /// storage.set_hard_state(hs); + /// + /// let state = storage.initial_state().unwrap(); + /// assert_eq!(state.hard_state.term, 5); + /// assert_eq!(state.hard_state.vote, 1); + /// assert_eq!(state.hard_state.commit, 10); + /// ``` + pub fn set_hard_state(&self, hs: HardState) { + *self + .hard_state + .write() + .expect("Hard state lock poisoned - indicates bug in concurrent access") = hs; + } + + /// Sets the configuration state of the storage. + /// + /// This is primarily used for testing and during Raft ready processing + /// to persist the updated configuration state. 
+ /// + /// # Examples + /// + /// ``` + /// use seshat_storage::MemStorage; + /// use raft::eraftpb::ConfState; + /// + /// let storage = MemStorage::new(); + /// let mut cs = ConfState::default(); + /// cs.voters = vec![1, 2, 3]; + /// storage.set_conf_state(cs); + /// + /// let state = storage.initial_state().unwrap(); + /// assert_eq!(state.conf_state.voters, vec![1, 2, 3]); + /// ``` + pub fn set_conf_state(&self, cs: ConfState) { + *self + .conf_state + .write() + .expect("Conf state lock poisoned - indicates bug in concurrent access") = cs; + } + + /// Returns a range of log entries. + /// + /// Returns log entries in the range `[low, high)`, limiting the total size + /// to `max_size` bytes if specified. + /// + /// # Arguments + /// + /// * `low` - The inclusive lower bound of the range (first index to return) + /// * `high` - The exclusive upper bound of the range (one past the last index) + /// * `max_size` - Optional maximum total size in bytes of returned entries + /// + /// # Returns + /// + /// Returns a `Result` containing: + /// - `Ok(Vec)` - The requested entries (may be empty if low == high) + /// - `Err(StorageError::Compacted)` - If `low` is less than `first_index()` + /// - `Err(StorageError::Unavailable)` - If `high` is greater than `last_index() + 1` + /// + /// # Examples + /// + /// ``` + /// use seshat_storage::MemStorage; + /// use raft::eraftpb::Entry; + /// + /// let storage = MemStorage::new(); + /// // With empty storage, requesting any range returns empty or error + /// let result = storage.entries(1, 1, None); + /// assert!(result.is_ok()); + /// assert_eq!(result.unwrap().len(), 0); + /// ``` + pub fn entries(&self, low: u64, high: u64, max_size: Option) -> raft::Result> { + // Handle empty range first + if low >= high { + return Ok(Vec::new()); + } + + // Acquire all locks once for consistent state (fixes TOCTOU race) + let snapshot = self + .snapshot + .read() + .expect("Snapshot lock poisoned - indicates bug in concurrent 
access"); + let entries = self + .entries + .read() + .expect("Entries lock poisoned - indicates bug in concurrent access"); + + // Calculate first and last indices from locked state + let first = if snapshot.get_metadata().index > 0 { + snapshot.get_metadata().index + 1 + } else if !entries.is_empty() { + entries[0].index + } else { + 1 + }; + + let last = if let Some(last_entry) = entries.last() { + last_entry.index + } else { + snapshot.get_metadata().index + }; + + // Check if low is before first available entry (compacted) + if low < first { + return Err(raft::Error::Store(StorageError::Compacted)); + } + + // Check if high is beyond available entries + // Note: high can be last_index + 1 (to request all entries up to and including last_index) + if high > last + 1 { + return Err(raft::Error::Store(StorageError::Unavailable)); + } + + // Handle empty log + if entries.is_empty() { + return Ok(Vec::new()); + } + + // Calculate slice bounds + // entries vector may not start at index 1 after compaction + let offset = entries[0].index; + + // Convert logical indices to vector indices + let start_idx = (low.saturating_sub(offset)) as usize; + let end_idx = (high.saturating_sub(offset)) as usize; + + // Ensure we don't go out of bounds + let start_idx = start_idx.min(entries.len()); + let end_idx = end_idx.min(entries.len()); + + // If start >= end, return empty + if start_idx >= end_idx { + return Ok(Vec::new()); + } + + // Get the slice + let mut result = Vec::new(); + let mut total_size: u64 = 0; + + for entry in &entries[start_idx..end_idx] { + // Calculate entry size using prost's encoded_len + let entry_size = entry.encoded_len() as u64; + + // If we have a size limit and we've already added at least one entry + // and adding this entry would exceed the limit, stop + if let Some(max) = max_size { + if !result.is_empty() && total_size + entry_size > max { + break; + } + } + + result.push(entry.clone()); + total_size += entry_size; + } + + // Always return at 
least one entry if any are available + // (even if it exceeds max_size) + if result.is_empty() && start_idx < end_idx { + result.push(entries[start_idx].clone()); + } + + Ok(result) + } + + /// Returns the term of the entry at the given index. + /// + /// # Arguments + /// + /// * `index` - The log index to query + /// + /// # Returns + /// + /// Returns a `Result` containing: + /// - `Ok(term)` - The term of the entry at the given index + /// - `Err(StorageError::Compacted)` - If the index has been compacted + /// - `Err(StorageError::Unavailable)` - If the index is not yet available + /// + /// # Special Cases + /// + /// - `term(0)` always returns `0` (by Raft convention) + /// - If `index == snapshot.metadata.index`, returns `snapshot.metadata.term` + /// + /// # Examples + /// + /// ``` + /// use seshat_storage::MemStorage; + /// use raft::eraftpb::Entry; + /// + /// let storage = MemStorage::new(); + /// + /// // Index 0 always returns term 0 + /// assert_eq!(storage.term(0).unwrap(), 0); + /// + /// // Add entries and query their terms + /// let entries = vec![ + /// Entry { index: 1, term: 1, ..Default::default() }, + /// Entry { index: 2, term: 2, ..Default::default() }, + /// ]; + /// storage.append(&entries); + /// assert_eq!(storage.term(1).unwrap(), 1); + /// assert_eq!(storage.term(2).unwrap(), 2); + /// ``` + pub fn term(&self, index: u64) -> raft::Result<u64> { + // Special case: index 0 always has term 0 + if index == 0 { + return Ok(0); + } + + // Acquire locks once for consistent state + let snapshot = self + .snapshot + .read() + .expect("Snapshot lock poisoned - indicates bug in concurrent access"); + let entries = self + .entries + .read() + .expect("Entries lock poisoned - indicates bug in concurrent access"); + + // Calculate bounds from locked state + let first = if snapshot.get_metadata().index > 0 { + snapshot.get_metadata().index + 1 + } else if !entries.is_empty() { + entries[0].index + } else { + 1 + }; + + let last = if let 
Some(last_entry) = entries.last() { + last_entry.index + } else { + snapshot.get_metadata().index + }; + + // Check if this is exactly the snapshot index + if index == snapshot.get_metadata().index { + return Ok(snapshot.get_metadata().term); + } + + // Check if index is before first available entry (compacted) + if index < first { + return Err(raft::Error::Store(StorageError::Compacted)); + } + + // Check if index is beyond available entries + if index > last { + return Err(raft::Error::Store(StorageError::Unavailable)); + } + + // Handle empty log (shouldn't happen given bounds checks, but be safe) + if entries.is_empty() { + return Err(raft::Error::Store(StorageError::Unavailable)); + } + + // Calculate offset + let offset = entries[0].index; + let vec_index = (index - offset) as usize; + + // Bounds check + if vec_index >= entries.len() { + return Err(raft::Error::Store(StorageError::Unavailable)); + } + + Ok(entries[vec_index].term) + } + + /// Returns the first index in the log. + /// + /// This is the index of the first entry available in the log. After log compaction, + /// this may be greater than 1 (the first entry that was ever appended). + /// + /// # Returns + /// + /// - If there's a snapshot, returns `snapshot.metadata.index + 1` + /// - Otherwise, returns 1 (the default first index) + /// + /// # Examples + /// + /// ``` + /// use seshat_storage::MemStorage; + /// + /// let storage = MemStorage::new(); + /// assert_eq!(storage.first_index().unwrap(), 1); + /// ``` + pub fn first_index(&self) -> raft::Result<u64> { + let snapshot = self + .snapshot + .read() + .expect("Snapshot lock poisoned - indicates bug in concurrent access"); + let entries = self + .entries + .read() + .expect("Entries lock poisoned - indicates bug in concurrent access"); + + if snapshot.get_metadata().index > 0 { + Ok(snapshot.get_metadata().index + 1) + } else if !entries.is_empty() { + Ok(entries[0].index) + } else { + Ok(1) + } + } + + /// Returns the last index in the log. 
+ /// + /// This is the index of the last entry available in the log. + /// + /// # Returns + /// + /// - If there are entries, returns the index of the last entry + /// - If there's a snapshot but no entries, returns the snapshot index + /// - Otherwise, returns 0 (empty log) + /// + /// # Examples + /// + /// ``` + /// use seshat_storage::MemStorage; + /// + /// let storage = MemStorage::new(); + /// assert_eq!(storage.last_index().unwrap(), 0); + /// ``` + pub fn last_index(&self) -> raft::Result<u64> { + let entries = self + .entries + .read() + .expect("Entries lock poisoned - indicates bug in concurrent access"); + let snapshot = self + .snapshot + .read() + .expect("Snapshot lock poisoned - indicates bug in concurrent access"); + + if let Some(last) = entries.last() { + Ok(last.index) + } else { + Ok(snapshot.get_metadata().index) + } + } + + /// Returns the current snapshot. + /// + /// In Phase 1, this is simplified to always return the stored snapshot + /// regardless of the `request_index` parameter. In later phases, this + /// would check if the snapshot is ready for the given index. + /// + /// # Arguments + /// + /// * `request_index` - The index for which a snapshot is requested (unused in Phase 1) + /// + /// # Returns + /// + /// Returns a `Result` containing: + /// - `Ok(Snapshot)` - A clone of the current snapshot + /// + /// # Phase 1 Simplification + /// + /// This implementation ignores `request_index` and always returns the current + /// snapshot. Future phases may return `StorageError::SnapshotTemporarilyUnavailable` + /// if a snapshot is being created for a specific index. + /// + /// # Thread Safety + /// + /// This method acquires a read lock on the snapshot field. Multiple concurrent + /// calls are safe and efficient. 
+ /// + /// # Examples + /// + /// ``` + /// use seshat_storage::MemStorage; + /// use raft::eraftpb::Snapshot; + /// + /// let storage = MemStorage::new(); + /// + /// // Empty storage returns default snapshot + /// let snapshot = storage.snapshot(0).unwrap(); + /// assert_eq!(snapshot.get_metadata().index, 0); + /// assert_eq!(snapshot.get_metadata().term, 0); + /// assert!(snapshot.data.is_empty()); + /// ``` + pub fn snapshot(&self, _request_index: u64) -> raft::Result<Snapshot> { + // Phase 1: Simplified implementation + // Just return the current snapshot, ignoring request_index + let snapshot = self + .snapshot + .read() + .expect("Snapshot lock poisoned - indicates bug in concurrent access"); + Ok(snapshot.clone()) + } + + /// Appends entries to the log. + /// + /// This is a helper method for testing. In production use, entries are + /// typically appended through the Raft ready processing. + /// + /// # Arguments + /// + /// * `ents` - Slice of entries to append + /// + /// # Examples + /// + /// ``` + /// use seshat_storage::MemStorage; + /// use raft::eraftpb::Entry; + /// + /// let storage = MemStorage::new(); + /// let entries = vec![ + /// Entry { index: 1, term: 1, ..Default::default() }, + /// Entry { index: 2, term: 1, ..Default::default() }, + /// ]; + /// storage.append(&entries); + /// ``` + pub fn append(&self, ents: &[Entry]) { + let mut entries = self + .entries + .write() + .expect("Entries lock poisoned - indicates bug in concurrent access"); + entries.extend_from_slice(ents); + } + + /// Applies a snapshot to the storage. + /// + /// This method replaces the entire storage state with the given snapshot. + /// All log entries covered by the snapshot (entries with index <= snapshot.metadata.index) + /// are removed. The hard state and configuration state are updated from the snapshot metadata. 
+ /// + /// # Arguments + /// + /// * `snapshot` - The snapshot to apply + /// + /// # Thread Safety + /// + /// This method acquires write locks on all storage fields. It is safe to call + /// concurrently with other methods, but write operations are serialized. + /// + /// # Examples + /// + /// ``` + /// use seshat_storage::MemStorage; + /// use raft::eraftpb::{Snapshot, ConfState}; + /// + /// let storage = MemStorage::new(); + /// + /// // Create a snapshot + /// let mut snapshot = Snapshot::default(); + /// snapshot.mut_metadata().index = 10; + /// snapshot.mut_metadata().term = 3; + /// snapshot.mut_metadata().conf_state = Some(ConfState { + /// voters: vec![1, 2, 3], + /// ..Default::default() + /// }); + /// snapshot.data = vec![1, 2, 3, 4, 5]; + /// + /// // Apply snapshot + /// storage.apply_snapshot(snapshot.clone()).unwrap(); + /// + /// // Verify snapshot was applied + /// let retrieved = storage.snapshot(0).unwrap(); + /// assert_eq!(retrieved.get_metadata().index, 10); + /// assert_eq!(retrieved.get_metadata().term, 3); + /// ``` + /// + /// # Errors + /// + /// Returns an error if: + /// - Lock acquisition fails (lock poisoning) + pub fn apply_snapshot(&self, snapshot: Snapshot) -> raft::Result<()> { + // Get snapshot index and term for updating hard_state + let snap_index = snapshot.get_metadata().index; + let snap_term = snapshot.get_metadata().term; + + // Acquire write locks in consistent order to prevent deadlocks + // Lock ordering: snapshot → entries → hard_state → conf_state (documented to prevent deadlocks) + let mut storage_snapshot = self + .snapshot + .write() + .expect("Snapshot lock poisoned - indicates bug in concurrent access"); + let mut entries = self + .entries + .write() + .expect("Entries lock poisoned - indicates bug in concurrent access"); + let mut hard_state = self + .hard_state + .write() + .expect("Hard state lock poisoned - indicates bug in concurrent access"); + let mut conf_state = self + .conf_state + .write() + 
.expect("Conf state lock poisoned - indicates bug in concurrent access"); + + // Replace snapshot + *storage_snapshot = snapshot.clone(); + + // Remove entries covered by the snapshot + // Keep only entries with index > snapshot.metadata.index + entries.retain(|entry| entry.index > snap_index); + + // Update hard_state commit to at least snapshot index + if hard_state.commit < snap_index { + hard_state.commit = snap_index; + } + // Update term if snapshot term is higher + if hard_state.term < snap_term { + hard_state.term = snap_term; + } + + // Update conf_state from snapshot metadata + if let Some(ref cs) = snapshot.get_metadata().conf_state { + *conf_state = cs.clone(); + } + + Ok(()) + } + + /// Appends entries to the log with proper conflict resolution. + /// + /// This method implements the Raft log append logic with truncation of conflicting + /// entries. If an incoming entry has the same index as an existing entry but a + /// different term, all entries from that point onwards are removed before appending + /// the new entries. + /// + /// # Arguments + /// + /// * `entries` - Slice of entries to append + /// + /// # Thread Safety + /// + /// This method acquires a write lock on the entries field. Multiple concurrent + /// calls are serialized. 
+ /// + /// # Examples + /// + /// ``` + /// use seshat_storage::MemStorage; + /// use raft::eraftpb::Entry; + /// + /// let storage = MemStorage::new(); + /// + /// // Append initial entries + /// let entries1 = vec![ + /// Entry { index: 1, term: 1, ..Default::default() }, + /// Entry { index: 2, term: 1, ..Default::default() }, + /// Entry { index: 3, term: 1, ..Default::default() }, + /// ]; + /// storage.wl_append_entries(&entries1).unwrap(); + /// assert_eq!(storage.last_index().unwrap(), 3); + /// + /// // Append conflicting entries (will truncate from index 2) + /// let entries2 = vec![ + /// Entry { index: 2, term: 2, ..Default::default() }, + /// Entry { index: 3, term: 2, ..Default::default() }, + /// ]; + /// storage.wl_append_entries(&entries2).unwrap(); + /// assert_eq!(storage.last_index().unwrap(), 3); + /// assert_eq!(storage.term(2).unwrap(), 2); + /// assert_eq!(storage.term(3).unwrap(), 2); + /// ``` + /// + /// # Errors + /// + /// Returns an error if: + /// - Lock acquisition fails (lock poisoning) + pub fn wl_append_entries(&self, entries: &[Entry]) -> raft::Result<()> { + // Empty entries slice is valid - just return + if entries.is_empty() { + return Ok(()); + } + + // Acquire write lock on entries + let mut storage_entries = self + .entries + .write() + .expect("Entries lock poisoned - indicates bug in concurrent access"); + + // If storage is empty, just append all entries + if storage_entries.is_empty() { + storage_entries.extend_from_slice(entries); + return Ok(()); + } + + // Find the first conflicting entry + let first_new_index = entries[0].index; + let storage_offset = storage_entries[0].index; + + // If new entries start after our log, just append + // Note: storage_entries is guaranteed non-empty by check above + if first_new_index + > storage_entries + .last() + .expect("Storage entries non-empty - checked above") + .index + { + storage_entries.extend_from_slice(entries); + return Ok(()); + } + + // If new entries start before 
our log, we need to handle overlap + if first_new_index < storage_offset { + // New entries start before our log - this shouldn't happen normally + // but we'll handle it by clearing everything and appending + storage_entries.clear(); + storage_entries.extend_from_slice(entries); + return Ok(()); + } + + // Find conflict point + for (i, entry) in entries.iter().enumerate() { + let storage_idx = (entry.index - storage_offset) as usize; + + // If this entry is beyond our current log, append remaining entries + if storage_idx >= storage_entries.len() { + storage_entries.extend_from_slice(&entries[i..]); + return Ok(()); + } + + // Check for conflict + if storage_entries[storage_idx].term != entry.term { + // Found conflict - truncate from this point and append new entries + storage_entries.truncate(storage_idx); + storage_entries.extend_from_slice(&entries[i..]); + return Ok(()); + } + + // Terms match - this entry is already in the log, continue checking + } + + Ok(()) + } +} + +impl Default for MemStorage { + fn default() -> Self { + Self::new() + } +} + +impl raft::Storage for MemStorage { + fn initial_state(&self) -> raft::Result<RaftState> { + self.initial_state() + } + + fn entries( + &self, + low: u64, + high: u64, + max_size: impl Into<Option<u64>>, + _context: raft::GetEntriesContext, + ) -> raft::Result<Vec<Entry>> { + self.entries(low, high, max_size.into()) + } + + fn term(&self, idx: u64) -> raft::Result<u64> { + self.term(idx) + } + + fn first_index(&self) -> raft::Result<u64> { + self.first_index() + } + + fn last_index(&self) -> raft::Result<u64> { + self.last_index() + } + + fn snapshot(&self, request_index: u64, _to: u64) -> raft::Result<Snapshot> { + self.snapshot(request_index) + } } #[cfg(test)] mod tests { use super::*; + use std::sync::Arc; + use std::thread; + + #[test] + fn test_mem_storage_new_creates_successfully() { + let storage = MemStorage::new(); + + // Verify storage was created without panicking + // We can't directly access the fields since they're private, + // but we can verify the storage 
exists + let _debug_output = format!("{storage:?}"); + } + + #[test] + fn test_mem_storage_default_creates_successfully() { + let storage = MemStorage::default(); + + // Verify default() works the same as new() + let _debug_output = format!("{storage:?}"); + } + + #[test] + fn test_mem_storage_has_default_hard_state() { + let storage = MemStorage::new(); + + // Access hard_state to verify it's initialized + let hard_state = storage.hard_state.read().unwrap(); + assert_eq!(hard_state.term, 0, "Initial term should be 0"); + assert_eq!(hard_state.vote, 0, "Initial vote should be 0"); + assert_eq!(hard_state.commit, 0, "Initial commit should be 0"); + } + + #[test] + fn test_mem_storage_has_default_conf_state() { + let storage = MemStorage::new(); + + // Access conf_state to verify it's initialized + let conf_state = storage.conf_state.read().unwrap(); + assert!( + conf_state.voters.is_empty(), + "Initial voters should be empty" + ); + assert!( + conf_state.learners.is_empty(), + "Initial learners should be empty" + ); + } + + #[test] + fn test_mem_storage_has_empty_entries() { + let storage = MemStorage::new(); + + // Access entries to verify it's an empty vector + let entries = storage.entries.read().unwrap(); + assert!(entries.is_empty(), "Initial entries should be empty"); + assert_eq!(entries.len(), 0, "Initial entries length should be 0"); + } + + #[test] + fn test_mem_storage_has_default_snapshot() { + let storage = MemStorage::new(); + + // Access snapshot to verify it's initialized + let snapshot = storage.snapshot.read().unwrap(); + assert!( + snapshot.data.is_empty(), + "Initial snapshot data should be empty" + ); + } + + #[test] + fn test_mem_storage_fields_are_thread_safe() { + let storage = MemStorage::new(); + + // Verify we can get read locks on all fields + let _hard_state = storage.hard_state.read().unwrap(); + let _conf_state = storage.conf_state.read().unwrap(); + let _entries = storage.entries.read().unwrap(); + let _snapshot = 
storage.snapshot.read().unwrap(); + + // All locks should be released when the guards go out of scope + } + + #[test] + fn test_mem_storage_multiple_readers() { + let storage = MemStorage::new(); + + // Verify multiple readers can access simultaneously + let _lock1 = storage.hard_state.read().unwrap(); + let _lock2 = storage.hard_state.read().unwrap(); + let _lock3 = storage.hard_state.read().unwrap(); + + // All read locks should coexist + } + + #[test] + fn test_mem_storage_write_lock() { + let storage = MemStorage::new(); + + // Verify we can get write locks + { + let mut hard_state = storage.hard_state.write().unwrap(); + hard_state.term = 1; + } + + // Verify the write persisted + let hard_state = storage.hard_state.read().unwrap(); + assert_eq!(hard_state.term, 1); + } + + #[test] + fn test_mem_storage_is_send() { + fn assert_send<T: Send>() {} + assert_send::<MemStorage>(); + } + + #[test] + fn test_mem_storage_is_sync() { + fn assert_sync<T: Sync>() {} + assert_sync::<MemStorage>(); + } + + #[test] + fn test_mem_storage_can_be_used_across_threads() { + let storage = Arc::new(MemStorage::new()); + let storage_clone = Arc::clone(&storage); + + let handle = thread::spawn(move || { + let hard_state = storage_clone.hard_state.read().unwrap(); + assert_eq!(hard_state.term, 0); + }); + + handle.join().unwrap(); + } + + #[test] + fn test_mem_storage_independent_instances() { + let storage1 = MemStorage::new(); + let storage2 = MemStorage::new(); + + // Modify storage1 + { + let mut hard_state = storage1.hard_state.write().unwrap(); + hard_state.term = 5; + } + + // Verify storage2 is unaffected + let hard_state2 = storage2.hard_state.read().unwrap(); + assert_eq!(hard_state2.term, 0); + } + + // ============================================================================ + // Tests for initial_state() method + // ============================================================================ + + #[test] + fn test_initial_state_returns_defaults() { + let storage = MemStorage::new(); + + let state = storage + 
.initial_state() + .expect("initial_state should succeed"); + + // Verify default HardState + assert_eq!(state.hard_state.term, 0, "Default term should be 0"); + assert_eq!(state.hard_state.vote, 0, "Default vote should be 0"); + assert_eq!(state.hard_state.commit, 0, "Default commit should be 0"); + + // Verify default ConfState + assert!( + state.conf_state.voters.is_empty(), + "Default voters should be empty" + ); + assert!( + state.conf_state.learners.is_empty(), + "Default learners should be empty" + ); + } + + #[test] + fn test_initial_state_reflects_hard_state_changes() { + let storage = MemStorage::new(); + + // Modify hard_state + let new_hard_state = HardState { + term: 10, + vote: 3, + commit: 25, + }; + storage.set_hard_state(new_hard_state); + + // Verify initial_state reflects the change + let state = storage + .initial_state() + .expect("initial_state should succeed"); + assert_eq!(state.hard_state.term, 10, "Term should be updated to 10"); + assert_eq!(state.hard_state.vote, 3, "Vote should be updated to 3"); + assert_eq!( + state.hard_state.commit, 25, + "Commit should be updated to 25" + ); + } + + #[test] + fn test_initial_state_reflects_conf_state_changes() { + let storage = MemStorage::new(); + + // Modify conf_state + let new_conf_state = ConfState { + voters: vec![1, 2, 3], + learners: vec![4, 5], + ..Default::default() + }; + storage.set_conf_state(new_conf_state); + + // Verify initial_state reflects the change + let state = storage + .initial_state() + .expect("initial_state should succeed"); + assert_eq!( + state.conf_state.voters, + vec![1, 2, 3], + "Voters should be updated" + ); + assert_eq!( + state.conf_state.learners, + vec![4, 5], + "Learners should be updated" + ); + } + + #[test] + fn test_initial_state_is_thread_safe() { + let storage = Arc::new(MemStorage::new()); + + // Set initial values + let hs = HardState { + term: 5, + vote: 2, + commit: 10, + }; + storage.set_hard_state(hs); + + let cs = ConfState { + voters: vec![1, 2, 
3], + ..Default::default() + }; + storage.set_conf_state(cs); + + // Spawn multiple threads calling initial_state + let handles: Vec<_> = (0..10) + .map(|_| { + let storage_clone = Arc::clone(&storage); + thread::spawn(move || { + let state = storage_clone + .initial_state() + .expect("initial_state should succeed"); + assert_eq!(state.hard_state.term, 5); + assert_eq!(state.hard_state.vote, 2); + assert_eq!(state.hard_state.commit, 10); + assert_eq!(state.conf_state.voters, vec![1, 2, 3]); + }) + }) + .collect(); + + // Wait for all threads to complete + for handle in handles { + handle.join().expect("Thread should not panic"); + } + } + + #[test] + fn test_initial_state_returns_cloned_data() { + let storage = MemStorage::new(); + + // Get initial state + let state1 = storage + .initial_state() + .expect("initial_state should succeed"); + + // Modify storage + let new_hard_state = HardState { + term: 100, + ..Default::default() + }; + storage.set_hard_state(new_hard_state); + + // Get initial state again + let state2 = storage + .initial_state() + .expect("initial_state should succeed"); + + // Verify state1 is independent of the change + assert_eq!( + state1.hard_state.term, 0, + "First state should not be affected by later changes" + ); + assert_eq!( + state2.hard_state.term, 100, + "Second state should reflect the change" + ); + } + + #[test] + fn test_initial_state_multiple_calls_are_consistent() { + let storage = MemStorage::new(); + + // Set specific values + let hs = HardState { + term: 42, + vote: 7, + commit: 99, + }; + storage.set_hard_state(hs); + + // Call initial_state multiple times + for _ in 0..100 { + let state = storage + .initial_state() + .expect("initial_state should succeed"); + assert_eq!(state.hard_state.term, 42); + assert_eq!(state.hard_state.vote, 7); + assert_eq!(state.hard_state.commit, 99); + } + } + + #[test] + fn test_set_hard_state_updates_storage() { + let storage = MemStorage::new(); + + // Create and set a new hard state + let 
hs = HardState { + term: 15, + vote: 8, + commit: 50, + }; + storage.set_hard_state(hs); + + // Verify the update by reading directly + let stored_hs = storage.hard_state.read().unwrap(); + assert_eq!(stored_hs.term, 15); + assert_eq!(stored_hs.vote, 8); + assert_eq!(stored_hs.commit, 50); + } + + #[test] + fn test_set_conf_state_updates_storage() { + let storage = MemStorage::new(); + + // Create and set a new conf state + let cs = ConfState { + voters: vec![10, 20, 30], + learners: vec![40], + ..Default::default() + }; + storage.set_conf_state(cs); + + // Verify the update by reading directly + let stored_cs = storage.conf_state.read().unwrap(); + assert_eq!(stored_cs.voters, vec![10, 20, 30]); + assert_eq!(stored_cs.learners, vec![40]); + } #[test] - fn it_works() { - let result = add(2, 2); - assert_eq!(result, 4); + fn test_initial_state_with_empty_conf_state() { + let storage = MemStorage::new(); + + // Set only hard state, leave conf state empty + let hs = HardState { + term: 1, + ..Default::default() + }; + storage.set_hard_state(hs); + + let state = storage + .initial_state() + .expect("initial_state should succeed"); + assert_eq!(state.hard_state.term, 1); + assert!(state.conf_state.voters.is_empty()); + assert!(state.conf_state.learners.is_empty()); + } + + #[test] + fn test_initial_state_with_complex_conf_state() { + let storage = MemStorage::new(); + + // Create a complex configuration + let cs = ConfState { + voters: vec![1, 2, 3, 4, 5], + learners: vec![6, 7], + voters_outgoing: vec![1, 2, 3], // During configuration change + learners_next: vec![8], // Learners being added + auto_leave: true, + }; + storage.set_conf_state(cs.clone()); + + let state = storage + .initial_state() + .expect("initial_state should succeed"); + assert_eq!(state.conf_state.voters, cs.voters); + assert_eq!(state.conf_state.learners, cs.learners); + assert_eq!(state.conf_state.voters_outgoing, cs.voters_outgoing); + assert_eq!(state.conf_state.learners_next, cs.learners_next); 
+ assert_eq!(state.conf_state.auto_leave, cs.auto_leave); + } + + // ============================================================================ + // Tests for entries() method + // ============================================================================ + + #[test] + fn test_entries_empty_range_returns_empty_vec() { + let storage = MemStorage::new(); + + // Query with low == high should return empty vector + let result = storage.entries(1, 1, None); + assert!(result.is_ok(), "Empty range should succeed"); + assert_eq!( + result.unwrap().len(), + 0, + "Empty range should return no entries" + ); + } + + #[test] + fn test_entries_empty_range_on_populated_storage() { + let storage = MemStorage::new(); + + // Add some entries + let entries = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + Entry { + index: 3, + term: 1, + ..Default::default() + }, + ]; + storage.append(&entries); + + // Query with low == high should still return empty + let result = storage.entries(2, 2, None); + assert!(result.is_ok(), "Empty range should succeed"); + assert_eq!( + result.unwrap().len(), + 0, + "Empty range should return no entries" + ); + } + + #[test] + fn test_entries_normal_range_returns_correct_entries() { + let storage = MemStorage::new(); + + // Add entries with indices 1, 2, 3, 4, 5 + let entries = vec![ + Entry { + index: 1, + term: 1, + data: vec![1], + ..Default::default() + }, + Entry { + index: 2, + term: 1, + data: vec![2], + ..Default::default() + }, + Entry { + index: 3, + term: 2, + data: vec![3], + ..Default::default() + }, + Entry { + index: 4, + term: 2, + data: vec![4], + ..Default::default() + }, + Entry { + index: 5, + term: 3, + data: vec![5], + ..Default::default() + }, + ]; + storage.append(&entries); + + // Query range [2, 5) should return entries 2, 3, 4 + let result = storage.entries(2, 5, None); + assert!(result.is_ok(), "Valid range should succeed"); + + let returned = 
result.unwrap();
        assert_eq!(returned.len(), 3, "Should return 3 entries");
        assert_eq!(returned[0].index, 2, "First entry should have index 2");
        assert_eq!(returned[1].index, 3, "Second entry should have index 3");
        assert_eq!(returned[2].index, 4, "Third entry should have index 4");
        assert_eq!(returned[0].data, vec![2], "First entry data should match");
        assert_eq!(returned[1].data, vec![3], "Second entry data should match");
        assert_eq!(returned[2].data, vec![4], "Third entry data should match");
    }

    /// Builds a payload-less log entry with the given index and term.
    fn ent(index: u64, term: u64) -> Entry {
        Entry {
            index,
            term,
            ..Default::default()
        }
    }

    /// Builds a log entry carrying an explicit payload.
    fn ent_with_data(index: u64, term: u64, data: Vec<u8>) -> Entry {
        Entry {
            data,
            ..ent(index, term)
        }
    }

    #[test]
    fn test_entries_single_entry_range() {
        let storage = MemStorage::new();
        storage.append(&[
            ent_with_data(1, 1, vec![1]),
            ent_with_data(2, 1, vec![2]),
            ent_with_data(3, 2, vec![3]),
        ]);

        // Half-open range [2, 3) selects exactly the entry at index 2.
        let result = storage.entries(2, 3, None);
        assert!(result.is_ok(), "Single entry range should succeed");

        let got = result.unwrap();
        assert_eq!(got.len(), 1, "Should return 1 entry");
        assert_eq!(got[0].index, 2, "Entry should have index 2");
        assert_eq!(got[0].data, vec![2], "Entry data should match");
    }

    #[test]
    fn test_entries_full_range() {
        let storage = MemStorage::new();
        storage.append(&[ent(1, 1), ent(2, 1), ent(3, 2)]);

        // [1, 4) covers the whole log.
        let result = storage.entries(1, 4, None);
        assert!(result.is_ok(), "Full range should succeed");

        let got = result.unwrap();
        assert_eq!(got.len(), 3, "Should return all 3 entries");
        assert_eq!(got[0].index, 1);
        assert_eq!(got[1].index, 2);
        assert_eq!(got[2].index, 3);
    }

    #[test]
    fn test_entries_with_max_size_returns_partial_results() {
        let storage = MemStorage::new();

        // Four entries of ~100 bytes each; payload dominates the entry size.
        storage.append(&[
            ent_with_data(1, 1, vec![0; 100]),
            ent_with_data(2, 1, vec![0; 100]),
            ent_with_data(3, 2, vec![0; 100]),
            ent_with_data(4, 2, vec![0; 100]),
        ]);

        // A 250-byte budget cannot fit all four ~100-byte entries, so the
        // result for [1, 5) must be truncated.
        let result = storage.entries(1, 5, Some(250));
        assert!(result.is_ok(), "Size-limited query should succeed");

        let got = result.unwrap();
        assert!(
            !got.is_empty() && got.len() < 4,
            "Should return partial results (got {} entries)",
            got.len()
        );
        assert_eq!(got[0].index, 1, "First entry should have index 1");
    }

    #[test]
    fn test_entries_with_max_size_returns_at_least_one_entry() {
        let storage = MemStorage::new();

        // Both entries are individually larger than the size budget below.
        storage.append(&[
            ent_with_data(1, 1, vec![0; 1000]),
            ent_with_data(2, 1, vec![0; 1000]),
        ]);

        // Even a tiny budget must yield the first entry so Raft can progress.
        let result = storage.entries(1, 3, Some(10));
        assert!(result.is_ok(), "Should succeed even with small max_size");

        let got = result.unwrap();
        assert_eq!(got.len(), 1, "Should return at least one entry");
        assert_eq!(got[0].index, 1, "Should return first entry");
    }

    #[test]
    fn test_entries_error_when_low_less_than_first_index() {
        let storage = MemStorage::new();

        // Install a snapshot at index 5 so indices <= 5 are compacted away.
        let mut snapshot = Snapshot::default();
        snapshot.mut_metadata().index = 5;
        snapshot.mut_metadata().term = 2;
        *storage.snapshot.write().unwrap() = snapshot;

        storage.append(&[ent(6, 2), ent(7, 3)]);

        // first_index() is 6 (snapshot.index + 1); asking from 4 must fail.
        let result = storage.entries(4, 7, None);
        assert!(result.is_err(), "Should error when low < first_index");

        match result.unwrap_err() {
            raft::Error::Store(StorageError::Compacted) => {}
            other => panic!("Expected StorageError::Compacted, got {other:?}"),
        }
    }

    #[test]
    fn test_entries_error_when_high_greater_than_last_index_plus_one() {
        let storage = MemStorage::new();
        storage.append(&[ent(1, 1), ent(2, 1), ent(3, 2)]);

        // last_index() is 3, so the largest legal `high` is 4.
        let result = storage.entries(1, 5, None);
        assert!(result.is_err(), "Should error when high > last_index + 1");

        match result.unwrap_err() {
            raft::Error::Store(StorageError::Unavailable) => {}
            other => panic!("Expected StorageError::Unavailable, got {other:?}"),
        }
    }

    #[test]
    fn test_entries_boundary_at_last_index_plus_one() {
        let storage = MemStorage::new();
        storage.append(&[ent(1, 1), ent(2, 1), ent(3, 2)]);

        // high == last_index + 1 is the upper boundary of validity.
        let result = storage.entries(1, 4, None);
        assert!(result.is_ok(), "high = last_index + 1 should be valid");

        let got = result.unwrap();
        assert_eq!(got.len(), 3, "Should return all entries");
    }

    #[test]
    fn test_entries_on_empty_storage() {
        let storage = MemStorage::new();

        // Empty storage: first_index = 1, last_index = 0, so [1, 1) is the
        // only valid (empty) range.
        let result = storage.entries(1, 1, None);
        assert!(
            result.is_ok(),
            "Empty range on empty storage should succeed"
        );
        assert_eq!(result.unwrap().len(), 0);

        // Anything beyond the empty range is unavailable.
        let result = storage.entries(1, 2, None);
        assert!(
            result.is_err(),
            "Should error when requesting unavailable entries"
        );

        match result.unwrap_err() {
            raft::Error::Store(StorageError::Unavailable) => {}
            other => panic!("Expected StorageError::Unavailable, got {other:?}"),
        }
    }

    #[test]
    fn test_entries_thread_safe() {
        let storage = Arc::new(MemStorage::new());
        storage.append(&[ent(1, 1), ent(2, 1), ent(3, 2), ent(4, 2), ent(5, 3)]);

        // Ten readers query the same sub-range concurrently; RwLock-backed
        // storage must serve them all consistently.
        let mut handles = Vec::new();
        for _ in 0..10 {
            let reader = Arc::clone(&storage);
            handles.push(thread::spawn(move || {
                let result = reader.entries(2, 4, None);
                assert!(result.is_ok());
                let got = result.unwrap();
                assert_eq!(got.len(), 2);
                assert_eq!(got[0].index, 2);
                assert_eq!(got[1].index, 3);
            }));
        }

        for handle in handles {
            handle.join().expect("Thread should not panic");
        }
    }

    // ========================================================================
    // Tests for term() method
    // ========================================================================
============================================================================ + + #[test] + fn test_term_index_zero_returns_zero() { + let storage = MemStorage::new(); + + // Index 0 should always return term 0 + let result = storage.term(0); + assert!(result.is_ok(), "term(0) should succeed"); + assert_eq!(result.unwrap(), 0, "term(0) should return 0"); + } + + #[test] + fn test_term_for_valid_indices_in_log() { + let storage = MemStorage::new(); + + // Add entries with different terms + let entries = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + Entry { + index: 3, + term: 2, + ..Default::default() + }, + Entry { + index: 4, + term: 3, + ..Default::default() + }, + Entry { + index: 5, + term: 3, + ..Default::default() + }, + ]; + storage.append(&entries); + + // Test term for each entry + assert_eq!(storage.term(1).unwrap(), 1, "Entry 1 should have term 1"); + assert_eq!(storage.term(2).unwrap(), 1, "Entry 2 should have term 1"); + assert_eq!(storage.term(3).unwrap(), 2, "Entry 3 should have term 2"); + assert_eq!(storage.term(4).unwrap(), 3, "Entry 4 should have term 3"); + assert_eq!(storage.term(5).unwrap(), 3, "Entry 5 should have term 3"); + } + + #[test] + fn test_term_for_snapshot_index() { + let storage = MemStorage::new(); + + // Create a snapshot at index 5 with term 2 + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().index = 5; + snapshot.mut_metadata().term = 2; + *storage.snapshot.write().unwrap() = snapshot; + + // Add entries starting from index 6 + let entries = vec![ + Entry { + index: 6, + term: 2, + ..Default::default() + }, + Entry { + index: 7, + term: 3, + ..Default::default() + }, + ]; + storage.append(&entries); + + // Query term for snapshot index should return snapshot term + let result = storage.term(5); + assert!(result.is_ok(), "term(snapshot_index) should succeed"); + assert_eq!(result.unwrap(), 2, "Should return snapshot term"); + } 
+ + #[test] + fn test_term_error_for_compacted_index() { + let storage = MemStorage::new(); + + // Create a snapshot at index 5 + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().index = 5; + snapshot.mut_metadata().term = 2; + *storage.snapshot.write().unwrap() = snapshot; + + // Add entries starting from index 6 + let entries = vec![ + Entry { + index: 6, + term: 2, + ..Default::default() + }, + Entry { + index: 7, + term: 3, + ..Default::default() + }, + ]; + storage.append(&entries); + + // first_index() should be 6 (snapshot.index + 1) + // Requesting term for index before that should fail + let result = storage.term(4); + assert!(result.is_err(), "Should error for compacted index"); + + match result.unwrap_err() { + raft::Error::Store(StorageError::Compacted) => { + // Expected error + } + other => panic!("Expected StorageError::Compacted, got {other:?}"), + } + } + + #[test] + fn test_term_error_for_unavailable_index() { + let storage = MemStorage::new(); + + let entries = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + Entry { + index: 3, + term: 2, + ..Default::default() + }, + ]; + storage.append(&entries); + + // last_index() is 3 + // Requesting term for index > 3 should fail + let result = storage.term(4); + assert!(result.is_err(), "Should error for unavailable index"); + + match result.unwrap_err() { + raft::Error::Store(StorageError::Unavailable) => { + // Expected error + } + other => panic!("Expected StorageError::Unavailable, got {other:?}"), + } + } + + #[test] + fn test_term_on_empty_storage() { + let storage = MemStorage::new(); + + // Index 0 should work + assert_eq!(storage.term(0).unwrap(), 0, "term(0) should return 0"); + + // Any positive index should fail with Unavailable + let result = storage.term(1); + assert!(result.is_err(), "Should error for index beyond empty log"); + + match result.unwrap_err() { + 
raft::Error::Store(StorageError::Unavailable) => { + // Expected + } + other => panic!("Expected StorageError::Unavailable, got {other:?}"), + } + } + + #[test] + fn test_term_thread_safety() { + let storage = Arc::new(MemStorage::new()); + + // Populate storage + let entries = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 2, + ..Default::default() + }, + Entry { + index: 3, + term: 2, + ..Default::default() + }, + Entry { + index: 4, + term: 3, + ..Default::default() + }, + Entry { + index: 5, + term: 3, + ..Default::default() + }, + ]; + storage.append(&entries); + + // Spawn multiple threads reading terms concurrently + let handles: Vec<_> = (0..10) + .map(|_| { + let storage_clone = Arc::clone(&storage); + thread::spawn(move || { + assert_eq!(storage_clone.term(0).unwrap(), 0); + assert_eq!(storage_clone.term(1).unwrap(), 1); + assert_eq!(storage_clone.term(2).unwrap(), 2); + assert_eq!(storage_clone.term(3).unwrap(), 2); + assert_eq!(storage_clone.term(4).unwrap(), 3); + assert_eq!(storage_clone.term(5).unwrap(), 3); + }) + }) + .collect(); + + for handle in handles { + handle.join().expect("Thread should not panic"); + } + } + + #[test] + fn test_term_boundary_conditions() { + let storage = MemStorage::new(); + + // Add a single entry + let entries = vec![Entry { + index: 1, + term: 5, + ..Default::default() + }]; + storage.append(&entries); + + // Test boundaries + assert_eq!(storage.term(0).unwrap(), 0, "Index 0 returns 0"); + assert_eq!(storage.term(1).unwrap(), 5, "Index 1 returns correct term"); + + // Index 2 should be unavailable + let result = storage.term(2); + assert!(result.is_err(), "Index beyond last should error"); + match result.unwrap_err() { + raft::Error::Store(StorageError::Unavailable) => { + // Expected + } + other => panic!("Expected StorageError::Unavailable, got {other:?}"), + } + } + + #[test] + fn test_term_with_snapshot_but_no_entries() { + let storage = MemStorage::new(); + + // 
Create a snapshot at index 10 with term 5 + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().index = 10; + snapshot.mut_metadata().term = 5; + *storage.snapshot.write().unwrap() = snapshot; + + // No entries added, only snapshot exists + + // Index 0 should work + assert_eq!(storage.term(0).unwrap(), 0, "Index 0 returns 0"); + + // Snapshot index should return snapshot term + assert_eq!( + storage.term(10).unwrap(), + 5, + "Snapshot index returns snapshot term" + ); + + // Indices before snapshot should be compacted + let result = storage.term(9); + assert!(result.is_err(), "Index before snapshot should be compacted"); + match result.unwrap_err() { + raft::Error::Store(StorageError::Compacted) => { + // Expected + } + other => panic!("Expected StorageError::Compacted, got {other:?}"), + } + + // Indices after snapshot should be unavailable + let result = storage.term(11); + assert!( + result.is_err(), + "Index after snapshot should be unavailable" + ); + match result.unwrap_err() { + raft::Error::Store(StorageError::Unavailable) => { + // Expected + } + other => panic!("Expected StorageError::Unavailable, got {other:?}"), + } + } + + // ============================================================================ + // Tests for first_index() method + // ============================================================================ + + #[test] + fn test_first_index_empty_log() { + let storage = MemStorage::new(); + + // Empty log should return 1 as the default first index + let result = storage.first_index(); + assert!(result.is_ok(), "first_index should succeed on empty log"); + assert_eq!(result.unwrap(), 1, "Empty log should have first_index = 1"); + } + + #[test] + fn test_first_index_after_append() { + let storage = MemStorage::new(); + + // Append entries starting at index 1 + let entries = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + Entry { + index: 3, + term: 
2, + ..Default::default() + }, + ]; + storage.append(&entries); + + let result = storage.first_index(); + assert!(result.is_ok(), "first_index should succeed"); + assert_eq!( + result.unwrap(), + 1, + "first_index should be 1 when entries start at 1" + ); + } + + #[test] + fn test_first_index_with_snapshot() { + let storage = MemStorage::new(); + + // Create a snapshot at index 10 + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().index = 10; + snapshot.mut_metadata().term = 3; + *storage.snapshot.write().unwrap() = snapshot; + + // No entries yet, first_index should be snapshot.index + 1 + let result = storage.first_index(); + assert!(result.is_ok(), "first_index should succeed with snapshot"); + assert_eq!( + result.unwrap(), + 11, + "first_index should be snapshot.index + 1" + ); + } + + #[test] + fn test_first_index_with_snapshot_and_entries() { + let storage = MemStorage::new(); + + // Create a snapshot at index 10 + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().index = 10; + snapshot.mut_metadata().term = 3; + *storage.snapshot.write().unwrap() = snapshot; + + // Add entries starting from index 11 + let entries = vec![ + Entry { + index: 11, + term: 3, + ..Default::default() + }, + Entry { + index: 12, + term: 3, + ..Default::default() + }, + Entry { + index: 13, + term: 4, + ..Default::default() + }, + ]; + storage.append(&entries); + + // first_index should still be snapshot.index + 1 + let result = storage.first_index(); + assert!(result.is_ok(), "first_index should succeed"); + assert_eq!( + result.unwrap(), + 11, + "first_index should be snapshot.index + 1 even with entries" + ); + } + + #[test] + fn test_first_index_after_compaction() { + let storage = MemStorage::new(); + + // Simulate log compaction by: + // 1. Creating a snapshot at index 50 + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().index = 50; + snapshot.mut_metadata().term = 10; + *storage.snapshot.write().unwrap() = snapshot; + + // 2. 
Adding new entries after the snapshot + let entries = vec![ + Entry { + index: 51, + term: 10, + ..Default::default() + }, + Entry { + index: 52, + term: 11, + ..Default::default() + }, + ]; + storage.append(&entries); + + let result = storage.first_index(); + assert!( + result.is_ok(), + "first_index should succeed after compaction" + ); + assert_eq!( + result.unwrap(), + 51, + "first_index should be 51 after compaction at index 50" + ); + } + + #[test] + fn test_first_index_with_entries_not_starting_at_one() { + let storage = MemStorage::new(); + + // Directly append entries that don't start at index 1 + // (simulating entries after compaction) + let entries = vec![ + Entry { + index: 20, + term: 5, + ..Default::default() + }, + Entry { + index: 21, + term: 5, + ..Default::default() + }, + ]; + storage.append(&entries); + + // Without a snapshot, first_index should return the first entry's index + let result = storage.first_index(); + assert!(result.is_ok(), "first_index should succeed"); + assert_eq!( + result.unwrap(), + 20, + "first_index should match first entry index" + ); + } + + // ============================================================================ + // Tests for last_index() method + // ============================================================================ + + #[test] + fn test_last_index_empty_log() { + let storage = MemStorage::new(); + + // Empty log should return 0 as the last index + let result = storage.last_index(); + assert!(result.is_ok(), "last_index should succeed on empty log"); + assert_eq!(result.unwrap(), 0, "Empty log should have last_index = 0"); + } + + #[test] + fn test_last_index_after_append() { + let storage = MemStorage::new(); + + // Append entries + let entries = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + Entry { + index: 3, + term: 2, + ..Default::default() + }, + ]; + storage.append(&entries); + + let result = 
storage.last_index(); + assert!(result.is_ok(), "last_index should succeed"); + assert_eq!( + result.unwrap(), + 3, + "last_index should be the index of the last entry" + ); + } + + #[test] + fn test_last_index_snapshot_only() { + let storage = MemStorage::new(); + + // Create a snapshot at index 10, no entries + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().index = 10; + snapshot.mut_metadata().term = 3; + *storage.snapshot.write().unwrap() = snapshot; + + // With no entries, last_index should return snapshot.index + let result = storage.last_index(); + assert!( + result.is_ok(), + "last_index should succeed with snapshot only" + ); + assert_eq!( + result.unwrap(), + 10, + "last_index should be snapshot.index when no entries exist" + ); + } + + #[test] + fn test_last_index_with_snapshot_and_entries() { + let storage = MemStorage::new(); + + // Create a snapshot at index 10 + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().index = 10; + snapshot.mut_metadata().term = 3; + *storage.snapshot.write().unwrap() = snapshot; + + // Add entries after the snapshot + let entries = vec![ + Entry { + index: 11, + term: 3, + ..Default::default() + }, + Entry { + index: 12, + term: 3, + ..Default::default() + }, + Entry { + index: 13, + term: 4, + ..Default::default() + }, + ]; + storage.append(&entries); + + // last_index should return the last entry's index, not the snapshot + let result = storage.last_index(); + assert!(result.is_ok(), "last_index should succeed"); + assert_eq!( + result.unwrap(), + 13, + "last_index should be the last entry index, not snapshot index" + ); + } + + #[test] + fn test_last_index_after_multiple_appends() { + let storage = MemStorage::new(); + + // First append + let entries1 = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + ]; + storage.append(&entries1); + + assert_eq!( + storage.last_index().unwrap(), + 2, + "After first 
append, last_index should be 2" + ); + + // Second append + let entries2 = vec![ + Entry { + index: 3, + term: 2, + ..Default::default() + }, + Entry { + index: 4, + term: 2, + ..Default::default() + }, + Entry { + index: 5, + term: 3, + ..Default::default() + }, + ]; + storage.append(&entries2); + + assert_eq!( + storage.last_index().unwrap(), + 5, + "After second append, last_index should be 5" + ); + } + + #[test] + fn test_last_index_single_entry() { + let storage = MemStorage::new(); + + // Append a single entry + let entries = vec![Entry { + index: 1, + term: 1, + ..Default::default() + }]; + storage.append(&entries); + + let result = storage.last_index(); + assert!( + result.is_ok(), + "last_index should succeed with single entry" + ); + assert_eq!( + result.unwrap(), + 1, + "last_index should be 1 for single entry" + ); + } + + // ============================================================================ + // Tests for first_index() and last_index() invariants + // ============================================================================ + + #[test] + fn test_first_last_index_invariant() { + // Test the invariant: first_index <= last_index + 1 + // This should hold in all valid states + + let storage = MemStorage::new(); + + // Case 1: Empty log + let first = storage.first_index().unwrap(); + let last = storage.last_index().unwrap(); + assert!( + first <= last + 1, + "Empty log: first_index ({first}) <= last_index ({last}) + 1" + ); + + // Case 2: After appending entries + let entries = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + Entry { + index: 3, + term: 2, + ..Default::default() + }, + ]; + storage.append(&entries); + + let first = storage.first_index().unwrap(); + let last = storage.last_index().unwrap(); + assert!( + first <= last + 1, + "With entries: first_index ({first}) <= last_index ({last}) + 1" + ); + + // Case 3: With snapshot (need to clear old entries 
to simulate proper compaction) + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().index = 10; + snapshot.mut_metadata().term = 3; + *storage.snapshot.write().unwrap() = snapshot; + // Clear old entries that are covered by the snapshot + storage.entries.write().unwrap().clear(); + + let first = storage.first_index().unwrap(); + let last = storage.last_index().unwrap(); + assert!( + first <= last + 1, + "With snapshot: first_index ({first}) <= last_index ({last}) + 1" + ); + + // Case 4: With snapshot and new entries + let entries = vec![ + Entry { + index: 11, + term: 3, + ..Default::default() + }, + Entry { + index: 12, + term: 4, + ..Default::default() + }, + ]; + storage.append(&entries); + + let first = storage.first_index().unwrap(); + let last = storage.last_index().unwrap(); + assert!( + first <= last + 1, + "With snapshot and entries: first_index ({first}) <= last_index ({last}) + 1" + ); + } + + #[test] + fn test_first_last_index_boundaries() { + let storage = MemStorage::new(); + + // Empty log special case + assert_eq!(storage.first_index().unwrap(), 1); + assert_eq!(storage.last_index().unwrap(), 0); + // This is the one case where first > last, but first <= last + 1 still holds + + // Single entry + storage.append(&[Entry { + index: 1, + term: 1, + ..Default::default() + }]); + assert_eq!(storage.first_index().unwrap(), 1); + assert_eq!(storage.last_index().unwrap(), 1); + + // Multiple entries + storage.append(&[ + Entry { + index: 2, + term: 1, + ..Default::default() + }, + Entry { + index: 3, + term: 1, + ..Default::default() + }, + ]); + assert_eq!(storage.first_index().unwrap(), 1); + assert_eq!(storage.last_index().unwrap(), 3); + } + + #[test] + fn test_first_last_index_thread_safety() { + let storage = Arc::new(MemStorage::new()); + + // Populate storage + let entries = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + Entry { + index: 3, + term: 2, 
+ ..Default::default() + }, + ]; + storage.append(&entries); + + // Spawn multiple threads reading first_index and last_index concurrently + let handles: Vec<_> = (0..10) + .map(|_| { + let storage_clone = Arc::clone(&storage); + thread::spawn(move || { + let first = storage_clone.first_index().unwrap(); + let last = storage_clone.last_index().unwrap(); + assert_eq!(first, 1, "first_index should be 1"); + assert_eq!(last, 3, "last_index should be 3"); + assert!( + first <= last + 1, + "Invariant should hold: first_index <= last_index + 1" + ); + }) + }) + .collect(); + + for handle in handles { + handle.join().expect("Thread should not panic"); + } + } + + #[test] + fn test_first_last_index_consistency() { + let storage = MemStorage::new(); + + // Test that multiple consecutive calls return the same values + for _ in 0..100 { + let first1 = storage.first_index().unwrap(); + let last1 = storage.last_index().unwrap(); + let first2 = storage.first_index().unwrap(); + let last2 = storage.last_index().unwrap(); + + assert_eq!(first1, first2, "Consecutive first_index calls should match"); + assert_eq!(last1, last2, "Consecutive last_index calls should match"); + } + + // Add entries and test again + let entries = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + ]; + storage.append(&entries); + + for _ in 0..100 { + let first1 = storage.first_index().unwrap(); + let last1 = storage.last_index().unwrap(); + let first2 = storage.first_index().unwrap(); + let last2 = storage.last_index().unwrap(); + + assert_eq!(first1, first2, "Consecutive first_index calls should match"); + assert_eq!(last1, last2, "Consecutive last_index calls should match"); + } + } + + #[test] + fn test_first_last_index_with_large_snapshot() { + let storage = MemStorage::new(); + + // Create a snapshot at a large index + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().index = 1_000_000; + 
snapshot.mut_metadata().term = 100; + *storage.snapshot.write().unwrap() = snapshot; + + let first = storage.first_index().unwrap(); + let last = storage.last_index().unwrap(); + + assert_eq!(first, 1_000_001, "first_index should be snapshot.index + 1"); + assert_eq!(last, 1_000_000, "last_index should be snapshot.index"); + assert!( + first <= last + 1, + "Invariant should hold even with large indices" + ); + } + + #[test] + fn test_first_last_index_multiple_scenarios() { + let storage = MemStorage::new(); + + // Scenario 1: Empty + assert_eq!(storage.first_index().unwrap(), 1); + assert_eq!(storage.last_index().unwrap(), 0); + + // Scenario 2: Add entries + storage.append(&[ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + ]); + assert_eq!(storage.first_index().unwrap(), 1); + assert_eq!(storage.last_index().unwrap(), 2); + + // Scenario 3: Add more entries + storage.append(&[ + Entry { + index: 3, + term: 2, + ..Default::default() + }, + Entry { + index: 4, + term: 2, + ..Default::default() + }, + Entry { + index: 5, + term: 3, + ..Default::default() + }, + ]); + assert_eq!(storage.first_index().unwrap(), 1); + assert_eq!(storage.last_index().unwrap(), 5); + + // Scenario 4: Add snapshot (simulate compaction) + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().index = 3; + snapshot.mut_metadata().term = 2; + *storage.snapshot.write().unwrap() = snapshot; + assert_eq!(storage.first_index().unwrap(), 4); + assert_eq!(storage.last_index().unwrap(), 5); + + // Scenario 5: Add more entries after snapshot + storage.append(&[ + Entry { + index: 6, + term: 3, + ..Default::default() + }, + Entry { + index: 7, + term: 4, + ..Default::default() + }, + ]); + assert_eq!(storage.first_index().unwrap(), 4); + assert_eq!(storage.last_index().unwrap(), 7); + } + + // ============================================================================ + // Tests for snapshot() method + // 
============================================================================ + + #[test] + fn test_snapshot_returns_default_on_new_storage() { + let storage = MemStorage::new(); + + // Empty storage should return default snapshot + let result = storage.snapshot(0); + assert!(result.is_ok(), "snapshot() should succeed on new storage"); + + let snapshot = result.unwrap(); + assert_eq!( + snapshot.get_metadata().index, + 0, + "Default snapshot should have index 0" + ); + assert_eq!( + snapshot.get_metadata().term, + 0, + "Default snapshot should have term 0" + ); + assert!( + snapshot.data.is_empty(), + "Default snapshot should have empty data" + ); + } + + #[test] + fn test_snapshot_returns_stored_snapshot() { + let storage = MemStorage::new(); + + // Create and store a snapshot + let mut snap = Snapshot::default(); + snap.mut_metadata().index = 10; + snap.mut_metadata().term = 3; + snap.data = vec![1, 2, 3, 4, 5]; + *storage.snapshot.write().unwrap() = snap; + + // Retrieve snapshot + let result = storage.snapshot(0); + assert!(result.is_ok(), "snapshot() should succeed"); + + let retrieved = result.unwrap(); + assert_eq!( + retrieved.get_metadata().index, + 10, + "Should return stored snapshot index" + ); + assert_eq!( + retrieved.get_metadata().term, + 3, + "Should return stored snapshot term" + ); + assert_eq!( + retrieved.data, + vec![1, 2, 3, 4, 5], + "Should return stored snapshot data" + ); + } + + #[test] + fn test_snapshot_ignores_request_index_in_phase_1() { + let storage = MemStorage::new(); + + // Store a snapshot at index 10 + let mut snap = Snapshot::default(); + snap.mut_metadata().index = 10; + snap.mut_metadata().term = 3; + *storage.snapshot.write().unwrap() = snap; + + // Request snapshot with different request_index values + // In Phase 1, all should return the same snapshot + let snap0 = storage.snapshot(0).unwrap(); + let snap5 = storage.snapshot(5).unwrap(); + let snap10 = storage.snapshot(10).unwrap(); + let snap100 = 
storage.snapshot(100).unwrap(); + + // All should be identical + assert_eq!(snap0.get_metadata().index, 10); + assert_eq!(snap5.get_metadata().index, 10); + assert_eq!(snap10.get_metadata().index, 10); + assert_eq!(snap100.get_metadata().index, 10); + } + + #[test] + fn test_snapshot_with_metadata() { + let storage = MemStorage::new(); + + // Create snapshot with complex metadata + let mut snap = Snapshot::default(); + snap.mut_metadata().index = 42; + snap.mut_metadata().term = 7; + + // Set configuration in metadata + snap.mut_metadata().conf_state = Some(ConfState { + voters: vec![1, 2, 3], + learners: vec![4, 5], + ..Default::default() + }); + + *storage.snapshot.write().unwrap() = snap; + + // Retrieve and verify + let retrieved = storage.snapshot(0).unwrap(); + assert_eq!(retrieved.get_metadata().index, 42); + assert_eq!(retrieved.get_metadata().term, 7); + assert_eq!( + retrieved.get_metadata().conf_state.as_ref().unwrap().voters, + vec![1, 2, 3] + ); + assert_eq!( + retrieved + .get_metadata() + .conf_state + .as_ref() + .unwrap() + .learners, + vec![4, 5] + ); + } + + #[test] + fn test_snapshot_with_data() { + let storage = MemStorage::new(); + + // Create snapshot with substantial data + let mut snap = Snapshot::default(); + snap.mut_metadata().index = 100; + snap.mut_metadata().term = 10; + snap.data = vec![0; 10_000]; // 10KB of data + *storage.snapshot.write().unwrap() = snap; + + // Retrieve and verify + let retrieved = storage.snapshot(0).unwrap(); + assert_eq!(retrieved.get_metadata().index, 100); + assert_eq!(retrieved.get_metadata().term, 10); + assert_eq!(retrieved.data.len(), 10_000); + assert!(retrieved.data.iter().all(|&b| b == 0)); + } + + #[test] + fn test_snapshot_returns_cloned_data() { + let storage = MemStorage::new(); + + // Store initial snapshot + let mut snap = Snapshot::default(); + snap.mut_metadata().index = 5; + snap.mut_metadata().term = 2; + snap.data = vec![1, 2, 3]; + *storage.snapshot.write().unwrap() = snap; + + // Get 
first snapshot + let snap1 = storage.snapshot(0).unwrap(); + + // Modify storage snapshot + let mut new_snap = Snapshot::default(); + new_snap.mut_metadata().index = 10; + new_snap.mut_metadata().term = 5; + new_snap.data = vec![4, 5, 6]; + *storage.snapshot.write().unwrap() = new_snap; + + // Get second snapshot + let snap2 = storage.snapshot(0).unwrap(); + + // Verify snap1 is unaffected by later changes + assert_eq!( + snap1.get_metadata().index, + 5, + "First snapshot should be unaffected" + ); + assert_eq!( + snap1.get_metadata().term, + 2, + "First snapshot term should be unaffected" + ); + assert_eq!( + snap1.data, + vec![1, 2, 3], + "First snapshot data should be unaffected" + ); + + // Verify snap2 has new values + assert_eq!( + snap2.get_metadata().index, + 10, + "Second snapshot should have new values" + ); + assert_eq!( + snap2.get_metadata().term, + 5, + "Second snapshot should have new term" + ); + assert_eq!( + snap2.data, + vec![4, 5, 6], + "Second snapshot should have new data" + ); + } + + #[test] + fn test_snapshot_is_thread_safe() { + let storage = Arc::new(MemStorage::new()); + + // Store a snapshot + let mut snap = Snapshot::default(); + snap.mut_metadata().index = 20; + snap.mut_metadata().term = 4; + snap.data = vec![10, 20, 30, 40, 50]; + *storage.snapshot.write().unwrap() = snap; + + // Spawn multiple threads reading snapshot concurrently + let handles: Vec<_> = (0..10) + .map(|_| { + let storage_clone = Arc::clone(&storage); + thread::spawn(move || { + // Each thread reads the snapshot 100 times + for request_idx in 0..100 { + let result = storage_clone.snapshot(request_idx); + assert!(result.is_ok(), "snapshot() should succeed"); + + let snapshot = result.unwrap(); + assert_eq!( + snapshot.get_metadata().index, + 20, + "Snapshot index should be consistent" + ); + assert_eq!( + snapshot.get_metadata().term, + 4, + "Snapshot term should be consistent" + ); + assert_eq!( + snapshot.data, + vec![10, 20, 30, 40, 50], + "Snapshot data should 
be consistent" + ); + } + }) + }) + .collect(); + + // Wait for all threads to complete + for handle in handles { + handle.join().expect("Thread should not panic"); + } + } + + // ============================================================================ + // Tests for apply_snapshot() method + // ============================================================================ + + #[test] + fn test_apply_snapshot_replaces_all_state() { + let storage = MemStorage::new(); + + // Add some initial entries + let entries = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + Entry { + index: 3, + term: 2, + ..Default::default() + }, + ]; + storage.append(&entries); + + // Create a snapshot at index 5 + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().index = 5; + snapshot.mut_metadata().term = 3; + snapshot.mut_metadata().conf_state = Some(ConfState { + voters: vec![1, 2, 3], + ..Default::default() + }); + snapshot.data = vec![10, 20, 30]; + + // Apply snapshot + let result = storage.apply_snapshot(snapshot.clone()); + assert!(result.is_ok(), "apply_snapshot should succeed"); + + // Verify snapshot was stored + let stored_snap = storage.snapshot(0).unwrap(); + assert_eq!(stored_snap.get_metadata().index, 5); + assert_eq!(stored_snap.get_metadata().term, 3); + assert_eq!(stored_snap.data, vec![10, 20, 30]); + + // Verify entries covered by snapshot were removed + let remaining_entries = storage.entries.read().unwrap(); + assert!( + remaining_entries.is_empty(), + "All entries should be removed as they are covered by snapshot" + ); + } + + #[test] + fn test_apply_snapshot_clears_entries_covered_by_snapshot() { + let storage = MemStorage::new(); + + // Add entries 1-10 + let entries: Vec = (1..=10) + .map(|i| Entry { + index: i, + term: 1, + ..Default::default() + }) + .collect(); + storage.append(&entries); + + // Apply snapshot at index 5 + let mut snapshot = Snapshot::default(); + 
snapshot.mut_metadata().index = 5; + snapshot.mut_metadata().term = 2; + + storage.apply_snapshot(snapshot).unwrap(); + + // Only entries 6-10 should remain + let remaining = storage.entries.read().unwrap(); + assert_eq!( + remaining.len(), + 5, + "Only entries after snapshot should remain" + ); + assert_eq!( + remaining[0].index, 6, + "First remaining entry should be index 6" + ); + assert_eq!( + remaining[4].index, 10, + "Last remaining entry should be index 10" + ); + } + + #[test] + fn test_apply_snapshot_updates_hard_state() { + let storage = MemStorage::new(); + + // Set initial hard state + storage.set_hard_state(HardState { + term: 1, + vote: 1, + commit: 2, + }); + + // Apply snapshot with higher term and commit + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().index = 10; + snapshot.mut_metadata().term = 5; + + storage.apply_snapshot(snapshot).unwrap(); + + // Verify hard state was updated + let hard_state = storage.hard_state.read().unwrap(); + assert_eq!( + hard_state.term, 5, + "Term should be updated to snapshot term" + ); + assert_eq!( + hard_state.commit, 10, + "Commit should be updated to snapshot index" + ); + } + + #[test] + fn test_apply_snapshot_preserves_higher_hard_state_values() { + let storage = MemStorage::new(); + + // Set high commit + storage.set_hard_state(HardState { + term: 10, + vote: 1, + commit: 20, + }); + + // Apply snapshot with lower values + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().index = 5; + snapshot.mut_metadata().term = 3; + + storage.apply_snapshot(snapshot).unwrap(); + + // Verify higher values were preserved + let hard_state = storage.hard_state.read().unwrap(); + assert_eq!(hard_state.term, 10, "Higher term should be preserved"); + assert_eq!(hard_state.commit, 20, "Higher commit should be preserved"); + } + + #[test] + fn test_apply_snapshot_updates_conf_state() { + let storage = MemStorage::new(); + + // Set initial conf state + storage.set_conf_state(ConfState { + voters: 
vec![1, 2], + learners: vec![3], + ..Default::default() + }); + + // Apply snapshot with different conf state + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().index = 10; + snapshot.mut_metadata().term = 5; + snapshot.mut_metadata().conf_state = Some(ConfState { + voters: vec![4, 5, 6], + learners: vec![7, 8], + ..Default::default() + }); + + storage.apply_snapshot(snapshot).unwrap(); + + // Verify conf state was updated + let conf_state = storage.conf_state.read().unwrap(); + assert_eq!( + conf_state.voters, + vec![4, 5, 6], + "Voters should be updated from snapshot" + ); + assert_eq!( + conf_state.learners, + vec![7, 8], + "Learners should be updated from snapshot" + ); + } + + #[test] + fn test_apply_snapshot_with_no_conf_state_in_metadata() { + let storage = MemStorage::new(); + + // Set initial conf state + storage.set_conf_state(ConfState { + voters: vec![1, 2, 3], + ..Default::default() + }); + + // Apply snapshot without conf_state in metadata + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().index = 10; + snapshot.mut_metadata().term = 5; + // Don't set conf_state + + storage.apply_snapshot(snapshot).unwrap(); + + // Verify conf state was not changed + let conf_state = storage.conf_state.read().unwrap(); + assert_eq!( + conf_state.voters, + vec![1, 2, 3], + "Conf state should remain unchanged when snapshot has no conf_state" + ); + } + + #[test] + fn test_apply_snapshot_thread_safety() { + let storage = Arc::new(MemStorage::new()); + + // Add initial entries + let entries: Vec = (1..=20) + .map(|i| Entry { + index: i, + term: 1, + ..Default::default() + }) + .collect(); + storage.append(&entries); + + // Create multiple snapshots + let snapshots: Vec = (1..=5) + .map(|i| { + let mut snap = Snapshot::default(); + snap.mut_metadata().index = i * 5; + snap.mut_metadata().term = i; + snap.data = vec![i as u8; 100]; + snap + }) + .collect(); + + // Apply snapshots concurrently (should be serialized by write locks) + let 
handles: Vec<_> = snapshots + .into_iter() + .map(|snap| { + let storage_clone = Arc::clone(&storage); + thread::spawn(move || { + storage_clone.apply_snapshot(snap).unwrap(); + }) + }) + .collect(); + + // Wait for all threads + for handle in handles { + handle.join().expect("Thread should not panic"); + } + + // Verify final state is consistent (one of the snapshots was applied) + let final_snap = storage.snapshot(0).unwrap(); + assert!( + final_snap.get_metadata().index > 0, + "A snapshot should have been applied" + ); + + // Verify entries are consistent with snapshot + let entries = storage.entries.read().unwrap(); + if !entries.is_empty() { + assert!( + entries[0].index > final_snap.get_metadata().index, + "Remaining entries should be after snapshot index" + ); + } + } + + #[test] + fn test_apply_snapshot_empty_log() { + let storage = MemStorage::new(); + + // Apply snapshot on empty log + let mut snapshot = Snapshot::default(); + snapshot.mut_metadata().index = 5; + snapshot.mut_metadata().term = 2; + snapshot.data = vec![1, 2, 3]; + + let result = storage.apply_snapshot(snapshot.clone()); + assert!(result.is_ok(), "apply_snapshot should succeed on empty log"); + + // Verify snapshot was stored + let stored = storage.snapshot(0).unwrap(); + assert_eq!(stored.get_metadata().index, 5); + assert_eq!(stored.get_metadata().term, 2); + assert_eq!(stored.data, vec![1, 2, 3]); + } + + // ============================================================================ + // Tests for wl_append_entries() method + // ============================================================================ + + #[test] + fn test_wl_append_entries_to_empty_log() { + let storage = MemStorage::new(); + + // Append to empty log + let entries = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + Entry { + index: 3, + term: 2, + ..Default::default() + }, + ]; + + let result = storage.wl_append_entries(&entries); + 
assert!(result.is_ok(), "wl_append_entries should succeed"); + + // Verify entries were appended + assert_eq!(storage.last_index().unwrap(), 3); + let stored = storage.entries.read().unwrap(); + assert_eq!(stored.len(), 3); + assert_eq!(stored[0].index, 1); + assert_eq!(stored[2].index, 3); + } + + #[test] + fn test_wl_append_entries_after_existing_entries() { + let storage = MemStorage::new(); + + // Add initial entries + let entries1 = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + ]; + storage.wl_append_entries(&entries1).unwrap(); + + // Append more entries after existing ones + let entries2 = vec![ + Entry { + index: 3, + term: 2, + ..Default::default() + }, + Entry { + index: 4, + term: 2, + ..Default::default() + }, + ]; + storage.wl_append_entries(&entries2).unwrap(); + + // Verify all entries are present + assert_eq!(storage.last_index().unwrap(), 4); + let stored = storage.entries.read().unwrap(); + assert_eq!(stored.len(), 4); + assert_eq!(stored[0].index, 1); + assert_eq!(stored[3].index, 4); + } + + #[test] + fn test_wl_append_entries_truncates_conflicting_entries() { + let storage = MemStorage::new(); + + // Add initial entries in term 1 + let entries1 = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + Entry { + index: 3, + term: 1, + ..Default::default() + }, + Entry { + index: 4, + term: 1, + ..Default::default() + }, + ]; + storage.wl_append_entries(&entries1).unwrap(); + + // Append conflicting entries (term 2 starting at index 2) + let entries2 = vec![ + Entry { + index: 2, + term: 2, + ..Default::default() + }, + Entry { + index: 3, + term: 2, + ..Default::default() + }, + ]; + storage.wl_append_entries(&entries2).unwrap(); + + // Verify old entries were truncated and new ones appended + assert_eq!(storage.last_index().unwrap(), 3); + let stored = storage.entries.read().unwrap(); + 
assert_eq!(stored.len(), 3); + assert_eq!(stored[0].index, 1); + assert_eq!(stored[0].term, 1); // First entry unchanged + assert_eq!(stored[1].index, 2); + assert_eq!(stored[1].term, 2); // Replaced with term 2 + assert_eq!(stored[2].index, 3); + assert_eq!(stored[2].term, 2); // Replaced with term 2 + } + + #[test] + fn test_wl_append_entries_no_conflict_when_terms_match() { + let storage = MemStorage::new(); + + // Add initial entries + let entries1 = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + Entry { + index: 3, + term: 2, + ..Default::default() + }, + ]; + storage.wl_append_entries(&entries1).unwrap(); + + // Append entries with matching terms (should not truncate) + let entries2 = vec![ + Entry { + index: 2, + term: 1, + ..Default::default() + }, + Entry { + index: 3, + term: 2, + ..Default::default() + }, + Entry { + index: 4, + term: 2, + ..Default::default() + }, + ]; + storage.wl_append_entries(&entries2).unwrap(); + + // Verify no truncation occurred, new entry was appended + assert_eq!(storage.last_index().unwrap(), 4); + let stored = storage.entries.read().unwrap(); + assert_eq!(stored.len(), 4); + assert_eq!(stored[0].term, 1); + assert_eq!(stored[1].term, 1); + assert_eq!(stored[2].term, 2); + assert_eq!(stored[3].term, 2); + } + + #[test] + fn test_wl_append_entries_empty_slice() { + let storage = MemStorage::new(); + + // Add initial entries + let entries1 = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + ]; + storage.wl_append_entries(&entries1).unwrap(); + + // Append empty slice (should be no-op) + let empty: Vec = vec![]; + let result = storage.wl_append_entries(&empty); + assert!(result.is_ok(), "Empty append should succeed"); + + // Verify nothing changed + assert_eq!(storage.last_index().unwrap(), 2); + let stored = storage.entries.read().unwrap(); + assert_eq!(stored.len(), 
2); + } + + #[test] + fn test_wl_append_entries_before_existing_log() { + let storage = MemStorage::new(); + + // Add entries starting at index 10 + let entries1 = vec![ + Entry { + index: 10, + term: 2, + ..Default::default() + }, + Entry { + index: 11, + term: 2, + ..Default::default() + }, + ]; + storage.wl_append_entries(&entries1).unwrap(); + + // Append entries starting at index 1 (before existing log) + let entries2 = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + ]; + storage.wl_append_entries(&entries2).unwrap(); + + // Should replace entire log + let stored = storage.entries.read().unwrap(); + assert_eq!(stored.len(), 2); + assert_eq!(stored[0].index, 1); + assert_eq!(stored[1].index, 2); + } + + #[test] + fn test_wl_append_entries_thread_safety() { + let storage = Arc::new(MemStorage::new()); + + // Start with some initial entries using the helper method + storage.append(&[ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + Entry { + index: 3, + term: 1, + ..Default::default() + }, + ]); + + // Spawn multiple threads all appending the same extension + // This tests that concurrent writes are properly serialized by the write lock + let handles: Vec<_> = (0..10) + .map(|_| { + let storage_clone = Arc::clone(&storage); + thread::spawn(move || { + // All threads try to append entries 4 and 5 + let entries = vec![ + Entry { + index: 4, + term: 2, + ..Default::default() + }, + Entry { + index: 5, + term: 2, + ..Default::default() + }, + ]; + storage_clone.wl_append_entries(&entries).unwrap(); + }) + }) + .collect(); + + // Wait for all threads + for handle in handles { + handle.join().expect("Thread should not panic"); + } + + // Verify final state is consistent - should have entries 1-5, no corruption + let stored = storage.entries.read().unwrap(); + assert_eq!(stored.len(), 5, "Should have exactly 5 
entries"); + assert_eq!(stored[0].index, 1); + assert_eq!(stored[3].index, 4); + assert_eq!(stored[4].index, 5); + assert_eq!(stored[3].term, 2); + assert_eq!(stored[4].term, 2); + + // Verify indices are contiguous + for i in 1..stored.len() { + assert_eq!( + stored[i].index, + stored[i - 1].index + 1, + "Indices should be contiguous" + ); + } + } + + #[test] + fn test_wl_append_entries_complex_conflict_resolution() { + let storage = MemStorage::new(); + + // Build log: [1:1, 2:1, 3:1, 4:2, 5:2] + let entries1 = vec![ + Entry { + index: 1, + term: 1, + ..Default::default() + }, + Entry { + index: 2, + term: 1, + ..Default::default() + }, + Entry { + index: 3, + term: 1, + ..Default::default() + }, + Entry { + index: 4, + term: 2, + ..Default::default() + }, + Entry { + index: 5, + term: 2, + ..Default::default() + }, + ]; + storage.wl_append_entries(&entries1).unwrap(); + + // Conflict at index 3: [3:3, 4:3, 5:3, 6:3] + let entries2 = vec![ + Entry { + index: 3, + term: 3, + ..Default::default() + }, + Entry { + index: 4, + term: 3, + ..Default::default() + }, + Entry { + index: 5, + term: 3, + ..Default::default() + }, + Entry { + index: 6, + term: 3, + ..Default::default() + }, + ]; + storage.wl_append_entries(&entries2).unwrap(); + + // Should have: [1:1, 2:1, 3:3, 4:3, 5:3, 6:3] + let stored = storage.entries.read().unwrap(); + assert_eq!(stored.len(), 6); + assert_eq!(stored[0].index, 1); + assert_eq!(stored[0].term, 1); + assert_eq!(stored[1].index, 2); + assert_eq!(stored[1].term, 1); + assert_eq!(stored[2].index, 3); + assert_eq!(stored[2].term, 3); + assert_eq!(stored[3].index, 4); + assert_eq!(stored[3].term, 3); + assert_eq!(stored[4].index, 5); + assert_eq!(stored[4].term, 3); + assert_eq!(stored[5].index, 6); + assert_eq!(stored[5].term, 3); } } diff --git a/docs/product/roadmap.md b/docs/product/roadmap.md index b3e77ed..a98f6e7 100644 --- a/docs/product/roadmap.md +++ b/docs/product/roadmap.md @@ -1,33 +1,86 @@ # Seshat Project Roadmap ## Overview 
-Our roadmap is structured into four progressive phases, each building upon the previous implementation and expanding the system's capabilities. +Our roadmap is structured into progressive phases. **IMPORTANT**: Phase 1 has been revised to reflect the OpenRaft migration dependency discovered during technical review. -## Phase 1: Single Shard Cluster (MVP) -- Focus: Core distributed systems patterns -- Key Deliverables: - - ✅ Redis RESP protocol support (100% complete - 39.9 hours) - - ⏳ Raft consensus implementation (0% - 19 hours estimated) - - ⏳ RocksDB storage (0% - 20 hours estimated) - - ⏳ Chaos testing (0% - 20 hours estimated) - - Basic cluster formation - - Leader election -- Status: **In Progress** - 1/4 features complete (25%) -- Progress: 38.4% by effort (39.9 of 104 estimated hours) -- Timeline: 7-10 working days remaining at current velocity - -### Phase 1 Progress Detail -| Feature | Status | Effort | Tests | Priority | -|---------|--------|--------|-------|----------| -| RESP Protocol | ✅ Complete | 39.9h | 487 passing | - | -| Raft Consensus | ⏳ Not Started | 19h est | 0 | **P1 - BLOCKER** | -| RocksDB Storage | ⏳ Not Started | 20h est | 0 | P2 - Required | -| Chaos Testing | ⏳ Not Started | 20h est | 0 | P3 - Validation | - -### Recommended Next Steps -1. **Start Raft Consensus immediately** - Critical path blocker for all integration work -2. RocksDB Storage - Required for persistence layer -3. 
Chaos Testing - Final validation of distributed system properties +## Phase 1: Single Shard Cluster (MVP) - REVISED + +### Current Status +- **True Progress**: ~15-20% complete (1 of 5-6 major components) +- **Timeline**: 2-3 weeks remaining (vs original 1-2 weeks) +- **Total Effort**: 56-71 hours remaining + +### Phase 1A: Foundation Migration (NEW - 2 weeks) +**Status**: ⏳ Not Started +**Priority**: BLOCKING - Must complete before other Phase 1 work + +- ⏳ **OpenRaft Migration** (0% - 15-21h) - HIGHEST PRIORITY + - Migrate from raft-rs 0.7 to openraft 0.10 + - Eliminate prost version conflicts (0.11 vs 0.14) + - Convert to async-first architecture + - Preserve StateMachine idempotency guarantees + - **Blocks**: RocksDB storage, KV service integration + - Spec: `docs/specs/openraft/` + +### Phase 1B: Storage Layer (1 week) +**Status**: ⏳ Not Started +**Dependencies**: OpenRaft Phase 1-2 must complete first + +- ⏳ **RocksDB Storage** (0% - 14-17h) - HIGH PRIORITY + - Implement 6 column families (system_raft_log, system_raft_state, system_data, data_raft_log, data_raft_state, data_kv) + - Integrate with openraft storage traits (RaftLogReader, RaftSnapshotBuilder, RaftStorage) + - Async trait implementation + - **Blocks**: KV service persistence + - Spec: `docs/specs/rocksdb/` (revised for OpenRaft) + +### Phase 1C: Service Integration (1 week) +**Status**: Partial (RESP only) +**Dependencies**: OpenRaft must be complete + +- ✅ **RESP Protocol** (100% - DONE) - 487 tests passing + - Full Redis protocol support (GET, SET, DEL, EXISTS, PING) + - Property-based testing + - Comprehensive integration tests + +- ⏳ **KV Service Layer** (0% - 11-13h) - HIGH PRIORITY + - Async command handlers with OpenRaft integration + - Input validation (key/value size limits) + - Error handling and leader redirection + - **Depends on**: OpenRaft complete + - Spec: `docs/specs/kvservice/` (revised for async API) + +- ⏳ **Main Binary Orchestration** (0% - 8-10h) - HIGH PRIORITY + - Wire 
RESP → KV → OpenRaft → Storage together + - TCP server on port 6379 + - Tokio async runtime integration + - 3-node cluster bootstrap + +### Phase 1D: Validation (3-4 days) +**Status**: ⏳ Not Started +**Dependencies**: Phase 1C complete + +- ⏳ **Integration Testing** (0% - 4-6h) + - End-to-end request flows + - 3-node cluster behavior validation + - Leader election and failover testing + +- ⏳ **Chaos Testing** (0% - 8-10h) - REQUIRED + - 11 chaos test scenarios + - Network partitions and node failures + - 1+ hour cluster stability test + - Pass all scenarios before Phase 1 complete + +### Success Criteria +- [ ] 3-node cluster runs stably for 1+ hours +- [ ] Pass all 11 chaos test scenarios +- [ ] Performance: >5,000 ops/sec, <10ms p99 latency +- [ ] Zero prost dependency conflicts +- [ ] OpenRaft integration complete with 85+ tests passing + +### Revised Estimates +- **Original Estimate**: 45-55 hours +- **Revised Estimate**: 56-71 hours (+24% increase) +- **Reason**: OpenRaft migration discovered as critical technical debt ## Phase 2: Multi-Shard Cluster - Focus: Horizontal scalability @@ -48,19 +101,69 @@ Our roadmap is structured into four progressive phases, each building upon the p ## Phase 4: Production Readiness - Focus: Enterprise-grade features - Key Deliverables: - - Advanced monitoring + - Advanced monitoring (OpenTelemetry) - Comprehensive security model - Multi-datacenter replication - Compliance and certification - Performance optimizations + - Linearizable reads (ReadIndex mechanism) ## Phase 5: SQL Interface - Focus: Multi-protocol support - Key Deliverables: - - PostgreSQL wire protocol implementation (`protocol-sql` crate) - - SQL service layer (`sql` crate) + - PostgreSQL wire protocol implementation + - SQL service layer - Query planning and optimization - Transaction management - Schema validation - - SQL storage using RocksDB column families - - **Operator-configurable deployment**: Single RocksDB (simple) vs separate instances (isolated) \ No 
newline at end of file + +## Risk Assessment + +### High Risks +1. **OpenRaft Learning Curve**: Async patterns and new API may slow development +2. **Integration Complexity**: Wiring 4 major async components with no incremental testing until complete +3. **Technical Debt Cascade**: OpenRaft may reveal additional incompatibilities + +### Mitigations +- Spec-driven development with detailed task breakdown +- TDD approach with comprehensive test coverage +- Incremental validation gates after each phase +- Early architectural validation before implementation + +## Recommended Next Steps + +**Immediate (This Week)**: +1. Begin OpenRaft migration: `/spec:implement openraft` +2. Focus on Phase 1 (Type System) as foundation + +**Week 2**: +1. Complete OpenRaft Phases 2-4 (Storage, State Machine, Network) +2. Parallel work on RocksDB spec finalization + +**Week 3**: +1. Complete OpenRaft integration (Phases 5-6) +2. Begin RocksDB implementation with openraft traits + +**Week 4**: +1. Complete RocksDB storage +2. Begin KV Service async implementation +3. Wire seshat binary integration + +**Week 5** (if needed): +1. Complete integration testing +2. Run chaos test suite +3. 
Performance validation + +## Progress Tracking + +Use these commands to monitor progress: +- `/spec:progress openraft` - OpenRaft migration progress +- `/spec:progress rocksdb` - RocksDB storage progress +- `/spec:progress kvservice` - KV service progress +- `/product:progress` - Overall project progress + +--- + +**Last Updated**: 2025-10-26 +**Status**: Phase 1A - OpenRaft migration (Not Started) +**Next Milestone**: Complete OpenRaft Phase 1-2 (Type System + Storage Layer) \ No newline at end of file diff --git a/docs/specs/kvservice/design.json b/docs/specs/kvservice/design.json new file mode 100644 index 0000000..33ec5e3 --- /dev/null +++ b/docs/specs/kvservice/design.json @@ -0,0 +1,502 @@ +{ + "feature": "kvservice", + "description": "KV service layer that maps RESP protocol commands to Raft consensus operations", + "requirements": { + "overview": "The KV service layer receives parsed RESP commands from the protocol layer and translates them into Raft proposals or local reads. It implements Redis command semantics, validates inputs, and handles leader redirection.", + "functional_requirements": [ + "Handle GET command - read from leader's state machine", + "Handle SET command - propose write operation via Raft", + "Handle DEL command - propose delete operation via Raft", + "Handle EXISTS command - check key existence on leader", + "Handle PING command - health check without Raft", + "Validate key size <= 256 bytes", + "Validate value size <= 64KB", + "Return MOVED error when not leader", + "Return NOQUORUM error when quorum not reachable" + ], + "non_functional_requirements": [ + "GET latency < 5ms p99 (leader reads)", + "SET latency < 10ms p99 (includes Raft replication)", + "Handle 10,000 concurrent client connections", + "Strong consistency through Raft consensus", + "No data loss on leader failure" + ], + "dependencies": { + "protocol-resp": "Complete - provides RespCommand and RespValue types", + "raft": "In progress - provides RaftNode wrapper with 
propose() and read_local()", + "storage": "Not started - Raft uses this for persistence", + "common": "Provides shared error types and utilities" + } + }, + "technical_needs": { + "domain_model": { + "entities": { + "KvService": "Main service struct that owns Arc<RaftNode>, handles RESP commands and routes to appropriate handlers", + "Operation": "Enum (Set, Del) that represents state machine operations, serialized with bincode for Raft proposals", + "StateMachine": "Applies committed operations to HashMap<Vec<u8>, Vec<u8>>, owned by RaftNode", + "RaftNode": "Consensus wrapper providing propose(op), read_local(key), is_leader(), get_leader_id()" + }, + "services": { + "KvService::handle_command": "Routes RespCommand enum to appropriate handler method", + "handlers": [ + "handle_get(key) -> RespValue - Reads from leader's state machine via RaftNode::read_local()", + "handle_set(key, value) -> RespValue - Validates sizes, creates Operation::Set, calls RaftNode::propose()", + "handle_del(keys) -> RespValue - Creates Operation::Del, calls RaftNode::propose() for each key", + "handle_exists(keys) -> RespValue - Reads from state machine, returns count of existing keys", + "handle_ping(message) -> RespValue - Returns message without accessing Raft" + ], + "validation": "Validates key size (max 256 bytes) and value size (max 64KB) before Raft proposal to prevent resource exhaustion" + }, + "error_handling": { + "NotLeader": "Return RespValue::Error with MOVED when follower receives write", + "NoQuorum": "Return RespValue::Error with NOQUORUM when Raft cannot reach majority", + "KeyTooLarge": "Return RespValue::Error 'ERR key too large' when key > 256 bytes", + "ValueTooLarge": "Return RespValue::Error 'ERR value too large' when value > 64KB" + } + }, + "persistence": { + "mechanism": "Raft consensus with RocksDB backend (indirect - KV service doesn't access storage directly)", + "storage_crate": "storage crate provides 6 column families, accessed only by Raft layer", + "state_machine": 
"In-memory HashMap<Vec<u8>, Vec<u8>> for fast reads, Raft applies committed operations", + "access_pattern": "Writes via RaftNode::propose() -> Raft log -> majority quorum -> StateMachine::apply(). Reads via RaftNode::read_local() from leader's in-memory HashMap", + "durability": "All writes must be committed by Raft majority (2 of 3 nodes) before returning success to client" + }, + "api_specification": { + "type": "Internal Rust API (not HTTP REST)", + "interface": "KvService trait with async methods", + "protocol": "Receives RespCommand from protocol-resp crate, returns RespValue", + "routing": "seshat binary on port 6379 -> RespCodec -> KvService::handle_command() -> Raft layer", + "commands": { + "GET": "async fn handle_get(&self, key: Vec<u8>) -> Result<RespValue, KvServiceError>", + "SET": "async fn handle_set(&self, key: Vec<u8>, value: Vec<u8>) -> Result<RespValue, KvServiceError>", + "DEL": "async fn handle_del(&self, keys: Vec<Vec<u8>>) -> Result<RespValue, KvServiceError>", + "EXISTS": "async fn handle_exists(&self, keys: Vec<Vec<u8>>) -> Result<RespValue, KvServiceError>", + "PING": "async fn handle_ping(&self, message: Option<Vec<u8>>) -> Result<RespValue, KvServiceError>" + } + }, + "component_architecture": { + "layers": "Protocol (RESP) -> Service (KV) -> Consensus (Raft) -> Storage (RocksDB)", + "data_flow": { + "read_path": "Client -> RespCodec -> KvService::handle_get -> RaftNode::read_local -> StateMachine::get -> RespValue", + "write_path": "Client -> RespCodec -> KvService::handle_set -> RaftNode::propose -> Raft replication (2/3 quorum) -> StateMachine::apply -> RespValue::Ok" + }, + "components": { + "KvService": "Owns Arc<RaftNode>, validates inputs, routes commands, formats responses", + "RaftNode": "Raft consensus wrapper in raft crate, handles leader election and replication", + "StateMachine": "Part of raft crate, applies Operation::Set and Operation::Del to HashMap", + "Operation": "State machine command enum in kv crate, serialized with bincode" + }, + "threading": "Async Tokio runtime, KvService methods are async, RaftNode uses channels for thread-safe communication" + }, + "events": { + "raft_log_entries": "Each SET/DEL 
becomes bincode-serialized Operation in Raft log entry, replicated to followers", + "state_machine_apply": "When Raft commits entry, StateMachine::apply(Operation) modifies HashMap atomically", + "leadership_changes": "On leader election, new leader can serve reads immediately, followers return MOVED error", + "client_notifications": "No pub/sub in Phase 1 - clients poll or use blocking operations" + }, + "dependencies": { + "internal_crates": [ + "protocol-resp (complete) - RespCommand, RespValue types", + "raft (in progress) - RaftNode, StateMachine, Storage trait", + "storage (planned) - RocksDB backend for Raft persistence", + "common - Error types, NodeId, configuration structs" + ], + "external_crates": [ + "tokio 1.x - Async runtime for all I/O operations", + "bytes - Zero-copy byte buffer handling for key/value data", + "bincode - Serialize Operation enum for Raft proposals", + "thiserror - Error type definitions for KvService" + ], + "dependency_order": "protocol-resp (complete) -> raft (in progress) -> kv (this feature) -> seshat binary integration", + "blockers": "Requires RaftNode::propose() and RaftNode::read_local() from raft crate to be implemented" + }, + "testing_strategy": { + "unit_tests": "Test each handler in isolation with mock RaftNode (verify validation, error handling)", + "integration_tests": "Test KvService with real RaftNode in single-node mode (no network)", + "property_tests": "Use proptest for key/value size validation edge cases", + "end_to_end_tests": "Full 3-node cluster tests in seshat binary (handled by chaos-testing feature)", + "test_scenarios": [ + "SET on leader succeeds after quorum commit", + "GET on leader returns latest committed value", + "SET on follower returns MOVED error with leader ID", + "Key size exactly 256 bytes is accepted", + "Key size 257 bytes returns error", + "Value size exactly 64KB is accepted", + "Value size 64KB + 1 returns error", + "DEL of non-existent key returns 0", + "EXISTS on missing keys 
returns 0", + "PING with custom message echoes back" + ] + }, + "resource_limits": { + "max_key_size_bytes": 256, + "max_value_size_bytes": 65536, + "max_concurrent_connections": 10000, + "request_timeout_secs": 30, + "raft_rpc_timeout_secs": 5, + "enforcement": "KvService validates sizes before calling Raft to prevent resource exhaustion" + } + }, + "design": { + "architecture_overview": { + "pattern": "Distributed systems service layer with Raft consensus integration", + "description": "KvService is NOT a traditional Router -> Service -> Repository pattern. It's a distributed systems component that integrates with Raft consensus for strong consistency guarantees.", + "core_principle": "All state changes go through Raft consensus. KvService never directly accesses storage - it always routes through RaftNode.", + "data_flow": { + "write_operations": "KvService validates -> serializes Operation -> RaftNode::propose() -> Raft log replication -> majority commit -> StateMachine::apply() -> return success", + "read_operations": "KvService checks leadership -> RaftNode::get() -> StateMachine HashMap -> return value", + "leadership_routing": "Followers redirect writes to leader via MOVED error. Leaders serve both reads and writes." 
+ } + }, + "domain_model": { + "kvservice_struct": { + "definition": "pub struct KvService { raft_node: Arc<RaftNode> }", + "ownership": "Owns Arc<RaftNode> for thread-safe access across async tasks", + "responsibility": "Command routing, input validation, RESP response formatting", + "no_state": "KvService is stateless - all state lives in RaftNode's StateMachine", + "thread_safety": "Clone-able via Arc, safe to use across tokio tasks" + }, + "operation_enum": { + "location": "Already implemented in crates/kv/src/operations.rs", + "variants": [ + "Set { key: Vec<u8>, value: Vec<u8> } - Insert or update key-value pair", + "Del { key: Vec<u8> } - Delete key from state machine" + ], + "serialization": "Implements Serialize/Deserialize using bincode for Raft log entries", + "apply_method": "apply(&self, state: &mut HashMap<Vec<u8>, Vec<u8>>) -> OperationResult<Option<Vec<u8>>>", + "idempotency": "SET is idempotent (last write wins). DEL is idempotent (returns 0 if key doesn't exist)." + }, + "state_machine": { + "location": "Implemented in crates/raft/src/state_machine.rs", + "storage": "In-memory HashMap<Vec<u8>, Vec<u8>> for fast reads", + "apply_logic": "Deserializes Operation from bytes, calls Operation::apply(), updates last_applied index", + "last_applied": "Tracks highest log index applied to prevent duplicate application", + "persistence": "State machine is rebuilt from Raft log on restart (or from snapshot in Phase 1+)" + }, + "raft_node": { + "location": "Implemented in crates/raft/src/node.rs", + "key_methods": [ + "propose(data: Vec<u8>) -> Result<()> - Submit write to Raft (only works on leader)", + "get(key: &[u8]) -> Option<Vec<u8>> - Read from state machine", + "is_leader() -> bool - Check if this node is leader", + "leader_id() -> Option<u64> - Get current leader ID for redirection" + ], + "consensus": "Wraps raft-rs RawNode, handles leader election, log replication, commit tracking", + "gRPC": "Integrated gRPC transport for AppendEntries, RequestVote, InstallSnapshot messages" + } + }, + "api_specification": { + "type": "Internal Rust 
async API (not HTTP REST or gRPC)", + "interface": "KvService methods are async and return Result<RespValue, KvServiceError>", + "method_signatures": { + "handle_get": { + "signature": "async fn handle_get(&self, key: Vec<u8>) -> Result<RespValue, KvServiceError>", + "behavior": "1. Check if leader via raft_node.is_leader() (optional for reads). 2. Call raft_node.get(key). 3. Return RespValue::BulkString(value) or RespValue::Null", + "errors": "No Raft proposal errors - reads are local", + "latency_target": "< 5ms p99 (in-memory HashMap lookup)" + }, + "handle_set": { + "signature": "async fn handle_set(&self, key: Vec<u8>, value: Vec<u8>) -> Result<RespValue, KvServiceError>", + "behavior": "1. Validate key size <= 256 bytes. 2. Validate value size <= 64KB. 3. Create Operation::Set. 4. Serialize with bincode. 5. Call raft_node.propose(). 6. Wait for commit (via ready loop). 7. Return RespValue::Ok", + "errors": "KeyTooLarge, ValueTooLarge, NotLeader, NoQuorum, ProposalFailed", + "latency_target": "< 10ms p99 (includes Raft replication to 2/3 nodes)" + }, + "handle_del": { + "signature": "async fn handle_del(&self, keys: Vec<Vec<u8>>) -> Result<RespValue, KvServiceError>", + "behavior": "1. Validate each key size <= 256 bytes. 2. For each key, create Operation::Del and propose. 3. Track success count. 4. Return RespValue::Integer(deleted_count)", + "errors": "KeyTooLarge, NotLeader, NoQuorum", + "redis_semantics": "DEL accepts multiple keys, returns count of deleted keys (0 if none existed)" + }, + "handle_exists": { + "signature": "async fn handle_exists(&self, keys: Vec<Vec<u8>>) -> Result<RespValue, KvServiceError>", + "behavior": "1. For each key, call raft_node.get(key). 2. Count how many exist. 3. Return RespValue::Integer(exists_count)", + "errors": "None (reads are always safe)", + "redis_semantics": "EXISTS accepts multiple keys, returns count of existing keys" + }, + "handle_ping": { + "signature": "async fn handle_ping(&self, message: Option<Vec<u8>>) -> Result<RespValue, KvServiceError>", + "behavior": "1. If message is Some(msg), return RespValue::BulkString(msg). 2. 
If None, return RespValue::SimpleString(\"PONG\")", + "errors": "None (no Raft interaction)", + "redis_semantics": "PING without args returns PONG. PING echoes message." + } + } + }, + "component_structure": { + "kvservice_module": { + "file": "crates/kv/src/service.rs (new file to create)", + "exports": "pub struct KvService, pub enum KvServiceError", + "dependencies": [ + "use crate::Operation - from operations.rs", + "use seshat_raft::RaftNode - from raft crate", + "use seshat_protocol_resp::{RespCommand, RespValue} - from protocol-resp crate", + "use std::sync::Arc - for thread-safe RaftNode sharing", + "use thiserror::Error - for error type definitions" + ] + }, + "error_types": { + "definition": "pub enum KvServiceError", + "variants": [ + "KeyTooLarge { size: usize } - Key exceeds 256 bytes", + "ValueTooLarge { size: usize } - Value exceeds 64KB", + "NotLeader { leader_id: Option } - Write on follower, redirect to leader", + "NoQuorum - Cannot reach Raft majority", + "ProposalFailed(String) - Raft proposal rejected", + "SerializationError(bincode::Error) - Operation serialization failed", + "RaftError(Box) - Internal Raft error" + ], + "to_resp_value": "impl method to convert KvServiceError to RespValue::Error for client responses" + }, + "validation_logic": { + "max_key_size": "const MAX_KEY_SIZE: usize = 256; // bytes", + "max_value_size": "const MAX_VALUE_SIZE: usize = 65536; // 64 KB", + "enforcement": "validate_key_size() and validate_value_size() helper methods", + "early_validation": "Validate BEFORE creating Operation and proposing to Raft to prevent resource exhaustion" + } + }, + "data_flow_diagrams": { + "write_path_set": { + "step_1": "TCP Client sends: SET foo bar", + "step_2": "seshat binary TcpListener receives on port 6379", + "step_3": "RespCodec parses into RespCommand::Set{key: \"foo\", value: \"bar\"}", + "step_4": "seshat binary routes to KvService::handle_set(b\"foo\", b\"bar\")", + "step_5": "KvService validates key (3 bytes) <= 256 
bytes ✓", + "step_6": "KvService validates value (3 bytes) <= 64KB ✓", + "step_7": "KvService creates Operation::Set { key: b\"foo\", value: b\"bar\" }", + "step_8": "KvService serializes Operation with bincode", + "step_9": "KvService calls raft_node.propose(serialized_op)", + "step_10": "RaftNode checks is_leader(). If follower, return NotLeader error -> KvService returns MOVED ", + "step_11": "RaftNode (leader) appends entry to local Raft log", + "step_12": "RaftNode sends AppendEntries RPC via gRPC to followers (node 2, node 3)", + "step_13": "Followers append entry to their logs and respond with success", + "step_14": "Once majority (2/3) respond, RaftNode commits the entry", + "step_15": "RaftNode calls StateMachine::apply(index, serialized_op)", + "step_16": "StateMachine deserializes Operation::Set", + "step_17": "StateMachine calls op.apply(&mut hashmap) -> inserts (\"foo\", \"bar\")", + "step_18": "RaftNode signals commit success to waiting propose() call", + "step_19": "KvService receives success, returns RespValue::Ok", + "step_20": "RespCodec encodes \"+OK\\r\\n\" and sends to client" + }, + "read_path_get": { + "step_1": "TCP Client sends: GET foo", + "step_2": "seshat binary TcpListener receives on port 6379", + "step_3": "RespCodec parses into RespCommand::Get{key: \"foo\"}", + "step_4": "seshat binary routes to KvService::handle_get(b\"foo\")", + "step_5": "KvService calls raft_node.get(b\"foo\")", + "step_6": "RaftNode reads from StateMachine HashMap (in-memory, no Raft consensus needed)", + "step_7": "StateMachine returns Some(b\"bar\") if key exists, None otherwise", + "step_8": "KvService converts to RespValue::BulkString(b\"bar\") or RespValue::Null", + "step_9": "RespCodec encodes \"$3\\r\\nbar\\r\\n\" or \"$-1\\r\\n\" and sends to client", + "note": "Reads are served from leader's in-memory state machine WITHOUT Raft consensus in Phase 1. Phase 4 adds linearizable reads with ReadIndex." 
+ }, + "follower_write_redirect": { + "step_1": "Client sends SET to follower node", + "step_2": "KvService::handle_set validates and creates Operation", + "step_3": "Calls raft_node.propose()", + "step_4": "RaftNode checks is_leader() -> false", + "step_5": "RaftNode returns NotLeader error", + "step_6": "KvService calls raft_node.leader_id() -> Some(1)", + "step_7": "KvService returns KvServiceError::NotLeader { leader_id: Some(1) }", + "step_8": "Error converted to RespValue::Error(\"-MOVED 1\\r\\n\")", + "step_9": "Client receives error, reconnects to leader node 1, retries SET" + } + }, + "error_handling_strategy": { + "validation_errors": { + "key_too_large": "Return RespValue::Error(\"ERR key too large (max 256 bytes)\") immediately, no Raft interaction", + "value_too_large": "Return RespValue::Error(\"ERR value too large (max 64KB)\") immediately, no Raft interaction", + "when": "Before creating Operation, before calling raft_node.propose()" + }, + "leadership_errors": { + "not_leader_with_known_leader": "Return RespValue::Error(\"-MOVED \\r\\n\") so client can redirect", + "not_leader_no_leader": "Return RespValue::Error(\"-NOLEADER election in progress\\r\\n\") so client can retry", + "when": "On propose() failure when this node is follower or candidate" + }, + "quorum_errors": { + "no_quorum": "Return RespValue::Error(\"-NOQUORUM cannot reach majority\\r\\n\") when Raft cannot commit", + "timeout": "If commit doesn't happen within request_timeout_secs (30s), return timeout error", + "when": "When Raft majority is unreachable (network partition, node failures)" + }, + "serialization_errors": { + "bincode_error": "Return RespValue::Error(\"ERR internal serialization error\") - should never happen with valid Operation", + "when": "Operation::serialize() fails (defensive coding)" + }, + "raft_internal_errors": { + "handling": "Log error with tracing::error!, return RespValue::Error(\"ERR internal Raft error\")", + "examples": "Storage corruption, invalid log 
entries, state machine inconsistencies", + "recovery": "Node should crash and restart (fail-fast principle) on invariant violations" + } + }, + "concurrency_model": { + "async_runtime": "Tokio 1.x for all I/O operations", + "kvservice_cloning": "KvService wraps Arc, is Clone-able, safe to share across tokio tasks", + "raft_node_sharing": "Arc provides thread-safe access to Raft state", + "state_machine_locking": "StateMachine uses internal RwLock for concurrent reads, exclusive writes on apply", + "propose_concurrency": "Multiple concurrent propose() calls are safe - Raft serializes them into log order", + "read_concurrency": "Reads are concurrent-safe via RwLock read locks on StateMachine HashMap", + "no_blocking": "All KvService methods are async, never block tokio runtime", + "channel_communication": "RaftNode uses tokio::sync::mpsc channels for internal message passing" + }, + "testing_strategy_detailed": { + "unit_tests": { + "file": "crates/kv/src/service.rs (in #[cfg(test)] mod tests)", + "approach": "Mock RaftNode using trait-based dependency injection or test doubles", + "test_cases": [ + "test_handle_get_returns_value_when_key_exists", + "test_handle_get_returns_null_when_key_missing", + "test_handle_set_validates_key_size", + "test_handle_set_validates_value_size", + "test_handle_set_key_exactly_256_bytes_accepted", + "test_handle_set_key_257_bytes_rejected", + "test_handle_set_value_exactly_64kb_accepted", + "test_handle_set_value_64kb_plus_1_rejected", + "test_handle_del_single_key", + "test_handle_del_multiple_keys", + "test_handle_del_nonexistent_key_returns_zero", + "test_handle_exists_multiple_keys", + "test_handle_exists_no_keys_exist_returns_zero", + "test_handle_ping_no_message_returns_pong", + "test_handle_ping_with_message_echoes_back", + "test_not_leader_error_includes_leader_id", + "test_serialization_error_handling" + ] + }, + "integration_tests": { + "file": "crates/kv/tests/integration_test.rs (new file)", + "approach": "Use real RaftNode 
in single-node cluster mode (no network)", + "test_cases": [ + "test_set_then_get_returns_value", + "test_del_then_get_returns_null", + "test_set_overwrites_existing_value", + "test_exists_after_set_returns_one", + "test_exists_after_del_returns_zero", + "test_multiple_sets_in_sequence", + "test_concurrent_operations" + ], + "setup": "Create RaftNode with single peer, tick until leader, create KvService, test handlers" + }, + "property_tests": { + "file": "crates/kv/tests/property_test.rs (new file)", + "approach": "Use proptest to generate random key/value sizes and binary data", + "test_cases": [ + "prop_key_size_boundary_256_bytes (test around exact boundary)", + "prop_value_size_boundary_64kb (test around exact boundary)", + "prop_binary_data_roundtrip (arbitrary bytes in keys/values)", + "prop_empty_keys_and_values (edge cases)", + "prop_utf8_and_non_utf8_data (mixed encodings)" + ] + }, + "end_to_end_tests": { + "location": "Handled by seshat binary integration tests and chaos-testing feature", + "scenarios": "Full 3-node cluster with real networking, partitions, failures", + "out_of_scope": "Not part of kv crate testing - tested at system level" + } + }, + "performance_considerations": { + "read_latency": { + "target": "< 5ms p99 for GET on leader", + "optimization": "In-memory HashMap lookup (no disk I/O, no network)", + "bottleneck": "RwLock contention under high read concurrency", + "mitigation": "StateMachine uses RwLock to allow concurrent reads" + }, + "write_latency": { + "target": "< 10ms p99 for SET with Raft replication", + "breakdown": "1ms validation + 2ms serialization + 5ms network round-trip + 2ms apply", + "bottleneck": "Network latency for AppendEntries RPC to followers", + "mitigation": "Batching multiple proposals into single Raft entry (Phase 2 optimization)" + }, + "throughput": { + "target": "> 5,000 ops/sec per node", + "read_throughput": "Limited by CPU (HashMap lookups) and RwLock contention", + "write_throughput": "Limited by Raft 
consensus (can only commit as fast as majority responds)", + "scaling": "Phase 2 adds multi-shard parallelism for horizontal write scaling" + }, + "memory_usage": { + "state_machine": "HashMap size = total key-value data size (unbounded)", + "raft_log": "Grows until snapshot (max ~100MB before compaction)", + "operation_overhead": "Each Operation has ~48 bytes overhead (enum tag + Vec allocations)", + "mitigation": "Phase 1 adds log compaction and snapshots to bound memory growth" + }, + "zero_copy_opportunities": { + "bytes_crate": "Use bytes::Bytes for key/value data to avoid copies", + "limitation": "HashMap, Vec> currently requires owned data", + "future": "Phase 3 could use bytes::Bytes in StateMachine for zero-copy reads" + } + }, + "phase_1_limitations": { + "no_follower_reads": "Only leader serves reads in Phase 1. Phase 4 adds follower reads with ReadIndex.", + "no_batching": "Each SET/DEL is separate Raft proposal. Phase 2 adds batching for throughput.", + "no_pipelining": "Client waits for each operation to commit. Phase 3 adds request pipelining.", + "no_ttl": "No expiration support. Phase 3 adds TTL for keys.", + "simple_data_types": "Only byte arrays. Phase 3 adds Redis data types (lists, sets, hashes).", + "linearizable_reads": "Reads may return stale data during leadership transitions. Phase 4 adds ReadIndex mechanism." 
+ }, + "implementation_dependencies": { + "requires": [ + "RaftNode::propose() - DONE (implemented in node.rs)", + "RaftNode::get() - DONE (implemented in node.rs)", + "RaftNode::is_leader() - DONE (implemented in node.rs)", + "RaftNode::leader_id() - DONE (implemented in node.rs)", + "Operation::serialize() - DONE (implemented in operations.rs)", + "Operation::deserialize() - DONE (implemented in operations.rs)", + "RespCommand and RespValue - DONE (protocol-resp crate complete)" + ], + "blockers": [ + "None - all required Raft functionality is implemented", + "Can begin KvService implementation immediately" + ], + "provides": [ + "KvService struct for seshat binary integration", + "Complete Redis command handlers (GET, SET, DEL, EXISTS, PING)", + "Input validation and error handling layer", + "RESP protocol integration with Raft consensus" + ] + }, + "observability": { + "structured_logging": { + "framework": "tracing crate with tracing::instrument on all handler methods", + "log_levels": [ + "ERROR: Raft errors, serialization failures, invariant violations", + "WARN: Leadership transitions, quorum loss, validation rejections", + "INFO: Command handling, leader redirection", + "DEBUG: Key/value sizes, operation details", + "TRACE: Full serialized operation bytes, detailed Raft interactions" + ], + "context": "Include node_id, command_type, key_size, value_size in all logs" + }, + "metrics": { + "phase_4": "OpenTelemetry metrics for production (not Phase 1)", + "key_metrics": [ + "kv_commands_total{command, status} - Counter of all commands", + "kv_command_duration_seconds{command} - Histogram of latencies", + "kv_validation_errors_total{error_type} - Counter of validation failures", + "kv_raft_proposals_total{status} - Counter of Raft proposals (success/failure)", + "kv_not_leader_redirects_total - Counter of MOVED responses", + "kv_key_size_bytes - Histogram of key sizes", + "kv_value_size_bytes - Histogram of value sizes" + ] + } + } + }, + 
"implementation_notes": { + "phase": "Phase 1 - MVP", + "status": "Partial - Operation enum exists, KvService handlers not yet implemented", + "next_steps": [ + "Implement KvService struct with Arc field", + "Implement handle_get with leader check and state machine read", + "Implement handle_set with size validation and Raft proposal", + "Implement handle_del with Raft proposal", + "Implement handle_exists with state machine reads", + "Implement handle_ping (trivial, no Raft interaction)", + "Add unit tests for each handler", + "Integration test with single-node Raft cluster" + ], + "future_enhancements": [ + "Phase 2: Multi-shard routing (consistent hashing)", + "Phase 2: Cross-shard operations", + "Phase 3: TTL support (expiring keys)", + "Phase 3: Additional Redis data types (lists, sets, hashes)", + "Phase 4: Read-your-writes consistency for followers", + "Phase 4: Follower reads with linearizability checks" + ] + } +} diff --git a/docs/specs/kvservice/design.md b/docs/specs/kvservice/design.md new file mode 100644 index 0000000..7cd66b3 --- /dev/null +++ b/docs/specs/kvservice/design.md @@ -0,0 +1,1419 @@ +# Technical Design: KV Service + +## Overview + +The KV service layer is a critical component of Seshat that bridges the RESP protocol layer with the Raft consensus system. It receives parsed RESP commands, translates them into Raft proposals or local reads, implements Redis command semantics, validates inputs, and handles leader redirection. + +**Core Principle**: All state changes go through Raft consensus. KvService never directly accesses storage - it always routes through RaftNode. + +## Dependencies + +**⚠️ CRITICAL BLOCKER**: This specification requires the **OpenRaft migration** (see `docs/specs/openraft/`) to be **100% complete** before KvService implementation can begin. The design uses async/await patterns that depend on OpenRaft's async APIs. + +**Implementation Order**: +1. 
**Complete OpenRaft migration first** (all 6 phases in `docs/specs/openraft/`) +2. Complete RocksDB storage integration (see `docs/specs/rocksdb/`) +3. Then implement KvService with async handlers + +**Required OpenRaft Changes**: +- `raft_node.propose(data).await` - **NOW ASYNC**: Returns `Result` +- `raft_node.get(key)` - **STILL SYNC**: Direct StateMachine access (NO internal leadership check) +- `raft_node.is_leader().await` - **NOW ASYNC**: Must be called before reads +- `raft_node.leader_id().await` - **NOW ASYNC**: For MOVED error responses +- **Error types**: Changed from raft-rs to openraft errors +- **Serialization**: Changed from bincode to protobuf (prost) + +These dependencies exist because: +- OpenRaft provides async APIs (`propose().await`) that KvService requires +- RocksDB provides persistent storage that StateMachine requires +- KvService cannot function without both consensus (OpenRaft) and persistence (RocksDB) + +## Architecture Pattern + +### Distributed Systems Service Layer + +Unlike traditional web applications that follow the Router → Service → Repository → Entity → Database pattern, KvService is a **distributed systems component** that integrates with Raft consensus for strong consistency guarantees. + +```mermaid +graph TD + Client[TCP Client
Port 6379] --> RespCodec[RESP Protocol
Parser/Encoder] + RespCodec --> KvService[KV Service
Command Handlers] + KvService --> RaftNode[Raft Node
Consensus Layer] + RaftNode --> StateMachine[State Machine
HashMap] + RaftNode --> RaftLog[Raft Log
RocksDB] + RaftNode -->|AppendEntries
gRPC| Follower1[Follower Node 2] + RaftNode -->|AppendEntries
gRPC| Follower2[Follower Node 3] +``` + +### Data Flow Patterns + +**Write Operations**: +``` +KvService validates → serializes Operation → RaftNode::propose() → +Raft log replication → majority commit → StateMachine::apply() → return success +``` + +**Read Operations**: +``` +KvService checks leadership → RaftNode::get() → StateMachine HashMap → return value +``` + +**Leadership Routing**: +Followers redirect writes to leader via MOVED error. Leaders serve both reads and writes. + +## Domain Model + +### KvService Struct + +```rust +pub struct KvService { + raft_node: Arc +} +``` + +**Ownership**: Owns `Arc` for thread-safe access across async tasks + +**Responsibility**: Command routing, input validation, RESP response formatting + +**No State**: KvService is stateless - all state lives in RaftNode's StateMachine + +**Thread Safety**: Clone-able via Arc, safe to use across tokio tasks + +### Operation Enum + +**Location**: `crates/kv/src/operations.rs` (already implemented) + +```rust +use prost::Message; + +#[derive(Message)] +pub enum Operation { + /// Insert or update key-value pair + Set { key: Vec, value: Vec }, + /// Delete key from state machine + Del { key: Vec } +} + +impl Operation { + pub fn apply(&self, state: &mut HashMap, Vec>) -> OperationResult> { + // Apply operation to state machine + } + + pub fn encode_to_vec(&self) -> Vec { + // Protobuf serialization + } + + pub fn decode(data: &[u8]) -> Result { + // Protobuf deserialization + } +} +``` + +**Serialization**: Uses **protobuf (prost)** for Raft log entries (replaces bincode) + +**Rationale for Protobuf**: +- Schema evolution support (forward/backward compatibility) +- Consistent with storage layer and network layer +- Better cross-language compatibility +- Aligns with OpenRaft's recommended approach + +**Idempotency**: +- SET is idempotent (last write wins) +- DEL is idempotent (returns 0 if key doesn't exist) + +### StateMachine + +**Location**: `crates/raft/src/state_machine.rs` 
(implemented) + +```rust +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct StateMachine { + /// The key-value data store + data: HashMap<Vec<u8>, Vec<u8>>, + /// The last applied log index + last_applied: u64, +} +``` + +**Key Methods**: +- `get(&self, key: &[u8]) -> Option<Vec<u8>>` - Read-only lookup, returns cloned value +- `exists(&self, key: &[u8]) -> bool` - Check key existence without cloning value +- `apply(&mut self, index: u64, data: &[u8]) -> Result<Vec<u8>>` - Apply log entry +- `snapshot(&self) -> Result<Vec<u8>>` - Serialize entire state for snapshots +- `restore(&mut self, snapshot: &[u8]) -> Result<()>` - Restore from snapshot +- `last_applied(&self) -> u64` - Get highest applied log index + +**Storage**: In-memory `HashMap<Vec<u8>, Vec<u8>>` for fast lookups + +**Thread Safety**: +- StateMachine itself is NOT thread-safe (no internal locking) +- RaftNode wraps it with appropriate synchronization (RwLock or Mutex) +- Multiple concurrent reads are safe via RaftNode's locking +- Writes are serialized through Raft consensus + +**Apply Logic**: +1. Idempotency check - rejects if `index <= last_applied` +2. Deserializes Operation from bytes +3. Calls `Operation::apply()` on internal HashMap +4. Updates `last_applied` to track progress +5. 
Returns operation result bytes + +**Last Applied**: Prevents duplicate application and ensures exactly-once semantics + +**Persistence**: State machine is rebuilt from Raft log on restart (or restored from snapshot) + +### RaftNode + +**Location**: `crates/raft/src/node.rs` (implemented with OpenRaft) + +**Key Methods** (After OpenRaft Migration): +- `async fn propose(&self, data: Vec<u8>) -> Result<Vec<u8>>` - **ASYNC** - Submit write to Raft, waits for commit +- `fn get(&self, key: &[u8]) -> Option<Vec<u8>>` - **SYNC** - Direct StateMachine access (**WARNING: Does NOT check leadership internally**) +- `async fn is_leader(&self) -> bool` - **ASYNC** - Check if this node is leader +- `async fn leader_id(&self) -> Option<u64>` - **ASYNC** - Get current leader ID for redirection + +**Consensus**: Wraps openraft `Raft`, handles leader election, log replication, commit tracking + +**gRPC**: Integrated gRPC transport for AppendEntries, RequestVote, InstallSnapshot messages + +### OpenRaft API Changes + +**IMPORTANT**: After OpenRaft migration, RaftNode uses openraft's async API. + +**API Changes from raft-rs**: +```rust +// OLD (raft-rs - synchronous): +impl RaftNode { + fn propose(&mut self, data: Vec<u8>) -> Result<()> // Sync + fn get(&self, key: &[u8]) -> Option<Vec<u8>> // Sync + fn is_leader(&self) -> bool // Sync + fn leader_id(&self) -> Option<u64> // Sync +} + +// NEW (openraft - asynchronous): +impl RaftNode { + async fn propose(&self, data: Vec<u8>) -> Result<Vec<u8>> // Async! + fn get(&self, key: &[u8]) -> Option<Vec<u8>> // Still sync (direct StateMachine access) + async fn is_leader(&self) -> bool // Async! + async fn leader_id(&self) -> Option<u64> // Async! 
+} +``` + +**Critical Leadership Check Contract**: +- `RaftNode::get()` does NOT check leadership internally (direct HashMap access) +- KvService MUST call `is_leader().await` before calling `get()` +- Without this check, followers will serve stale reads +- This is intentional in Phase 1 for simplicity (Phase 4 adds ReadIndex for linearizable reads) + +**Error Type Changes**: +- raft-rs errors → openraft errors +- Different error variants and semantics +- KvServiceError must map from openraft error types + +## API Specification + +### Type +Internal Rust async API (not HTTP REST or gRPC) + +### Interface +KvService methods are async and return `Result` + +### Command Handlers + +#### handle_get + +```rust +async fn handle_get(&self, key: Vec) -> Result +``` + +**Behavior**: +1. **MUST** check if leader via `raft_node.is_leader().await` for consistency +2. If not leader, return `NotLeader` error +3. If leader, call `raft_node.get(&key)` (sync - direct StateMachine HashMap access) +4. Return `RespValue::BulkString(value)` or `RespValue::Null` + +**Implementation**: +```rust +async fn handle_get(&self, key: Vec) -> Result { + // MUST check leadership first (get() doesn't check internally) + if !self.raft_node.is_leader().await { // <-- .await added + let leader_id = self.raft_node.leader_id().await; // <-- .await added + return Err(KvServiceError::NotLeader { leader_id }); + } + + // get() is sync (direct StateMachine access) + match self.raft_node.get(&key) { + Some(value) => Ok(RespValue::BulkString(value.into())), + None => Ok(RespValue::Null), + } +} +``` + +**Read Consistency Model - Leadership Transition Race**: +- Reads may be stale during leadership transitions (window of ~100ms) +- While KvService checks `is_leader().await` before reads, leadership can change before `get()` completes +- `is_leader()` is async (checks openraft state) +- `get()` is sync (direct StateMachine HashMap access, no internal leadership check) +- **Race window**: Between 
`is_leader().await` returning true and `get()` executing +- Old leader may serve reads briefly after losing leadership +- New leader may not have all committed entries immediately after election +- Clients may observe "time travel": newer value → older value → newer value +- Phase 4 will add linearizable reads via ReadIndex to eliminate this race + +**Errors**: `NotLeader` if not leader (prevents most stale reads, but race condition still exists) + +**Latency Target**: < 5ms p99 (in-memory HashMap lookup + leadership check) + +#### handle_set + +```rust +async fn handle_set(&self, key: Vec, value: Vec) -> Result +``` + +**Behavior**: +1. Validate key size <= 256 bytes +2. Validate value size <= 64KB +3. Create `Operation::Set { key, value }` +4. Serialize with protobuf: `operation.encode_to_vec()` +5. Call `raft_node.propose(serialized_op).await` (async/await - waits for commit) +6. Return `RespValue::Ok` on success + +**Implementation**: +```rust +async fn handle_set(&self, key: Vec, value: Vec) -> Result { + // Validation + validate_key_size(&key)?; + validate_value_size(&value)?; + + // Serialize operation with protobuf + let operation = Operation::Set { key, value }; + use prost::Message; + let data = operation.encode_to_vec(); + + // Propose via Raft (async with openraft) + self.raft_node.propose(data).await?; // <-- .await added + + Ok(RespValue::SimpleString("OK".into())) +} +``` + +**Errors**: `KeyTooLarge`, `ValueTooLarge`, `NotLeader`, `NoQuorum`, `ProposalFailed` + +**Latency Target**: < 10ms p99 (includes Raft replication to 2/3 nodes) + +#### handle_del + +```rust +async fn handle_del(&self, keys: Vec>) -> Result +``` + +**Behavior**: +1. Validate each key size <= 256 bytes +2. For each key: + - Create `Operation::Del { key }` + - Serialize with protobuf: `operation.encode_to_vec()` + - Call `raft_node.propose(serialized_op).await` (async/await - waits for commit) + - Track success/failure +3. Accumulate deleted count from successful proposals +4. 
Return `RespValue::Integer(deleted_count)` + +**Implementation**: +```rust +async fn handle_del(&self, keys: Vec>) -> Result { + // Validation + for key in &keys { + validate_key_size(key)?; + } + + let mut deleted_count = 0; + + // Each key is a separate Raft proposal + for key in keys { + let operation = Operation::Del { key }; + use prost::Message; + let data = operation.encode_to_vec(); + + // Propose (async) + match self.raft_node.propose(data).await { // <-- .await added + Ok(response) => { + // Parse response to get deletion count (1 or 0) + deleted_count += parse_del_result(&response)?; + } + Err(e) => { + // Partial failure - return what we deleted so far with error + return Err(KvServiceError::PartialFailure { + completed: deleted_count, + error: Box::new(e), + }); + } + } + } + + Ok(RespValue::Integer(deleted_count)) +} +``` + +**Multi-Key Atomicity Semantics**: + +**Approach**: Each key is a **separate Raft proposal** (not batched into a single operation) + +**Rationale**: +- **Matches Redis behavior**: Redis DEL is NOT atomic across multiple keys - each key deletion is independent +- **Allows partial success**: If key1 deletes successfully but key2 fails (e.g., quorum loss), DEL returns accurate count +- **Simpler implementation**: Reuses existing `Operation::Del { key }` without adding `Operation::DelMulti` variant +- **Better fault tolerance**: Failures on one key don't prevent other keys from being deleted +- **Phase 1 simplicity**: Batched operations are a Phase 2 optimization + +**Partial Failure Handling**: +- If a proposal fails (NotLeader, NoQuorum, timeout), stop processing remaining keys +- Return the count of successfully deleted keys up to that point +- Client can detect partial failure if returned count < requested key count +- Client is responsible for retry logic on partial failures + +**Deleted Count Tracking**: +- Each `Operation::Del::apply()` returns `b"1"` if key existed, `b"0"` if not +- After each successful proposal/commit, parse 
the result bytes and increment counter +- Non-existent keys contribute 0 to the count (matches Redis semantics) +- Final count = sum of all successfully applied deletions + +**Performance Trade-off**: +- **Slower than batching**: N keys require N Raft consensus rounds instead of 1 +- **Acceptable for Phase 1**: Simplicity and learning objectives outweigh throughput +- **Future optimization**: Phase 2 can add `Operation::DelMulti` with batched replication + +**Errors**: +- `KeyTooLarge` - If any key exceeds 256 bytes (validation happens before proposals) +- `NotLeader` - If not leader when first proposal attempted +- `NoQuorum` - If cannot reach majority during any proposal +- `ProposalFailed` - If Raft rejects proposal for any key + +**Error Handling During Multi-Key**: +- **Early validation failure**: Return error immediately without proposing any keys +- **First proposal fails**: Return error with deleted_count = 0 +- **Mid-sequence failure**: Return error with partial deleted_count (only successfully committed deletions) +- **Leadership change mid-operation**: New proposals fail with NotLeader, return partial count + +**Redis Semantics Match**: +- DEL accepts multiple keys, returns count of deleted keys (0 if none existed) +- Non-atomic behavior matches Redis Cluster semantics (keys may be on different nodes/shards) +- Idempotent: Deleting non-existent key returns 0, doesn't fail + +**Latency Target**: < 10ms p99 per key (each key requires Raft replication) + +#### handle_exists + +```rust +async fn handle_exists(&self, keys: Vec>) -> Result +``` + +**Behavior**: +1. **MUST** check if leader via `raft_node.is_leader().await` for consistency +2. If not leader, return `NotLeader` error +3. For each key, call `raft_node.get(key)` (sync) +4. Count how many exist +5. 
Return `RespValue::Integer(exists_count)` + +**Implementation**: +```rust +async fn handle_exists(&self, keys: Vec>) -> Result { + // MUST check leadership (get() doesn't check internally) + if !self.raft_node.is_leader().await { // <-- .await added + let leader_id = self.raft_node.leader_id().await; // <-- .await added + return Err(KvServiceError::NotLeader { leader_id }); + } + + let mut exists_count = 0; + for key in &keys { + if self.raft_node.get(key).is_some() { + exists_count += 1; + } + } + + Ok(RespValue::Integer(exists_count)) +} +``` + +**Errors**: `NotLeader` if not leader (prevents most stale reads, but race condition still exists) + +**Redis Semantics**: EXISTS accepts multiple keys, returns count of existing keys + +#### handle_ping + +```rust +async fn handle_ping(&self, message: Option>) -> Result +``` + +**Behavior**: +1. If message is `Some(msg)`, return `RespValue::BulkString(msg)` +2. If None, return `RespValue::SimpleString("PONG")` + +**Implementation**: +```rust +async fn handle_ping(&self, message: Option>) -> Result { + // No async needed - no Raft interaction + match message { + Some(msg) => Ok(RespValue::BulkString(msg.into())), + None => Ok(RespValue::SimpleString("PONG".into())), + } +} +``` + +**Errors**: None (no Raft interaction) + +**Redis Semantics**: PING without args returns PONG. PING echoes message. 
+ +## Component Structure + +### KvService Module + +**File**: `crates/kv/src/service.rs` (new file to create) + +**Exports**: `pub struct KvService`, `pub enum KvServiceError` + +**Dependencies**: +```rust +use crate::Operation; // from operations.rs +use seshat_raft::RaftNode; // from raft crate +use seshat_protocol_resp::{RespCommand, RespValue}; // from protocol-resp crate +use std::sync::Arc; // for thread-safe RaftNode sharing +use thiserror::Error; // for error type definitions +``` + +### Error Types + +```rust +#[derive(Debug, Error)] +pub enum KvServiceError { + /// Key exceeds 256 bytes + #[error("key too large: {size} bytes (max 256)")] + KeyTooLarge { size: usize }, + + /// Value exceeds 64KB + #[error("value too large: {size} bytes (max 64KB)")] + ValueTooLarge { size: usize }, + + /// Write on follower, redirect to leader + #[error("not leader, redirect to node {leader_id:?}")] + NotLeader { leader_id: Option<u64> }, + + /// Cannot reach Raft majority + #[error("no quorum available")] + NoQuorum, + + /// Raft proposal rejected + #[error("proposal failed: {0}")] + ProposalFailed(String), + + /// Operation serialization failed (protobuf) + #[error("serialization error: {0}")] + SerializationError(#[from] prost::DecodeError), + + /// OpenRaft error (replaces RaftError) + #[error("raft error: {0}")] + OpenRaftError(Box<dyn std::error::Error + Send + Sync>), + + /// Partial multi-key operation failure (for DEL with multiple keys) + #[error("partial failure: completed {completed} operations before error")] + PartialFailure { + completed: i64, + error: Box<KvServiceError>, + }, +} + +// Implement From for openraft errors +impl From<openraft::error::ClientWriteError<u64>> for KvServiceError { + fn from(err: openraft::error::ClientWriteError<u64>) -> Self { + match err { + openraft::error::ClientWriteError::ForwardToLeader(fwd) => { + KvServiceError::NotLeader { + leader_id: fwd.leader_id, + } + } + openraft::error::ClientWriteError::ChangeMembershipError(_) => { + KvServiceError::OpenRaftError(Box::new(err)) + } + _ => 
KvServiceError::OpenRaftError(Box::new(err)), + } + } +} +``` + +**Key Changes**: +- `SerializationError` now uses `prost::DecodeError` instead of `bincode::Error` +- `RaftError` renamed to `OpenRaftError` with updated type signature +- Added `PartialFailure` variant for multi-key DEL operations +- Added `From` implementation for openraft's `ClientWriteError` + +**to_resp_value()**: Implementation method to convert `KvServiceError` to `RespValue::Error` for client responses + +### Validation Logic + +```rust +const MAX_KEY_SIZE: usize = 256; // bytes +const MAX_VALUE_SIZE: usize = 65536; // 64 KB + +fn validate_key_size(key: &[u8]) -> Result<(), KvServiceError> { + if key.len() > MAX_KEY_SIZE { + Err(KvServiceError::KeyTooLarge { size: key.len() }) + } else { + Ok(()) + } +} + +fn validate_value_size(value: &[u8]) -> Result<(), KvServiceError> { + if value.len() > MAX_VALUE_SIZE { + Err(KvServiceError::ValueTooLarge { size: value.len() }) + } else { + Ok(()) + } +} +``` + +**Early Validation**: Validate BEFORE creating Operation and proposing to Raft to prevent resource exhaustion + +## Seshat Binary Integration + +This section shows how KvService integrates with the seshat binary (`crates/seshat/src/main.rs`) to create a complete distributed key-value store node. + +### Component Initialization + +```rust +// crates/seshat/src/main.rs - Seshat binary startup + +use seshat_kv::KvService; +use seshat_raft::RaftNode; +use seshat_protocol_resp::{RespCodec, RespCommand, RespValue}; +use tokio::net::{TcpListener, TcpStream}; +use tokio_util::codec::{Decoder, Framed}; +use futures::StreamExt; +use std::sync::Arc; +use tracing::{info, error}; + +#[tokio::main] +async fn main() -> Result<()> { + // 1. Initialize tracing/logging + tracing_subscriber::fmt::init(); + + // 2. Parse configuration (node ID, cluster peers, storage path) + let config = Config::from_args()?; + info!("Starting Seshat node {}", config.node_id); + + // 3. 
Initialize Raft consensus layer + let raft_node = Arc::new(RaftNode::new(config.clone())?); + info!("Raft node initialized"); + + // 4. Initialize KV service layer with shared RaftNode + let kv_service = KvService::new(raft_node.clone()); + info!("KV service initialized"); + + // 5. Start Raft consensus background tasks + tokio::spawn({ + let raft_node = raft_node.clone(); + async move { + raft_node.run().await + } + }); + + // 6. Start TCP listener for Redis clients on port 6379 + let addr = format!("0.0.0.0:{}", config.redis_port); // default: 6379 + let listener = TcpListener::bind(&addr).await?; + info!("Listening for Redis clients on {}", addr); + + // 7. Accept client connections + loop { + match listener.accept().await { + Ok((stream, peer_addr)) => { + info!("New client connection from {}", peer_addr); + + // Spawn handler for each client connection + let kv_service = kv_service.clone(); // Clone Arc wrapper + tokio::spawn(async move { + if let Err(e) = handle_client(stream, kv_service).await { + error!("Client handler error: {}", e); + } + }); + } + Err(e) => { + error!("Failed to accept connection: {}", e); + } + } + } +} +``` + +### TCP Connection Handler + +```rust +/// Handle a single Redis client connection +async fn handle_client( + stream: TcpStream, + kv_service: KvService, +) -> Result<()> { + // 1. Wrap TCP stream with RespCodec for automatic framing + let mut framed = Framed::new(stream, RespCodec::new()); + + // 2. Process commands in a loop + while let Some(result) = framed.next().await { + match result { + Ok(resp_value) => { + // 3. Parse RESP value into command + let command = match RespCommand::from_value(resp_value) { + Ok(cmd) => cmd, + Err(e) => { + // Protocol error - send error response + let error = RespValue::Error( + Bytes::from(format!("ERR protocol error: {}", e)) + ); + framed.send(error).await?; + continue; + } + }; + + // 4. 
Route command to appropriate handler + let response = route_command(&kv_service, command).await; + + // 5. Send response back to client + framed.send(response).await?; + } + Err(e) => { + error!("RESP codec error: {}", e); + break; + } + } + } + + Ok(()) +} +``` + +### Command Routing + +```rust +/// Route parsed commands to KvService handlers +async fn route_command( + kv_service: &KvService, + command: RespCommand, +) -> RespValue { + // Convert RespCommand to appropriate handler call + let result = match command { + RespCommand::Get { key } => { + kv_service.handle_get(key.to_vec()).await + } + RespCommand::Set { key, value } => { + kv_service.handle_set(key.to_vec(), value.to_vec()).await + } + RespCommand::Del { keys } => { + let byte_keys: Vec<Vec<u8>> = keys.into_iter() + .map(|k| k.to_vec()) + .collect(); + kv_service.handle_del(byte_keys).await + } + RespCommand::Exists { keys } => { + let byte_keys: Vec<Vec<u8>> = keys.into_iter() + .map(|k| k.to_vec()) + .collect(); + kv_service.handle_exists(byte_keys).await + } + RespCommand::Ping { message } => { + kv_service.handle_ping(message.map(|m| m.to_vec())).await + } + }; + + // Convert Result to RespValue + match result { + Ok(value) => value, + Err(e) => e.to_resp_value(), // Convert error to RESP error response + } +} +``` + +### Data Flow Through Components + +**Complete request flow for `SET foo bar`**: + +``` +Step 1: redis-cli sends "SET foo bar" over TCP to port 6379 + Raw bytes: *3\r\n$3\r\nSET\r\n$3\r\nfoo\r\n$3\r\nbar\r\n + +Step 2: TcpListener::accept() receives connection, spawns handle_client() + +Step 3: Framed reads bytes from socket + +Step 4: RespCodec::decode() parses RESP protocol + → RespValue::Array([ + RespValue::BulkString("SET"), + RespValue::BulkString("foo"), + RespValue::BulkString("bar") + ]) + +Step 5: RespCommand::from_value() converts to typed command + → RespCommand::Set { key: "foo", value: "bar" } + +Step 6: route_command() extracts key/value, calls: + kv_service.handle_set(b"foo", b"bar") 
+ +Step 7: KvService::handle_set() validates: + - Key size (3 bytes) <= 256 bytes ✓ + - Value size (3 bytes) <= 64KB ✓ + +Step 8: Creates Operation::Set { key: b"foo", value: b"bar" } + Serializes with protobuf → Vec<u8> + +Step 9: Calls raft_node.propose(serialized_operation) + +Step 10: openraft checks leadership internally + - If follower → return ForwardToLeader error + - If leader → continue + +Step 11: openraft appends to local Raft log + +Step 12: openraft sends AppendEntries gRPC to followers (nodes 2, 3) + +Step 13: Followers append to logs, respond with success + +Step 14: Once majority (2/3) ack, openraft commits entry + +Step 15: openraft calls StateMachine::apply() + - Deserializes Operation::Set using protobuf + - Calls op.apply(&mut hashmap) + - HashMap inserts ("foo", "bar") + +Step 16: raft_node.propose().await returns ClientWriteResponse to KvService + +Step 17: KvService returns Ok(RespValue::SimpleString("+OK")) + +Step 18: route_command() returns RespValue to handle_client() + +Step 19: RespCodec::encode() serializes to: +OK\r\n + +Step 20: Framed::send() writes bytes to TcpStream + +Step 21: redis-cli receives: OK +``` + +**Complete request flow for `GET foo`**: + +``` +Step 1: redis-cli sends "GET foo" over TCP + +Step 2-6: Same parsing flow as SET + +Step 7: route_command() calls: + kv_service.handle_get(b"foo") + +Step 8: KvService::handle_get() MUST check raft_node.is_leader().await // <-- async + - If follower → return NotLeader { leader_id: Some(1) } + - If leader → continue + +Step 9: Calls raft_node.get(b"foo") // <-- sync (direct HashMap access) + +Step 10: RaftNode reads from StateMachine HashMap (NO internal leadership check) + - No Raft consensus needed (local read) + - Returns Some(b"bar") or None + +Step 11: KvService returns: + - Ok(RespValue::BulkString(b"bar")) if found + - Ok(RespValue::Null) if not found + +Step 12: RespCodec encodes to: $3\r\nbar\r\n or $-1\r\n + +Step 13: redis-cli receives: "bar" or (nil) + +**IMPORTANT**: 
Race condition exists between Step 8 (is_leader() check) and Step 9 (get() call) - leadership may change in ~100ms window. +``` + +### Leader Redirection Flow + +**When client sends SET to follower node**: + +``` +Step 1-9: Same flow until raft_node.propose() + +Step 10: openraft checks leadership internally → not leader + Returns Err(ForwardToLeader) + +Step 11: KvService receives error, calls raft_node.leader_id().await // <-- async + Returns Some(1) (current leader is node 1) + +Step 12: KvService returns: + Err(KvServiceError::NotLeader { leader_id: Some(1) }) + +Step 13: route_command() calls error.to_resp_value() + → RespValue::Error("-MOVED 1\r\n") + +Step 14: RespCodec encodes: -MOVED 1\r\n + +Step 15: redis-cli receives error, user reconnects to node 1 +``` + +### Graceful Shutdown + +```rust +// In main.rs - handling shutdown signals + +use tokio::signal; + +async fn shutdown_signal() { + // Wait for Ctrl+C + signal::ctrl_c().await.expect("Failed to install CTRL+C handler"); + info!("Shutdown signal received"); +} + +#[tokio::main] +async fn main() -> Result<()> { + // ... initialization code ... + + tokio::select! { + // Accept client connections + _ = accept_loop(listener, kv_service) => { + error!("Accept loop terminated unexpectedly"); + } + // Wait for shutdown signal + _ = shutdown_signal() => { + info!("Shutting down gracefully"); + + // 1. Stop accepting new connections (drop listener) + // 2. Wait for active connections to drain + // 3. Stop Raft background tasks + // 4. Flush Raft state to disk + // 5. 
Close RocksDB storage + } + } + + Ok(()) +} +``` + +### Thread Safety Model + +**Arc Reference Counting**: +- `Arc` allows multiple async tasks to share RaftNode +- Each client connection handler gets a clone of `Arc` +- RaftNode is dropped when all Arc references are released + +**KvService Cloning**: +- KvService wraps `Arc`, is `Clone`-able +- Cloning is cheap (just increments Arc refcount) +- Each client handler can own its own KvService instance + +**StateMachine Locking** (inside RaftNode): +- StateMachine is wrapped in `RwLock` or `Mutex` +- Multiple concurrent reads via `RwLock::read()` +- Exclusive writes via `RwLock::write()` during Raft apply + +**Tokio Runtime**: +- All I/O operations are async (no blocking) +- Tokio handles task scheduling and concurrency +- No manual thread management needed + +### Performance Considerations + +**Connection Pooling**: +- Each redis-cli connection is a separate TCP stream +- Framed + RespCodec handles buffering automatically +- No explicit connection pooling needed (tokio handles it) + +**Batching Opportunities** (Phase 2): +- Multiple SET commands could be batched into single Raft proposal +- Requires request queue and batch timeout +- Not implemented in Phase 1 MVP + +**Zero-Copy Parsing**: +- RespCodec uses `bytes::Bytes` for zero-copy buffer management +- BytesMut allows in-place parsing without allocations +- Bytes::clone() is cheap (refcount increment, not data copy) + +### Configuration Example + +```rust +// Configuration structure for seshat binary +pub struct Config { + /// This node's ID (1, 2, or 3) + pub node_id: u64, + + /// Redis client port (default: 6379) + pub redis_port: u16, + + /// Raft internal RPC port (default: 7379) + pub raft_port: u16, + + /// Cluster peers [(node_id, addr)] + pub peers: Vec<(u64, String)>, + + /// RocksDB storage path + pub storage_path: PathBuf, +} + +// Example usage: +// Node 1: seshat --node-id 1 --peers "2:localhost:7380,3:localhost:7381" +// Node 2: seshat --node-id 2 
--peers "1:localhost:7379,3:localhost:7381" +// Node 3: seshat --node-id 3 --peers "1:localhost:7379,2:localhost:7380" +``` + +### Error Handling in Binary + +**Protocol Errors**: +- Invalid RESP syntax → RespCodec returns Err +- Binary sends `-ERR protocol error` back to client +- Connection remains open (doesn't disconnect) + +**Command Errors**: +- Unknown command → RespCommand::from_value() returns UnknownCommand +- Binary sends `-ERR unknown command` back to client + +**KvService Errors**: +- NotLeader → `-MOVED ` +- KeyTooLarge → `-ERR key too large` +- NoQuorum → `-NOQUORUM cannot reach majority` + +**Fatal Errors**: +- RocksDB corruption → panic and restart node +- Raft invariant violation → panic and restart node +- OOM → let OS kill process, supervisor restarts + +## Data Flow + +### Write Path (SET Command) + +``` +Step 1: TCP Client sends: SET foo bar +Step 2: seshat binary TcpListener receives on port 6379 +Step 3: RespCodec parses into RespCommand::Set{key: "foo", value: "bar"} +Step 4: seshat binary routes to KvService::handle_set(b"foo", b"bar") +Step 5: KvService validates key (3 bytes) <= 256 bytes ✓ +Step 6: KvService validates value (3 bytes) <= 64KB ✓ +Step 7: KvService creates Operation::Set { key: b"foo", value: b"bar" } +Step 8: KvService serializes Operation with protobuf: operation.encode_to_vec() +Step 9: KvService calls raft_node.propose(serialized_op).await // <-- async +Step 10: openraft checks leadership internally. 
If follower, return ForwardToLeader error → + KvService converts to NotLeader, returns MOVED +Step 11: openraft (leader) appends entry to local Raft log +Step 12: openraft sends AppendEntries RPC via gRPC to followers (node 2, node 3) +Step 13: Followers append entry to their logs and respond with success +Step 14: Once majority (2/3) respond, openraft commits the entry +Step 15: openraft calls StateMachine::apply(index, serialized_op) +Step 16: StateMachine deserializes Operation::Set using protobuf +Step 17: StateMachine calls op.apply(&mut hashmap) → inserts ("foo", "bar") +Step 18: openraft signals commit success to waiting propose().await call +Step 19: KvService receives ClientWriteResponse, returns RespValue::Ok +Step 20: RespCodec encodes "+OK\r\n" and sends to client +``` + +### Read Path (GET Command) + +``` +Step 1: TCP Client sends: GET foo +Step 2: seshat binary TcpListener receives on port 6379 +Step 3: RespCodec parses into RespCommand::Get{key: "foo"} +Step 4: seshat binary routes to KvService::handle_get(b"foo") +Step 5: KvService MUST check raft_node.is_leader().await // <-- async +Step 6: If not leader → return NotLeader error → client gets MOVED response +Step 7: If leader → call raft_node.get(b"foo") // <-- sync (direct HashMap) +Step 8: RaftNode reads from StateMachine HashMap (NO additional leadership check) +Step 9: StateMachine returns Some(b"bar") if key exists, None otherwise +Step 10: KvService converts to RespValue::BulkString(b"bar") or RespValue::Null +Step 11: RespCodec encodes "$3\r\nbar\r\n" or "$-1\r\n" and sends to client +``` + +**Read Consistency Implementation Details**: + +1. **KvService's Responsibility**: + - MUST call `is_leader().await` before `get()` to enforce leader-only reads + - `is_leader()` is async (checks openraft state) + - `RaftNode::get()` itself is sync and does NOT verify leadership internally + - Without the `is_leader()` check, any node (including followers) can serve potentially stale reads + +2. 
**Race Condition Window**: + - Leadership may change between Step 5 (`is_leader().await` check) and Step 7 (`get()` call) + - Window is typically ~100ms (one election timeout) + - `is_leader()` is async (involves checking openraft's internal state) + - `get()` is sync (direct HashMap access with no leadership verification) + - During this window, an old leader may serve stale reads after losing leadership + - New leader may not have all committed entries yet + +3. **Phase 1 Trade-off**: + - Accepts potential staleness during leadership transitions for simplicity + - Provides "mostly consistent" reads (stale only during rare leader changes) + - Good enough for Phase 1 MVP and learning objectives + +4. **Phase 4 Improvement**: + - Will implement ReadIndex mechanism for linearizable reads + - Leader will verify it's still leader before serving reads + - Eliminates race condition window via Raft heartbeat round-trip + +### Multi-Key DEL Path + +``` +Step 1: TCP Client sends: DEL key1 key2 key3 +Step 2: seshat binary TcpListener receives on port 6379 +Step 3: RespCodec parses into RespCommand::Del{keys: [b"key1", b"key2", b"key3"]} +Step 4: seshat binary routes to KvService::handle_del(vec![b"key1", b"key2", b"key3"]) +Step 5: KvService validates all key sizes <= 256 bytes +Step 6: deleted_count = 0 + +For each key (key1, key2, key3): + Step 7: Create Operation::Del { key } + Step 8: Serialize with protobuf: operation.encode_to_vec() + Step 9: Call raft_node.propose(serialized_op).await // <-- async + Step 10: openraft checks leadership internally. 
If follower, return ForwardToLeader → + Return RespValue::Integer(deleted_count) with partial count + Step 11: openraft appends entry to log, replicates via AppendEntries + Step 12: Wait for majority commit (async) + Step 13: StateMachine::apply() deserializes Operation::Del using protobuf + Step 14: StateMachine calls op.apply(&mut hashmap) → returns b"1" or b"0" + Step 15: Parse result, add to deleted_count (1 if existed, 0 if not) + +Step 16: Return RespValue::Integer(deleted_count) with total +Step 17: RespCodec encodes ":3\r\n" (if all 3 keys existed) +Step 18: Send to client +``` + +**Partial Failure Example**: +``` +DEL key1 key2 key3 +- key1: propose → commit → deleted (count = 1) +- key2: propose → commit → deleted (count = 2) +- key3: propose → NoQuorum error +- Return: RespValue::Integer(2) with NoQuorum error +- Client sees: Only 2 keys deleted, must retry key3 +``` + +### Follower Redirect + +When a client sends a write command to a follower node: + +``` +Step 1: Client sends SET to follower node +Step 2: KvService::handle_set validates and creates Operation +Step 3: Calls raft_node.propose(data).await // <-- async +Step 4: openraft checks leadership internally → not leader +Step 5: openraft returns ForwardToLeader error +Step 6: KvService calls raft_node.leader_id().await → Some(1) // <-- async +Step 7: KvService returns KvServiceError::NotLeader { leader_id: Some(1) } +Step 8: Error converted to RespValue::Error("-MOVED 1\r\n") +Step 9: Client receives error, reconnects to leader node 1, retries SET +``` + +## Error Handling + +### Validation Errors + +**Key Too Large**: +- Return `RespValue::Error("ERR key too large (max 256 bytes)")` immediately +- No Raft interaction +- Validated before creating Operation + +**Value Too Large**: +- Return `RespValue::Error("ERR value too large (max 64KB)")` immediately +- No Raft interaction +- Validated before calling `raft_node.propose()` + +### Leadership Errors + +**Not Leader (Known Leader)**: +- Return 
`RespValue::Error("-MOVED <leader_id>\r\n")` so client can redirect +- Occurs when follower receives write command +- Client should reconnect to leader + +**Not Leader (No Leader)**: +- Return `RespValue::Error("-NOLEADER election in progress\r\n")` so client can retry +- Occurs during leader election +- Client should wait and retry + +### Quorum Errors + +**No Quorum**: +- Return `RespValue::Error("-NOQUORUM cannot reach majority\r\n")` +- Occurs when Raft majority is unreachable (network partition, node failures) +- Client should retry after partition heals + +**Timeout**: +- If commit doesn't happen within 30 seconds (request_timeout_secs), return timeout error +- Indicates network issues or slow followers + +### Serialization Errors + +**Protobuf Error**: +- Return `RespValue::Error("ERR internal serialization error")` +- Should never happen with valid Operation (defensive coding) +- Log error with `tracing::error!` + +### Raft Internal Errors + +**Handling**: Log error with `tracing::error!`, return `RespValue::Error("ERR internal Raft error")` + +**Examples**: Storage corruption, invalid log entries, state machine inconsistencies + +**Recovery**: Node should crash and restart (fail-fast principle) on invariant violations + +## Concurrency Model + +**Async Runtime**: Tokio 1.x for all I/O operations + +**OpenRaft Async Integration**: +- All RaftNode methods that interact with openraft are async +- `get()` remains sync (direct StateMachine access with RwLock) +- KvService handlers await on `propose()`, `is_leader()`, `leader_id()` +- No blocking operations in async contexts + +**KvService Cloning**: KvService wraps `Arc<RaftNode>`, is Clone-able, safe to share across tokio tasks + +**Raft Node Sharing**: `Arc<RaftNode>` provides thread-safe access to Raft state + +**State Machine Locking**: StateMachine uses RwLock for concurrent reads, exclusive writes on apply + +**Propose Concurrency**: Multiple concurrent `propose().await` calls are safe - openraft serializes them into log order + +**Read 
Concurrency**: Reads are concurrent-safe via RwLock read locks on StateMachine HashMap + +**No Blocking**: All KvService methods are async, never block tokio runtime + +**Channel Communication**: RaftNode uses `tokio::sync::mpsc` channels for internal message passing + +## Testing Strategy + +### Unit Tests + +**File**: `crates/kv/src/service.rs` (in `#[cfg(test)] mod tests`) + +**Approach**: Mock RaftNode using trait-based dependency injection or test doubles + +**All tests use #[tokio::test]** for async support: + +```rust +// Example async test structure +#[tokio::test] +async fn test_handle_set_validates_key_size() { + let kv_service = setup_test_service().await; + let result = kv_service.handle_set(vec![0u8; 257], vec![1, 2, 3]).await; + assert!(matches!(result, Err(KvServiceError::KeyTooLarge { .. }))); +} + +#[tokio::test] +async fn test_handle_get_on_follower_returns_not_leader() { + let kv_service = setup_follower_service().await; + let result = kv_service.handle_get(b"foo".to_vec()).await; + assert!(matches!(result, Err(KvServiceError::NotLeader { .. 
}))); +} +``` + +**Test Cases**: +- `test_handle_get_returns_value_when_key_exists` +- `test_handle_get_returns_null_when_key_missing` +- `test_handle_get_on_follower_returns_not_leader` +- `test_handle_set_validates_key_size` +- `test_handle_set_validates_value_size` +- `test_handle_set_key_exactly_256_bytes_accepted` +- `test_handle_set_key_257_bytes_rejected` +- `test_handle_set_value_exactly_64kb_accepted` +- `test_handle_set_value_64kb_plus_1_rejected` +- `test_handle_del_single_key` +- `test_handle_del_multiple_keys_all_exist` +- `test_handle_del_multiple_keys_partial_exist` +- `test_handle_del_nonexistent_key_returns_zero` +- `test_handle_del_partial_failure_returns_count` +- `test_handle_del_key_size_validation` +- `test_handle_exists_multiple_keys` +- `test_handle_exists_no_keys_exist_returns_zero` +- `test_handle_exists_on_follower_returns_not_leader` +- `test_handle_ping_no_message_returns_pong` +- `test_handle_ping_with_message_echoes_back` +- `test_not_leader_error_includes_leader_id` +- `test_serialization_error_handling` (with protobuf errors) + +### Integration Tests + +**File**: `crates/kv/tests/integration_test.rs` (new file) + +**Approach**: Use real RaftNode in single-node cluster mode (no network) + +**Setup**: Create RaftNode with single peer, tick until leader, create KvService, test handlers + +**Test Cases**: +- `test_set_then_get_returns_value` +- `test_del_then_get_returns_null` +- `test_set_overwrites_existing_value` +- `test_exists_after_set_returns_one` +- `test_exists_after_del_returns_zero` +- `test_multiple_sets_in_sequence` +- `test_concurrent_operations` +- `test_read_during_leadership_transition` (verify stale read scenario) +- `test_del_multiple_keys_counts_correctly` +- `test_del_with_quorum_loss_midway` + +### Property Tests + +**File**: `crates/kv/tests/property_test.rs` (new file) + +**Approach**: Use proptest to generate random key/value sizes and binary data + +**Test Cases**: +- `prop_key_size_boundary_256_bytes` - Test 
around exact boundary +- `prop_value_size_boundary_64kb` - Test around exact boundary +- `prop_binary_data_roundtrip` - Arbitrary bytes in keys/values +- `prop_empty_keys_and_values` - Edge cases +- `prop_utf8_and_non_utf8_data` - Mixed encodings +- `prop_del_multiple_keys_count` - Random key counts, verify accurate counting + +### End-to-End Tests + +**Location**: Handled by seshat binary integration tests and chaos-testing feature + +**Scenarios**: Full 3-node cluster with real networking, partitions, failures + +**Out of Scope**: Not part of kv crate testing - tested at system level + +## Performance Considerations + +### Read Latency + +**Target**: < 5ms p99 for GET on leader + +**Optimization**: In-memory HashMap lookup (no disk I/O, no network) + +**Bottleneck**: RwLock contention under high read concurrency + +**Mitigation**: StateMachine uses RwLock to allow concurrent reads + +### Write Latency + +**Target**: < 10ms p99 for SET with Raft replication + +**Breakdown**: +- 1ms validation +- 2ms serialization +- 5ms network round-trip +- 2ms apply + +**Bottleneck**: Network latency for AppendEntries RPC to followers + +**Mitigation**: Batching multiple proposals into single Raft entry (Phase 2 optimization) + +### Multi-Key DEL Latency + +**Current**: N keys × 10ms = potentially high latency for large key counts + +**Trade-off**: Simplicity and correctness over throughput in Phase 1 + +**Future Optimization**: Phase 2 can batch into single `Operation::DelMulti` for N keys × 1 consensus round + +### Throughput + +**Target**: > 5,000 ops/sec per node + +**Read Throughput**: Limited by CPU (HashMap lookups) and RwLock contention + +**Write Throughput**: Limited by Raft consensus (can only commit as fast as majority responds) + +**Scaling**: Phase 2 adds multi-shard parallelism for horizontal write scaling + +### Memory Usage + +**State Machine**: HashMap size = total key-value data size (unbounded) + +**Raft Log**: Grows until snapshot (max ~100MB before 
compaction) + +**Operation Overhead**: Each Operation has ~48 bytes overhead (enum tag + Vec allocations) + +**Mitigation**: Phase 1 adds log compaction and snapshots to bound memory growth + +### Zero-Copy Opportunities + +**bytes Crate**: Use `bytes::Bytes` for key/value data to avoid copies + +**Limitation**: `HashMap<Vec<u8>, Vec<u8>>` currently requires owned data + +**Future**: Phase 3 could use `bytes::Bytes` in StateMachine for zero-copy reads + +## Implementation Dependencies + +### Requires (After OpenRaft Migration) + +- ✓ `RaftNode::propose()` → **NOW ASYNC**: `async fn propose(&self, data: Vec<u8>) -> Result<ClientWriteResponse>` +- ✓ `RaftNode::get()` → **STILL SYNC**: `fn get(&self, key: &[u8]) -> Option<Vec<u8>>` +- ✓ `RaftNode::is_leader()` → **NOW ASYNC**: `async fn is_leader(&self) -> bool` +- ✓ `RaftNode::leader_id()` → **NOW ASYNC**: `async fn leader_id(&self) -> Option<u64>` +- ✓ `Operation::encode_to_vec()` - Protobuf serialization (replaces bincode) +- ✓ `Operation::decode()` - Protobuf deserialization (replaces bincode) +- ✓ `RespCommand` and `RespValue` - DONE (protocol-resp crate complete) + +### Blockers + +**OpenRaft migration MUST be complete** before KV Service implementation begins. 
Specifically: +- OpenRaft Phase 1-6 complete (see `docs/specs/openraft/`) +- RaftNode API migrated to openraft async methods +- StateMachine integrated with openraft +- All raft crate tests passing with openraft +- Protobuf serialization implemented for Operations + +### Provides + +- KvService struct for seshat binary integration +- Complete Redis command handlers (GET, SET, DEL, EXISTS, PING) +- Input validation and error handling layer +- RESP protocol integration with Raft consensus + +## Observability + +### Structured Logging + +**Framework**: tracing crate with `tracing::instrument` on all handler methods + +**Log Levels**: +- **ERROR**: Raft errors, serialization failures, invariant violations +- **WARN**: Leadership transitions, quorum loss, validation rejections, stale read attempts +- **INFO**: Command handling, leader redirection +- **DEBUG**: Key/value sizes, operation details, leadership checks +- **TRACE**: Full serialized operation bytes, detailed Raft interactions + +**Context**: Include node_id, command_type, key_size, value_size, is_leader in all logs + +### Metrics (Phase 4) + +OpenTelemetry metrics for production (not Phase 1): + +- `kv_commands_total{command, status}` - Counter of all commands +- `kv_command_duration_seconds{command}` - Histogram of latencies +- `kv_validation_errors_total{error_type}` - Counter of validation failures +- `kv_raft_proposals_total{status}` - Counter of Raft proposals (success/failure) +- `kv_not_leader_redirects_total` - Counter of MOVED responses +- `kv_key_size_bytes` - Histogram of key sizes +- `kv_value_size_bytes` - Histogram of value sizes +- `kv_stale_read_attempts_total` - Counter of reads during leadership transitions +- `kv_del_multi_key_count` - Histogram of keys per DEL command +- `kv_del_partial_failures_total` - Counter of DEL commands with partial success + +## Phase 1 Limitations + +**Read Consistency - Leadership Transition Race**: +- Reads may be stale during leadership transitions (window of 
~100ms) +- While KvService checks `is_leader().await` before reads, leadership can change before `get()` completes +- `is_leader()` is async (checks openraft state) +- `get()` is sync (direct StateMachine HashMap access, no internal leadership check) +- **Race window**: Between `is_leader().await` returning true and `get()` executing +- Old leader may serve reads briefly after losing leadership +- New leader may not have all committed entries immediately after election +- Clients may observe "time travel": newer value → older value → newer value +- Phase 4 will add linearizable reads via ReadIndex to eliminate this race + +**No Follower Reads**: Only leader serves reads in Phase 1. Followers redirect to leader. Phase 4 adds follower reads with ReadIndex for better read scalability. + +**No Batching**: Each SET/DEL is separate Raft proposal. Phase 2 adds batching for throughput. + +**DEL Multi-Key Not Atomic**: Each key in DEL command requires separate Raft consensus round. Partial success is possible. Phase 2 can add `Operation::DelMulti` for atomic batching if needed. + +**No Pipelining**: Client waits for each operation to commit. Phase 3 adds request pipelining. + +**No TTL**: No expiration support. Phase 3 adds TTL for keys. + +**Simple Data Types**: Only byte arrays. Phase 3 adds Redis data types (lists, sets, hashes). + +## Next Steps + +1. Run `/spec:plan kvservice` to generate dependency-ordered implementation tasks +2. Implement KvService struct with `Arc` field +3. Implement command handlers with TDD approach +4. Add comprehensive unit tests for validation logic +5. Create integration tests with single-node Raft cluster +6. Add property tests for boundary conditions +7. 
Integrate with seshat binary for end-to-end testing diff --git a/docs/specs/kvservice/requirements.json b/docs/specs/kvservice/requirements.json new file mode 100644 index 0000000..fcc4ff3 --- /dev/null +++ b/docs/specs/kvservice/requirements.json @@ -0,0 +1,32 @@ +{ + "raw_user_story": "As a Redis client, I want to execute GET/SET/DEL commands over TCP so that I can store and retrieve key-value data in a distributed, fault-tolerant manner", + "raw_criteria": [ + "Main binary starts Tokio TCP server on port 6379", + "Server parses incoming RESP commands (GET, SET, DEL, EXISTS, PING)", + "KV service routes commands to Raft consensus layer", + "Raft commits operations and persists to RocksDB storage", + "Clients receive RESP-formatted responses" + ], + "raw_rules": [ + "Only Phase 1 commands supported (GET, SET, DEL, EXISTS, PING)", + "Operations must go through Raft consensus (strong consistency)", + "Failed operations return appropriate RESP error messages", + "Leader forwards write operations through Raft log" + ], + "raw_scope": { + "included": [ + "TCP server setup in main binary (seshat crate)", + "RESP command parsing integration (protocol-resp already exists)", + "KV service layer implementation (kv crate)", + "Raft/storage integration for GET/SET/DEL operations", + "End-to-end request flow from TCP -> RESP -> KV -> Raft -> RocksDB" + ], + "excluded": [ + "Multi-shard clustering (Phase 2)", + "Advanced Redis commands beyond Phase 1 scope", + "Dynamic cluster management (Phase 3)", + "Observability/metrics (Phase 4)", + "SQL interface (Phase 5)" + ] + } +} diff --git a/docs/specs/kvservice/spec-lite.md b/docs/specs/kvservice/spec-lite.md new file mode 100644 index 0000000..6efe14c --- /dev/null +++ b/docs/specs/kvservice/spec-lite.md @@ -0,0 +1,69 @@ +# KV Service (Condensed Spec) + +## User Story + +As a Redis client, I want to execute GET/SET/DEL commands over TCP so that I can store and retrieve key-value data in a distributed, fault-tolerant manner using 
strong consistency guarantees + +## Key Acceptance Criteria + +1. TCP server on port 6379 accepts Redis client connections +2. SET commands replicate via Raft consensus to quorum of nodes +3. GET commands read from leader's local StateMachine +4. Non-leader nodes return MOVED errors with leader ID +5. Key/value size limits enforced (256 bytes / 64KB) +6. NOQUORUM errors when majority unavailable + +## Critical Business Rules + +- Only Phase 1 commands: GET, SET, DEL, EXISTS, PING +- All writes go through Raft consensus (strong consistency) +- Leader-only reads (no stale reads in Phase 1) +- Majority quorum required (2/3 nodes) +- Max key: 256 bytes, max value: 64KB +- Request timeout: 30s, Raft RPC timeout: 5s + +## Dependencies + +- protocol-resp crate (100% complete - RespCodec, RespCommand, RespValue) +- raft crate (in progress - RaftNode, gRPC transport, StateMachine) +- storage crate (in progress - MemStorage, migrating to RocksDB) +- common crate (NodeId, Error, Result, config types) +- seshat binary (TCP server orchestration) + +## Main Components + +- **seshat/main.rs**: TCP listener on port 6379 +- **kv/src/service.rs**: KvService with handle_command() +- **kv/src/handlers.rs**: Command handlers (handle_get, handle_set, etc.) 
+- **kv/src/validation.rs**: Size limit validation +- **seshat/src/server.rs**: Tokio TCP server with RespCodec + +## Request Flow + +### Write Path (SET) +``` +TCP → RespCodec::decode() → KvService → validate → RaftNode::propose() +→ Raft replication → StateMachine::apply() → Storage::put() → RocksDB +→ RespCodec::encode() → TCP response +``` + +### Read Path (GET) +``` +TCP → RespCodec::decode() → KvService → RaftNode::is_leader() +→ RaftNode::read_local() → StateMachine::get() → RespCodec::encode() → TCP response +``` + +## Error Handling + +- **NOT_LEADER**: `-MOVED {leader_id}\r\n` +- **NOQUORUM**: `-(error) NOQUORUM\r\n` +- **Key too large**: `-(error) ERR key too large\r\n` +- **Value too large**: `-(error) ERR value too large\r\n` +- **Timeout**: `-(error) ERR timeout\r\n` + +## Success Criteria + +- redis-cli connects and executes GET/SET/DEL/EXISTS/PING +- Writes replicate to all 3 nodes via Raft +- Leader failure → new leader → operations continue +- Performance: >5,000 ops/sec, GET <5ms p99, SET <10ms p99 diff --git a/docs/specs/kvservice/spec.json b/docs/specs/kvservice/spec.json new file mode 100644 index 0000000..cf38204 --- /dev/null +++ b/docs/specs/kvservice/spec.json @@ -0,0 +1,186 @@ +{ + "feature": "kvservice", + "user_story": "As a Redis client, I want to execute GET/SET/DEL commands over TCP so that I can store and retrieve key-value data in a distributed, fault-tolerant manner using strong consistency guarantees", + "acceptance_criteria": [ + "GIVEN the seshat binary starts WHEN the TCP server binds to port 6379 THEN Redis clients can successfully connect and establish sessions", + "GIVEN a Redis client sends a SET command WHEN the command is parsed by RespCodec THEN KvService receives a valid RespCommand::Set with key and value", + "GIVEN KvService receives a SET command WHEN it calls RaftNode::propose() THEN the operation is replicated to a quorum of nodes and committed via Raft consensus", + "GIVEN a SET operation is committed WHEN the 
StateMachine applies the entry THEN the key-value pair is persisted to the data_kv column family in RocksDB", + "GIVEN a Redis client sends a GET command WHEN the node is the leader THEN it reads from the local StateMachine and returns the value in RESP format", + "GIVEN a Redis client sends a command WHEN the node is not the leader THEN it returns a MOVED error with the leader's node ID", + "GIVEN a Redis client sends a DEL command WHEN it is committed via Raft THEN the key is removed from storage and the deletion count is returned", + "GIVEN a Redis client sends an EXISTS command WHEN checked against storage THEN it returns the count of existing keys", + "GIVEN a Redis client sends a PING command THEN it receives a PONG response (or the echo message if provided)", + "GIVEN any operation fails due to quorum loss WHEN processed THEN the client receives a NOQUORUM error response", + "GIVEN a key exceeds 256 bytes WHEN validated THEN the client receives 'ERR key too large' error", + "GIVEN a value exceeds 64KB WHEN validated THEN the client receives 'ERR value too large' error" + ], + "business_rules": [ + "Only Phase 1 commands are supported: GET, SET, DEL, EXISTS, PING", + "All write operations (SET, DEL) must go through Raft consensus for strong consistency", + "Read operations (GET, EXISTS) are performed on the leader only (no stale reads in Phase 1)", + "Operations must achieve a majority quorum (2 out of 3 nodes) to commit", + "Non-leader nodes must redirect clients to the current leader using MOVED errors", + "Maximum key size is 256 bytes (enforced before Raft proposal)", + "Maximum value size is 64KB (enforced before Raft proposal)", + "Failed operations return appropriate RESP error messages with context", + "The leader node forwards write operations through the Raft log", + "Committed operations are applied to the StateMachine in log order", + "PING commands bypass Raft consensus and return immediately", + "Request timeout is 30 seconds (from 
configuration)", + "Raft RPC timeout is 5 seconds (from configuration)" + ], + "scope": { + "included": [ + "TCP server initialization in seshat main binary on port 6379", + "Integration with protocol-resp crate for RESP command parsing and response encoding", + "KvService struct in kv crate for command routing and validation", + "Command validation (key/value size limits, command syntax)", + "Integration with raft crate (RaftNode::propose() for writes, read_local() for reads)", + "Error handling for NOT_LEADER scenarios (return MOVED errors with leader ID)", + "Error handling for NOQUORUM scenarios (cannot reach majority)", + "Error handling for invalid commands (syntax errors, size violations)", + "End-to-end request flow: TCP → RespCodec → KvService → RaftNode → StateMachine → RocksDB", + "Response formatting in RESP protocol", + "Support for GET, SET, DEL, EXISTS, PING commands", + "Leader-only read path for strong consistency", + "Write path through Raft consensus with quorum requirement" + ], + "excluded": [ + "Advanced Redis commands beyond Phase 1 scope (TTL, EXPIRE, etc.)", + "Multi-shard clustering (Phase 2 feature)", + "Stale reads from followers (Phase 1 uses leader-only reads)", + "Dynamic cluster management (adding/removing nodes during runtime - Phase 3)", + "Observability metrics and tracing (Phase 4 feature)", + "SQL interface (Phase 5 feature)", + "Redis Cluster protocol support (CLUSTER commands)", + "Redis pub/sub functionality", + "Redis transactions (MULTI/EXEC)", + "Redis pipelining optimization (Phase 1 processes commands sequentially)" + ] + }, + "aligns_with": "Phase 1 MVP goals from product vision: Enable Redis clients to execute basic commands (GET, SET, DEL, EXISTS, PING) against a distributed, fault-tolerant 3-node cluster with strong consistency guarantees via Raft consensus", + "dependencies": [ + "protocol-resp crate (100% complete - provides RespCodec, RespCommand, RespValue for parsing and encoding)", + "raft crate (in progress - 
provides RaftNode wrapper, gRPC transport, StateMachine, MemStorage)", + "storage crate (in progress - provides MemStorage implementation of raft::Storage trait, will migrate to RocksDB)", + "common crate (provides shared types: NodeId, Error, Result, configuration types)", + "seshat binary (orchestration - needs TCP server on port 6379 that routes to KvService)" + ], + "technical_details": { + "components": [ + "seshat/main.rs: Main binary that starts TCP listener on port 6379", + "kv/src/service.rs: KvService struct with handle_command() method", + "kv/src/handlers.rs: Individual command handlers (handle_get, handle_set, handle_del, etc.)", + "kv/src/validation.rs: Input validation (key/value size limits)", + "kv/src/error.rs: KV-specific error types", + "seshat/src/server.rs: TCP server using Tokio with RespCodec framing" + ], + "integration_points": [ + "RespCodec::decode() from protocol-resp crate for parsing incoming RESP commands", + "RespCodec::encode() from protocol-resp crate for serializing RESP responses", + "RaftNode::propose(operation: Vec<u8>) for write operations (SET, DEL)", + "RaftNode::read_local(key: Vec<u8>) for read operations (GET, EXISTS)", + "RaftNode::is_leader() to check leadership status", + "RaftNode::leader_id() to get current leader for MOVED errors", + "StateMachine::apply(entry: Entry) applies committed operations to storage", + "Storage::get(cf: &str, key: &[u8]) reads from data_kv column family", + "Storage::put(cf: &str, key: &[u8], value: &[u8]) writes to data_kv column family" + ], + "error_handling": [ + "MOVED errors when not leader: Return '-MOVED {leader_id}\\r\\n' in RESP format", + "NOQUORUM errors when cannot reach majority: Return '-(error) NOQUORUM\\r\\n'", + "Key size validation: Return '-(error) ERR key too large\\r\\n' if key > 256 bytes", + "Value size validation: Return '-(error) ERR value too large\\r\\n' if value > 64KB", + "Invalid command syntax: Return '-(error) ERR unknown command\\r\\n'", + "Raft proposal timeout: 
Return '-(error) ERR timeout\\r\\n' after 30 seconds", + "Storage errors: Return '-(error) ERR storage failure\\r\\n' for RocksDB errors", + "Connection errors: Close TCP connection and log error" + ], + "data_flow": { + "write_path": [ + "1. Client sends 'SET foo bar' over TCP connection", + "2. Tokio TCP listener receives bytes", + "3. RespCodec::decode() parses to RespCommand::Set { key: b'foo', value: b'bar' }", + "4. KvService::handle_command(RespCommand::Set) called", + "5. KvService::validate_key_size(b'foo') - check <= 256 bytes", + "6. KvService::validate_value_size(b'bar') - check <= 64KB", + "7. KvService creates Operation::Set { key: b'foo', value: b'bar' }", + "8. Serialize operation: let data = bincode::serialize(&operation)?", + "9. KvService calls RaftNode::propose(data)", + "10. RaftNode checks is_leader() - if false, return NotLeader(leader_id)", + "11. RaftNode::propose() calls raft_rs::RawNode::propose(entry)", + "12. raft-rs replicates entry to followers via gRPC (AppendEntries RPC)", + "13. Once majority commits, raft-rs calls StateMachine::apply(entry)", + "14. StateMachine deserializes operation from entry.data", + "15. StateMachine::apply_set(key, value) calls Storage::put('data_kv', key, value)", + "16. RocksDB writes to data_kv column family", + "17. RaftNode::propose() returns Ok(())", + "18. KvService returns RespValue::SimpleString('OK')", + "19. RespCodec::encode() serializes to '+OK\\r\\n'", + "20. Bytes sent back to client over TCP" + ], + "read_path": [ + "1. Client sends 'GET foo' over TCP connection", + "2. Tokio TCP listener receives bytes", + "3. RespCodec::decode() parses to RespCommand::Get { key: b'foo' }", + "4. KvService::handle_command(RespCommand::Get) called", + "5. KvService calls RaftNode::is_leader()", + "6. If not leader: return NotLeader(leader_id) → format MOVED error", + "7. If leader: KvService calls RaftNode::read_local(b'foo')", + "8. RaftNode accesses StateMachine (in-memory HashMap in Phase 1)", + "9. 
StateMachine::get(b'foo') returns Option>", + "10. If Some(value): return RespValue::BulkString(value)", + "11. If None: return RespValue::Null", + "12. RespCodec::encode() serializes to '$3\\r\\nbar\\r\\n' or '$-1\\r\\n'", + "13. Bytes sent back to client over TCP" + ] + }, + "architecture_layers": [ + "Layer 1 (Protocol): protocol-resp crate parses RESP commands and encodes responses", + "Layer 2 (Service): kv crate validates commands and routes to Raft", + "Layer 3 (Consensus): raft crate replicates writes and manages state machine", + "Layer 4 (Storage): storage crate persists data to RocksDB (or MemStorage in development)", + "Layer 5 (Transport): TCP server in seshat binary handles client connections" + ], + "performance_considerations": [ + "TCP connection pooling: Tokio handles concurrent connections efficiently", + "Zero-copy parsing: RespCodec uses bytes::Bytes to avoid allocations", + "Async I/O: All network operations use tokio::net::TcpListener and async/await", + "Batching: Raft batches log entries internally for replication efficiency", + "Leader reads: Avoid network round-trip by reading from local StateMachine", + "Connection limits: Max 10,000 concurrent client connections (from configuration)", + "Request timeout: 30 seconds prevents client connections from hanging indefinitely", + "Raft RPC timeout: 5 seconds for internal node communication" + ], + "testing_strategy": [ + "Unit tests: KvService command handlers with mock RaftNode", + "Integration tests: Full request flow with in-memory Raft cluster", + "End-to-end tests: Redis client (redis-cli or redis-rs) against running cluster", + "Error scenario tests: NOT_LEADER, NOQUORUM, size limit violations", + "Concurrency tests: Multiple concurrent SET operations maintain consistency", + "Chaos tests: Leader failure during SET operation (no data loss)", + "Performance tests: redis-benchmark compatibility, measure ops/sec and latency" + ] + }, + "success_criteria": [ + "redis-cli can connect to any 
node in the cluster on port 6379", + "GET/SET/DEL/EXISTS/PING commands execute successfully with correct RESP responses", + "Write operations (SET, DEL) replicate to all 3 nodes via Raft consensus", + "Read operations (GET, EXISTS) return consistent values from the leader", + "Non-leader nodes correctly redirect clients with MOVED errors", + "Key size and value size limits are enforced with appropriate error messages", + "NOQUORUM errors returned when majority of nodes are unavailable", + "Cluster passes end-to-end integration test: SET on node 1 → GET from node 2 returns same value", + "Cluster survives leader failure: Kill leader → new leader elected → SET/GET continue working", + "Performance targets met: >5,000 ops/sec, GET <5ms p99, SET <10ms p99" + ], + "future_enhancements": [ + "Phase 2: Multi-shard support (route commands to appropriate shard based on key hash)", + "Phase 2: Cross-shard commands (MGET, MSET)", + "Phase 3: Dynamic membership (add/remove nodes via CLUSTER commands)", + "Phase 4: Follower reads with bounded staleness (trade consistency for read scalability)", + "Phase 4: Observability (OpenTelemetry metrics, distributed tracing)", + "Phase 5: SQL interface (parallel service layer using same Raft/storage infrastructure)" + ] +} diff --git a/docs/specs/kvservice/spec.md b/docs/specs/kvservice/spec.md new file mode 100644 index 0000000..e43e241 --- /dev/null +++ b/docs/specs/kvservice/spec.md @@ -0,0 +1,251 @@ +# KV Service Specification + +## User Story + +As a Redis client, I want to execute GET/SET/DEL commands over TCP so that I can store and retrieve key-value data in a distributed, fault-tolerant manner using strong consistency guarantees + +## Acceptance Criteria + +1. GIVEN the seshat binary starts WHEN the TCP server binds to port 6379 THEN Redis clients can successfully connect and establish sessions +2. 
GIVEN a Redis client sends a SET command WHEN the command is parsed by RespCodec THEN KvService receives a valid RespCommand::Set with key and value +3. GIVEN KvService receives a SET command WHEN it calls RaftNode::propose() THEN the operation is replicated to a quorum of nodes and committed via Raft consensus +4. GIVEN a SET operation is committed WHEN the StateMachine applies the entry THEN the key-value pair is persisted to the data_kv column family in RocksDB +5. GIVEN a Redis client sends a GET command WHEN the command is processed THEN it reads from the local StateMachine and returns the value in RESP format (eventual consistency - reads may be stale during leadership transitions with ~100ms window between is_leader() check and actual read operation) +6. GIVEN a Redis client sends a command WHEN the node is not the leader THEN it returns a MOVED error with the leader's node ID +7. GIVEN a Redis client sends a DEL command with one or more keys WHEN each key is committed via separate Raft proposals THEN the keys are removed from storage and the total deletion count is returned (matching Redis semantics where DEL is not atomic across multiple keys) +8. GIVEN a Redis client sends a DEL command with multiple keys WHEN a quorum loss occurs mid-operation THEN partial deletions are committed and the count reflects only successfully deleted keys +9. GIVEN a Redis client sends an EXISTS command WHEN checked against storage THEN it returns the count of existing keys +10. GIVEN a Redis client sends a PING command THEN it receives a PONG response (or the echo message if provided) +11. GIVEN any operation fails due to quorum loss WHEN processed THEN the client receives a NOQUORUM error response +12. GIVEN a key exceeds 256 bytes WHEN validated THEN the client receives 'ERR key too large' error +13. 
GIVEN a value exceeds 64KB WHEN validated THEN the client receives 'ERR value too large' error + +## Business Rules + +- Only Phase 1 commands are supported: GET, SET, DEL, EXISTS, PING +- All write operations (SET, DEL) must go through Raft consensus for strong consistency +- Read operations (GET, EXISTS) use eventual consistency model - reads are served from the local StateMachine and may be stale during leadership transitions (~100ms window). Linearizable reads (via ReadIndex) are deferred to Phase 4 +- Operations must achieve a majority quorum (2 out of 3 nodes) to commit +- Non-leader nodes must redirect clients to the current leader using MOVED errors +- Maximum key size is 256 bytes (enforced before Raft proposal) +- Maximum value size is 64KB (enforced before Raft proposal) +- Failed operations return appropriate RESP error messages with context +- The leader node forwards write operations through the Raft log +- Committed operations are applied to the StateMachine in log order +- PING commands bypass Raft consensus and return immediately +- Request timeout is 30 seconds (from configuration) +- Raft RPC timeout is 5 seconds (from configuration) +- **DEL multi-key operations**: Each key is processed with a separate Raft proposal (not atomic across keys), matching Redis behavior where partial success is possible +- **DEL partial failures**: If a proposal fails mid-operation, the deletion count reflects only successfully committed deletions up to that point + +## Scope + +### Included +- TCP server initialization in seshat main binary on port 6379 +- Integration with protocol-resp crate for RESP command parsing and response encoding +- KvService struct in kv crate for command routing and validation +- Command validation (key/value size limits, command syntax) +- Integration with raft crate (RaftNode::propose() for writes, read_local() for reads) +- Error handling for NOT_LEADER scenarios (return MOVED errors with leader ID) +- Error handling for NOQUORUM 
scenarios (cannot reach majority) +- Error handling for invalid commands (syntax errors, size violations) +- End-to-end request flow: TCP → RespCodec → KvService → RaftNode → StateMachine → RocksDB +- Response formatting in RESP protocol +- Support for GET, SET, DEL, EXISTS, PING commands +- Leader-only read path for strong consistency +- Write path through Raft consensus with quorum requirement +- Multi-key DEL with separate Raft proposals per key (non-atomic, allows partial success) + +### Excluded +- Advanced Redis commands beyond Phase 1 scope (TTL, EXPIRE, etc.) +- Multi-shard clustering (Phase 2 feature) +- Stale reads from followers (Phase 1 uses leader-only reads) +- Dynamic cluster management (adding/removing nodes during runtime - Phase 3) +- Observability metrics and tracing (Phase 4 feature) +- SQL interface (Phase 5 feature) +- Redis Cluster protocol support (CLUSTER commands) +- Redis pub/sub functionality +- Redis transactions (MULTI/EXEC) +- Redis pipelining optimization (Phase 1 processes commands sequentially) +- Atomic multi-key DEL operations (Phase 2 optimization - can add Operation::DelMulti if needed) + +## Dependencies + +- protocol-resp crate (100% complete - provides RespCodec, RespCommand, RespValue for parsing and encoding) +- **openraft migration (BLOCKING)** - must complete before KV service implementation begins +- raft crate (in progress - provides RaftNode wrapper with openraft async APIs, gRPC transport, StateMachine, MemStorage) +- storage crate (in progress - provides MemStorage implementation of raft::Storage trait, will migrate to RocksDB) +- common crate (provides shared types: NodeId, Error, Result, configuration types) +- tokio 1.x - async runtime for all I/O operations +- prost - protobuf serialization (replaces bincode) +- seshat binary (orchestration - needs TCP server on port 6379 that routes to KvService) + +## Technical Details + +### Components + +- **seshat/main.rs**: Main binary that starts TCP listener on port 6379 
+- **kv/src/service.rs**: KvService struct with handle_command() method +- **kv/src/handlers.rs**: Individual command handlers (handle_get, handle_set, handle_del, etc.) +- **kv/src/validation.rs**: Input validation (key/value size limits) +- **kv/src/error.rs**: KV-specific error types +- **seshat/src/server.rs**: TCP server using Tokio with RespCodec framing + +### Integration Points + +- **RespCodec::decode()**: From protocol-resp crate for parsing incoming RESP commands +- **RespCodec::encode()**: From protocol-resp crate for serializing RESP responses +- **RaftNode::propose(operation: Vec<u8>) -> Result<ClientWriteResponse>**: **ASYNC** - For write operations (SET, DEL) +- **RaftNode::get(key: &[u8]) -> Option<Vec<u8>>**: **SYNC** - Direct StateMachine access for reads (GET, EXISTS) - **WARNING: Does NOT check leadership internally** +- **RaftNode::is_leader() -> bool**: **ASYNC** - To check leadership status before reads +- **RaftNode::leader_id() -> Option<NodeId>**: **ASYNC** - To get current leader for MOVED errors +- **StateMachine::apply(entry: Entry)**: Applies committed operations to storage +- **Storage::get(cf: &str, key: &[u8])**: Reads from data_kv column family +- **Storage::put(cf: &str, key: &[u8], value: &[u8])**: Writes to data_kv column family + +### Error Handling + +- **MOVED errors when not leader**: Return `-MOVED {leader_id}\r\n` in RESP format +- **NOQUORUM errors when cannot reach majority**: Return `-(error) NOQUORUM\r\n` +- **Key size validation**: Return `-(error) ERR key too large\r\n` if key > 256 bytes +- **Value size validation**: Return `-(error) ERR value too large\r\n` if value > 64KB +- **Invalid command syntax**: Return `-(error) ERR unknown command\r\n` +- **Raft proposal timeout**: Return `-(error) ERR timeout\r\n` after 30 seconds +- **Storage errors**: Return `-(error) ERR storage failure\r\n` for RocksDB errors +- **Connection errors**: Close TCP connection and log error +- **DEL partial failure**: Return deletion count with error context (e.g., NOQUORUM 
after 2 of 3 keys deleted) + +### Data Flow + +#### Write Path + +1. Client sends 'SET foo bar' over TCP connection +2. Tokio TCP listener receives bytes +3. RespCodec::decode() parses to RespCommand::Set { key: b'foo', value: b'bar' } +4. KvService::handle_command(RespCommand::Set) called +5. KvService::validate_key_size(b'foo') - check <= 256 bytes +6. KvService::validate_value_size(b'bar') - check <= 64KB +7. KvService creates Operation::Set { key: b'foo', value: b'bar' } +8. Serialize operation with protobuf: let data = operation.encode_to_vec() +9. KvService calls raft_node.propose(data).await (async with openraft) +10. RaftNode checks leadership internally (openraft handles this) +11. openraft appends entry to local Raft log +12. openraft replicates entry to followers via gRPC (AppendEntries RPC) +13. Once majority commits, openraft calls StateMachine::apply(entry) +14. StateMachine deserializes operation from entry.data using protobuf +15. StateMachine::apply_set(key, value) calls Storage::put('data_kv', key, value) +16. RocksDB writes to data_kv column family +17. RaftNode::propose() returns Ok(ClientWriteResponse) +18. KvService returns RespValue::SimpleString('OK') +19. RespCodec::encode() serializes to '+OK\r\n' +20. Bytes sent back to client over TCP + +#### Read Path + +1. Client sends 'GET foo' over TCP connection +2. Tokio TCP listener receives bytes +3. RespCodec::decode() parses to RespCommand::Get { key: b'foo' } +4. KvService::handle_command(RespCommand::Get) called +5. KvService MUST check raft_node.is_leader().await (async with openraft) +6. If not leader: return NotLeader(leader_id) → format MOVED error +7. If leader: KvService calls RaftNode::get(b'foo') (sync - direct HashMap access) +8. RaftNode accesses StateMachine (in-memory HashMap in Phase 1) - NO internal leadership check +9. StateMachine::get(b'foo') returns Option> +10. If Some(value): return RespValue::BulkString(value) +11. If None: return RespValue::Null +12. 
RespCodec::encode() serializes to `$3\r\nbar\r\n` or `$-1\r\n` +13. Bytes sent back to client over TCP + +**IMPORTANT**: There is a race condition window (~100ms) between Step 5 (is_leader() check) and Step 7 (get() call) where leadership may change. This provides eventual consistency. Phase 4 will add ReadIndex for linearizable reads. + +#### Multi-Key DEL Path + +1. Client sends 'DEL key1 key2 key3' over TCP connection +2. Tokio TCP listener receives bytes +3. RespCodec::decode() parses to RespCommand::Del { keys: vec![b'key1', b'key2', b'key3'] } +4. KvService::handle_command(RespCommand::Del) called +5. KvService validates all key sizes <= 256 bytes +6. deleted_count = 0 +7. For each key: + - Create Operation::Del { key } + - Serialize operation with protobuf: operation.encode_to_vec() + - Call RaftNode::propose(serialized_op).await (async with openraft) + - Wait for commit + - If success: parse result (b"1" or b"0"), add to deleted_count + - If failure (NotLeader, NoQuorum, timeout): stop processing, return error with partial count +8. Return RespValue::Integer(deleted_count) +9. RespCodec::encode() serializes to `:N\r\n` (where N = successfully deleted keys) +10. 
Bytes sent back to client over TCP + +### Architecture Layers + +- **Layer 1 (Protocol)**: protocol-resp crate parses RESP commands and encodes responses +- **Layer 2 (Service)**: kv crate validates commands and routes to Raft +- **Layer 3 (Consensus)**: raft crate replicates writes and manages state machine +- **Layer 4 (Storage)**: storage crate persists data to RocksDB (or MemStorage in development) +- **Layer 5 (Transport)**: TCP server in seshat binary handles client connections + +### Performance Considerations + +- **TCP connection pooling**: Tokio handles concurrent connections efficiently +- **Zero-copy parsing**: RespCodec uses bytes::Bytes to avoid allocations +- **Async I/O**: All network operations use tokio::net::TcpListener and async/await +- **Batching**: Raft batches log entries internally for replication efficiency +- **Leader reads**: Avoid network round-trip by reading from local StateMachine +- **Connection limits**: Max 10,000 concurrent client connections (from configuration) +- **Request timeout**: 30 seconds prevents client connections from hanging indefinitely +- **Raft RPC timeout**: 5 seconds for internal node communication +- **Multi-key DEL performance**: N keys require N Raft consensus rounds (Phase 2 can optimize with batching) + +### Testing Strategy + +- **Unit tests**: KvService command handlers with mock RaftNode +- **Integration tests**: Full request flow with in-memory Raft cluster +- **End-to-end tests**: Redis client (redis-cli or redis-rs) against running cluster +- **Error scenario tests**: NOT_LEADER, NOQUORUM, size limit violations +- **Concurrency tests**: Multiple concurrent SET operations maintain consistency +- **Chaos tests**: Leader failure during SET operation (no data loss) +- **Performance tests**: redis-benchmark compatibility, measure ops/sec and latency +- **Multi-key DEL tests**: Verify correct counting with partial success scenarios + +## Success Criteria + +- redis-cli can connect to any node in the cluster 
on port 6379 +- GET/SET/DEL/EXISTS/PING commands execute successfully with correct RESP responses +- Write operations (SET, DEL) replicate to all 3 nodes via Raft consensus +- Read operations (GET, EXISTS) return consistent values from the leader +- Non-leader nodes correctly redirect clients with MOVED errors +- Key size and value size limits are enforced with appropriate error messages +- NOQUORUM errors returned when majority of nodes are unavailable +- Cluster passes end-to-end integration test: SET on node 1 → GET from node 2 returns same value +- Cluster survives leader failure: Kill leader → new leader elected → SET/GET continue working +- Performance targets met: >5,000 ops/sec, GET <5ms p99, SET <10ms p99 +- Multi-key DEL returns accurate count matching number of successfully deleted keys +- Multi-key DEL handles partial failures correctly (returns count + error for remaining keys) + +## Future Enhancements + +- **Phase 2**: Multi-shard support (route commands to appropriate shard based on key hash) +- **Phase 2**: Cross-shard commands (MGET, MSET) +- **Phase 2**: Batched DEL operations (Operation::DelMulti for atomic multi-key deletion) +- **Phase 3**: Dynamic membership (add/remove nodes via CLUSTER commands) +- **Phase 4**: Linearizable reads via ReadIndex (eliminates leadership race condition) +- **Phase 4**: Follower reads with bounded staleness (trade consistency for read scalability) +- **Phase 4**: Observability (OpenTelemetry metrics, distributed tracing) +- **Phase 5**: SQL interface (parallel service layer using same Raft/storage infrastructure) + +## Estimated Effort + +**11-13 hours** (updated from 10-12 hours to account for OpenRaft migration integration) +- 1 hour: KvService struct and basic setup +- 2 hours: Command handler implementation with async/await +- 2 hours: Validation logic and error types (including OpenRaft error mapping) +- 3 hours: Unit tests (including async test setup with #[tokio::test]) +- 2 hours: Integration tests with 
single-node Raft cluster +- 1 hour: Property tests for boundary conditions +- 1-2 hours: seshat binary integration and end-to-end testing +- +1 hour: OpenRaft async integration (updating all RaftNode calls with .await, error mapping) + +## Alignment + +This feature aligns with: Phase 1 MVP goals from product vision: Enable Redis clients to execute basic commands (GET, SET, DEL, EXISTS, PING) against a distributed, fault-tolerant 3-node cluster with strong consistency guarantees via Raft consensus diff --git a/docs/specs/openraft/design.json b/docs/specs/openraft/design.json new file mode 100644 index 0000000..d9f24d2 --- /dev/null +++ b/docs/specs/openraft/design.json @@ -0,0 +1,443 @@ +{ + "migration_overview": { + "description": "Technical design for migrating from raft-rs to openraft in simplified scope", + "version": "1.0.0", + "created_at": "2025-10-26", + "goals": [ + "Replace raft-rs with openraft to eliminate prost version conflict", + "Keep MemStorage in-memory without RocksDB dependencies", + "Stub network transport for future gRPC integration", + "No KV service layer integration in this migration", + "Maintain existing test coverage (85+ tests)" + ], + "constraints": [ + "All storage remains in-memory using RwLock", + "Network transport is stubbed (no actual gRPC yet)", + "Maintain compatibility with existing StateMachine", + "Preserve idempotency checks and behavior" + ] + }, + + "type_system": { + "description": "OpenRaft type configuration and mapping from raft-rs", + "type_config": { + "file": "crates/raft/src/types.rs", + "implementation": { + "NodeId": "u64 (same as raft-rs node IDs)", + "Node": "BasicNode { addr: String } (simple node info)", + "Entry": "LogEntry (replaces eraftpb::Entry)", + "SnapshotData": "Vec (replaces eraftpb::Snapshot.data)", + "AsyncRuntime": "TokioRuntime (openraft built-in)" + }, + "type_definition": "pub struct RaftTypeConfig;\n\nimpl openraft::RaftTypeConfig for RaftTypeConfig {\n type NodeId = u64;\n type Node = 
BasicNode;\n type Entry = LogEntry;\n type SnapshotData = Vec;\n type AsyncRuntime = TokioRuntime;\n}" + }, + "request_response_types": { + "Request": { + "description": "Wraps Operation for Raft proposals", + "fields": ["data: Vec"], + "serde": true + }, + "Response": { + "description": "Result from state machine apply", + "fields": ["result: Vec"], + "serde": true + }, + "BasicNode": { + "description": "Simple node metadata", + "fields": ["addr: String"], + "serde": true + } + }, + "conversions": { + "eraftpb_to_openraft": { + "Entry": "LogEntry::new(log_id, Request { data: entry.data })", + "HardState": "Vote { term, node_id: vote }, commit: log_id", + "ConfState": "Membership::new(voters, learners)", + "Message": "Not needed - openraft handles internally" + }, + "openraft_to_storage": { + "LogId": "{ term, index }", + "Vote": "{ term, node_id }", + "Membership": "{ voters: BTreeSet, learners: BTreeSet }" + } + } + }, + + "storage_layer": { + "description": "Mapping MemStorage to openraft storage traits", + "traits_to_implement": [ + "RaftLogReader", + "RaftSnapshotBuilder", + "RaftStorage" + ], + "struct_definition": { + "name": "OpenRaftMemStorage", + "fields": { + "vote": "RwLock>>", + "log": "RwLock>>", + "snapshot": "RwLock>>", + "state_machine": "RwLock", + "membership": "RwLock>" + } + }, + "trait_implementations": { + "RaftLogReader": { + "methods": { + "get_log_state": "async fn() -> Result>", + "try_get_log_entries": "async fn(range) -> Result>>", + "read_vote": "async fn() -> Result>>" + }, + "implementation_notes": "Read from RwLock fields, convert to openraft types" + }, + "RaftSnapshotBuilder": { + "methods": { + "build_snapshot": "async fn() -> Result>" + }, + "implementation_notes": "Call StateMachine::snapshot(), wrap in openraft Snapshot" + }, + "RaftStorage": { + "methods": { + "save_vote": "async fn(vote: &Vote) -> Result<()>", + "append": "async fn(entries: &[LogEntry]) -> Result<()>", + "delete_conflict_logs_since": "async fn(log_id: LogId) 
-> Result<()>", + "purge_logs_upto": "async fn(log_id: LogId) -> Result<()>", + "apply_to_state_machine": "async fn(entries: &[LogEntry]) -> Result>", + "begin_receiving_snapshot": "async fn() -> Result>", + "install_snapshot": "async fn(meta, snapshot: Box) -> Result<()>", + "get_current_snapshot": "async fn() -> Result>>", + "get_membership_config": "async fn() -> Result>" + }, + "implementation_notes": "Bridge to existing MemStorage pattern, maintain idempotency" + } + }, + "migration_mapping": { + "set_hard_state": "save_vote() + internal commit tracking", + "set_conf_state": "Update membership field", + "append_entries": "append() method", + "entries(low, high)": "try_get_log_entries()", + "term(idx)": "Extract from log entry at index", + "first_index/last_index": "Calculate from BTreeMap keys", + "snapshot()": "get_current_snapshot()" + } + }, + + "state_machine": { + "description": "Integrate existing StateMachine with openraft", + "trait": "openraft::RaftStateMachine", + "wrapper_struct": { + "name": "OpenRaftStateMachine", + "fields": ["inner: Arc>"] + }, + "methods": { + "apply": { + "signature": "async fn(&mut self, entries: &[LogEntry]) -> Result>", + "implementation": "for entry in entries {\n let data = &entry.payload.data;\n let result = self.inner.write().unwrap().apply(entry.log_id.index, data)?;\n responses.push(Response { result });\n}" + }, + "snapshot": { + "signature": "async fn(&mut self) -> Result>", + "implementation": "self.inner.read().unwrap().snapshot()" + }, + "restore": { + "signature": "async fn(&mut self, snapshot: &[u8]) -> Result<()>", + "implementation": "self.inner.write().unwrap().restore(snapshot)" + } + }, + "idempotency": { + "description": "Preserve existing idempotency checks", + "mechanism": "StateMachine::apply() already checks index > last_applied", + "preserved_behavior": "Reject duplicate or out-of-order entries" + } + }, + + "raft_node_wrapper": { + "description": "Migrate RaftNode from RawNode to openraft::Raft", + 
"struct_changes": { + "old": "raw_node: RawNode", + "new": "raft: Raft", + "storage": "storage: Arc", + "config": "config: Arc" + }, + "method_migration": { + "new()": { + "changes": "Create openraft Config, build Raft instance", + "implementation": "let config = Config {\n election_timeout_min: 150,\n election_timeout_max: 300,\n heartbeat_interval: 50,\n ..Default::default()\n};\nlet raft = Raft::new(id, Arc::new(config), network, storage).await?;" + }, + "tick()": { + "changes": "Remove - openraft handles timing internally", + "replacement": "Automatic via tokio runtime" + }, + "propose()": { + "changes": "Use raft.client_write() instead", + "implementation": "let request = ClientWriteRequest::new(Request { data });\nself.raft.client_write(request).await?" + }, + "handle_ready()": { + "changes": "Remove - openraft handles this internally", + "replacement": "Callbacks in RaftStorage trait" + }, + "is_leader()": { + "changes": "Use raft.is_leader()", + "implementation": "self.raft.is_leader().await" + }, + "leader_id()": { + "changes": "Use raft.current_leader()", + "implementation": "self.raft.current_leader().await" + }, + "get()": { + "changes": "Direct access to storage.state_machine", + "implementation": "self.storage.state_machine.read().unwrap().get(key)" + } + }, + "api_compatibility": { + "description": "Maintain backward compatibility where possible", + "breaking_changes": [ + "Methods become async (tick, propose, is_leader, leader_id)", + "handle_ready() removed - logic moved to storage callbacks", + "Message handling moved to network layer" + ], + "preserved": [ + "new() signature (add async)", + "get() remains synchronous", + "Same error types via conversion" + ] + } + }, + + "network_stub": { + "description": "Minimal RaftNetwork implementation for future gRPC", + "struct": { + "name": "StubNetwork", + "fields": ["node_id: u64"] + }, + "trait_implementation": { + "send_append_entries": "async fn() -> Result { Ok(Default::default()) }", + "send_vote": 
"async fn() -> Result { Ok(Default::default()) }", + "send_install_snapshot": "async fn() -> Result { Ok(Default::default()) }" + }, + "future_integration": { + "description": "Replace stub with actual gRPC transport", + "location": "crates/raft/src/transport.rs", + "reuse": "Existing protobuf definitions and client pool" + } + }, + + "dependencies": { + "description": "Cargo.toml changes", + "remove": [ + "raft = { version = \"0.7\", ... }", + "prost-old = { package = \"prost\", version = \"0.11\" }", + "slog = \"2\"" + ], + "add": [ + "openraft = { version = \"0.10\", features = [\"tokio\"] }", + "tracing = \"0.1\"", + "async-trait = \"0.1\"" + ], + "keep": [ + "tokio = { version = \"1\", features = [\"full\"] }", + "serde = { version = \"1\", features = [\"derive\"] }", + "bincode = \"1.3\"", + "tonic = \"0.14\"", + "prost = \"0.14\"" + ], + "rationale": { + "openraft": "Modern async Raft with no prost conflicts", + "tracing": "Replace slog, openraft uses tracing", + "async-trait": "Required for openraft trait implementations" + } + }, + + "test_migration": { + "description": "Strategy for migrating 85+ tests", + "categories": { + "storage_tests": { + "count": "~50 tests", + "changes": "Update to use OpenRaftMemStorage API", + "example": "test_append_entries() -> async test using storage.append()" + }, + "node_tests": { + "count": "~35 tests", + "changes": "Make async, remove tick/handle_ready tests", + "example": "test_propose() -> async test using raft.client_write()" + }, + "state_machine_tests": { + "count": "~15 tests", + "changes": "Minimal - wrap in async runtime", + "example": "Keep existing apply() tests" + } + }, + "test_utilities": { + "create_test_cluster": "async fn() -> (Raft, Raft, Raft)", + "wait_for_leader": "async fn(raft: &Raft) -> u64", + "propose_and_wait": "async fn(raft: &Raft, data: Vec) -> Response" + }, + "removed_tests": [ + "tick() tests - openraft handles internally", + "handle_ready() tests - no longer exists", + "Message processing 
- moved to network layer" + ], + "new_tests": [ + "Async proposal handling", + "Automatic leader election", + "Membership changes via openraft API" + ] + }, + + "implementation_phases": { + "phase_1": { + "name": "Type System Setup", + "duration": "2-3 hours", + "files": [ + "crates/raft/src/types.rs (new)", + "crates/raft/Cargo.toml" + ], + "tasks": [ + "Define RaftTypeConfig struct", + "Create Request/Response types", + "Define BasicNode type", + "Add openraft dependencies", + "Remove raft-rs dependencies" + ], + "tests": "Compile check, type construction tests", + "risks": "None - foundational work" + }, + "phase_2": { + "name": "Storage Layer Implementation", + "duration": "4-5 hours", + "files": [ + "crates/storage/src/openraft_storage.rs (new)", + "crates/storage/src/lib.rs" + ], + "tasks": [ + "Create OpenRaftMemStorage struct", + "Implement RaftLogReader trait", + "Implement RaftSnapshotBuilder trait", + "Implement RaftStorage trait", + "Map existing MemStorage logic" + ], + "tests": "Port ~50 storage tests to async", + "risks": "Complex trait mapping, ensure idempotency preserved" + }, + "phase_3": { + "name": "State Machine Integration", + "duration": "2-3 hours", + "files": [ + "crates/raft/src/state_machine_wrapper.rs (new)", + "crates/raft/src/state_machine.rs" + ], + "tasks": [ + "Create OpenRaftStateMachine wrapper", + "Implement RaftStateMachine trait", + "Preserve idempotency checks", + "Connect to OpenRaftMemStorage" + ], + "tests": "Port ~15 state machine tests", + "risks": "Ensure apply() index checking works correctly" + }, + "phase_4": { + "name": "Network Stub", + "duration": "1-2 hours", + "files": [ + "crates/raft/src/network_stub.rs (new)" + ], + "tasks": [ + "Create StubNetwork struct", + "Implement RaftNetwork trait with stubs", + "Add TODO comments for gRPC integration", + "Configure for single-node testing" + ], + "tests": "Basic network instantiation tests", + "risks": "None - just stubs" + }, + "phase_5": { + "name": "RaftNode 
Migration", + "duration": "4-5 hours", + "files": [ + "crates/raft/src/node.rs", + "crates/raft/src/lib.rs" + ], + "tasks": [ + "Replace RawNode with openraft::Raft", + "Update new() to create Raft instance", + "Migrate propose() to client_write()", + "Update is_leader()/leader_id()", + "Remove tick() and handle_ready()" + ], + "tests": "Port ~35 node tests to async", + "risks": "API breaking changes, async migration complexity" + }, + "phase_6": { + "name": "Integration and Cleanup", + "duration": "2-3 hours", + "files": [ + "crates/raft/tests/integration.rs", + "crates/*/src/lib.rs" + ], + "tasks": [ + "Fix compilation errors across crates", + "Update integration tests", + "Remove unused imports/code", + "Update documentation", + "Verify test coverage" + ], + "tests": "Full test suite run", + "risks": "Hidden dependencies on raft-rs behavior" + } + }, + + "risk_mitigation": { + "async_complexity": { + "risk": "Converting sync code to async increases complexity", + "mitigation": "Use tokio::runtime::Handle for sync contexts, block_on sparingly" + }, + "api_compatibility": { + "risk": "Breaking changes affect dependent crates", + "mitigation": "Create compatibility layer for critical APIs during transition" + }, + "test_coverage": { + "risk": "Lost test coverage during migration", + "mitigation": "Track test count per phase, ensure 85+ tests remain" + }, + "idempotency": { + "risk": "Loss of idempotency checks in state machine", + "mitigation": "Keep existing StateMachine::apply() logic unchanged" + }, + "performance": { + "risk": "Async overhead impacts performance", + "mitigation": "Profile critical paths, optimize after migration complete" + } + }, + + "validation_criteria": { + "functional": [ + "Single-node cluster starts successfully", + "Proposals are accepted and applied", + "State machine maintains consistency", + "Get operations return correct values", + "85+ tests pass" + ], + "technical": [ + "No prost version conflicts", + "Clean compilation with no 
warnings", + "All crates compile together", + "No runtime panics in tests" + ], + "migration": [ + "MemStorage remains in-memory only", + "No RocksDB dependencies added", + "StateMachine idempotency preserved", + "Network transport properly stubbed" + ] + }, + + "future_work": { + "description": "Post-migration enhancements", + "items": [ + "Implement actual gRPC transport using existing proto files", + "Add RocksDB storage backend", + "Integrate with KV service layer", + "Add cluster membership management", + "Implement log compaction and snapshots", + "Add metrics and observability" + ] + } +} \ No newline at end of file diff --git a/docs/specs/openraft/design.md b/docs/specs/openraft/design.md new file mode 100644 index 0000000..c4215a6 --- /dev/null +++ b/docs/specs/openraft/design.md @@ -0,0 +1,355 @@ +# Technical Design: OpenRaft Migration + +## Overview + +This migration replaces the existing `raft-rs` implementation with `openraft`, focusing on a simplified, in-memory storage approach for Seshat's distributed consensus layer. 
The primary goals are: + +- Eliminate prost version conflicts +- Maintain existing in-memory storage semantics +- Provide a clean, async-first implementation +- Preserve existing state machine behavior +- Create a stub network transport for future gRPC integration + +## Architecture + +### System Overview + +The migration shifts from `raft-rs`'s `RawNode` to `openraft::Raft`, introducing: + +- Async-first design +- Cleaner trait-based storage interface +- Built-in leader election and log replication +- Simplified configuration and runtime management + +### Component Architecture + +``` ++-------------------+ +| Client Operations | ++--------+----------+ + | + v ++--------+----------+ +| RaftNode Wrapper | +| (openraft::Raft) | ++--------+----------+ + | + v ++--------+----------+ +| OpenRaftMemStorage| +| (Storage Traits) | ++--------+----------+ + | + v ++--------+----------+ +| StateMachine | +| (Idempotent Apply)| ++-------------------+ +``` + +### Crate Structure + +- `crates/raft/`: Core Raft node and configuration +- `crates/storage/`: In-memory storage implementation +- `crates/common/`: Shared types and utilities + +## Type System Design + +### OpenRaft Type Configuration + +```rust +pub struct RaftTypeConfig; + +impl openraft::RaftTypeConfig for RaftTypeConfig { + type NodeId = u64; + type Node = BasicNode; + type Entry = LogEntry; + type SnapshotData = Vec; + type AsyncRuntime = TokioRuntime; +} +``` + +### Request Type Definition + +The `Request` type wraps serialized operations from the KV/SQL service layers: + +```rust +/// Request wrapper for operations submitted to Raft. +/// +/// This type bridges the service layer (KV/SQL) and the Raft layer by +/// wrapping serialized Operation bytes in a protobuf-compatible format. +#[derive(Debug, Clone, prost::Message)] +pub struct Request { + /// Serialized Operation from KV or SQL service. 
+ /// + /// Format: protobuf-encoded Operation (e.g., Operation::Set, Operation::Del) + /// The Raft layer treats this as opaque bytes - deserialization happens + /// in the StateMachine during apply(). + #[prost(bytes = "vec", tag = "1")] + pub operation_bytes: Vec, +} + +impl Request { + /// Create a new Request from serialized operation bytes. + pub fn new(operation_bytes: Vec) -> Self { + Self { operation_bytes } + } +} + +// Conversion from KV Service Operation to Request +impl From for Request { + fn from(op: Operation) -> Self { + Request { + operation_bytes: op.encode_to_vec(), + } + } +} +``` + +**Type Hierarchy:** + +``` +KV Service Operation (e.g., Operation::Set { key, value }) + ↓ (serialize with prost) +Request { operation_bytes: Vec } + ↓ (wrap in LogEntry) +LogEntry { log_id, data: Request } + ↓ (serialize with prost) +Vec (stored in RocksDB via storage crate) +``` + +### Type Conversions + +| raft-rs Type | openraft Type | Conversion Strategy | +|--------------|--------------|---------------------| +| `eraftpb::Entry` | `LogEntry` | Create with `log_id` and `data` | +| `eraftpb::HardState` | `Vote` + `LogId` | Extract term, node_id, commit index | +| `eraftpb::ConfState` | `Membership` | Convert voters/learners to `BTreeSet` | + +## Component Specifications + +### Storage Layer (OpenRaftMemStorage) + +Implements three critical openraft storage traits: +- `RaftLogReader`: Read log entries and vote state +- `RaftSnapshotBuilder`: Create snapshots +- `RaftStorage`: Mutation and state tracking + +**Key Traits Implementation**: + +```rust +struct OpenRaftMemStorage { + vote: RwLock>>, + log: RwLock>>, + snapshot: RwLock>>, + state_machine: RwLock, + membership: RwLock> +} +``` + +### State Machine Wrapper + +```rust +struct OpenRaftStateMachine { + inner: Arc> +} + +impl RaftStateMachine for OpenRaftStateMachine { + async fn apply(&mut self, entries: &[LogEntry]) -> Result> { + let mut responses = Vec::new(); + for entry in entries { + let data = 
&entry.payload.data; + let result = self.inner.write().unwrap() + .apply(entry.log_id.index, data)?; + responses.push(Response { result }); + } + Ok(responses) + } +} +``` + +### Raft Node Migration + +**Key Changes**: +- Async methods +- `client_write()` replaces `propose()` +- Removed `tick()` and `handle_ready()` +- Direct state machine access + +```rust +struct RaftNode { + raft: Raft, + storage: Arc +} + +impl RaftNode { + async fn propose(&self, data: Vec) -> Result<()> { + let request = ClientWriteRequest::new(Request { data }); + self.raft.client_write(request).await + } + + async fn is_leader(&self) -> bool { + self.raft.is_leader().await + } +} +``` + +### Network Stub + +A placeholder implementation for future gRPC transport: + +```rust +struct StubNetwork { + node_id: u64 +} + +#[async_trait] +impl RaftNetwork for StubNetwork { + async fn send_append_entries(&self, _req: AppendEntriesRequest) + -> Result { + Ok(Default::default()) + } + // Similar stubs for vote and snapshot +} +``` + +## Error Handling + +### Error Type Mapping + +OpenRaft errors must be mapped to application-level errors for proper handling in the KV/SQL service layers: + +```rust +use openraft::error::{ClientWriteError, RaftError, StorageError}; + +/// Raft layer error type +#[derive(Debug, thiserror::Error)] +pub enum RaftError { + #[error("Not the leader (current leader: {leader_id:?})")] + NotLeader { leader_id: Option }, + + #[error("No quorum available")] + NoQuorum, + + #[error("Storage error: {0}")] + Storage(#[from] StorageError), + + #[error("Network error: {0}")] + Network(String), + + #[error("Serialization error: {0}")] + Serialization(String), + + #[error("Deserialization error: {0}")] + Deserialization(String), + + #[error("OpenRaft error: {0}")] + OpenRaft(String), +} + +/// Convert OpenRaft ClientWriteError to RaftError +impl From> for RaftError { + fn from(err: ClientWriteError) -> Self { + match err { + ClientWriteError::ForwardToLeader(forward) => { + 
RaftError::NotLeader { + leader_id: forward.leader_id, + } + } + ClientWriteError::ChangeMembershipError(e) => { + RaftError::OpenRaft(e.to_string()) + } + } + } +} + +/// Convert OpenRaft RaftError to our RaftError +impl From> for RaftError { + fn from(err: openraft::error::RaftError) -> Self { + RaftError::OpenRaft(err.to_string()) + } +} +``` + +### Error Propagation Chain + +``` +OpenRaft Error Types + ↓ (map in raft crate) +RaftError (raft crate) + ↓ (map in service layer) +KvServiceError (kv crate) + ↓ (format) +RespValue::Error (protocol-resp crate) + ↓ (encode) +"-(error) ERR message\r\n" (client) +``` + +**Example Error Flow:** + +1. Client sends SET command +2. RaftNode::propose() calls raft.client_write() +3. OpenRaft returns ClientWriteError::ForwardToLeader { leader_id: Some(2) } +4. Converted to RaftError::NotLeader { leader_id: Some(2) } +5. KV Service maps to KvServiceError::NotLeader(2) +6. Formatted as RespValue::Error("-MOVED 2\r\n") +7. Client receives MOVED error with leader ID + +## Dependencies + +**Added**: +- `openraft = "0.10"` +- `async-trait = "0.1"` +- `tracing = "0.1"` + +**Removed**: +- `raft = "0.7"` +- `prost-old = "0.11"` +- `slog` + +## Implementation Phases + +| Phase | Description | Effort | Risks | +|-------|-------------|--------|-------| +| 1: Type System | Define RaftTypeConfig, types | 2-3h | Low | +| 2: Storage Layer | Implement storage traits | 4-5h | Medium | +| 3: State Machine | Wrap and integrate | 2-3h | Medium | +| 4: Network Stub | Create placeholder transport | 1-2h | None | +| 5: RaftNode Migration | Update node wrapper | 4-5h | High | +| 6: Integration | Cleanup and test | 2-3h | Medium | + +**Total Estimated Effort**: 15-21 hours + +## Risks & Mitigations + +| Risk | Impact | Mitigation | +|------|--------|------------| +| Async Complexity | Medium | Use `tokio::runtime::Handle` | +| API Breaking Changes | High | Create compatibility layer | +| Test Coverage | Medium | Track and maintain 85+ tests | +| 
Idempotency Loss | High | Preserve existing `apply()` logic | +| Performance | Low | Profile after migration | + +## Success Criteria + +- [ ] No prost version conflicts +- [ ] 85+ tests passing +- [ ] Single-node cluster functional +- [ ] MemStorage remains in-memory +- [ ] Idempotent state machine behavior +- [ ] Clean, async-first implementation + +## Future Work + +- Full gRPC transport implementation +- RocksDB storage backend +- KV service layer integration +- Cluster membership management +- Log compaction and snapshots + +--- + +**Created:** 2025-10-26 +**Feature:** OpenRaft Migration +**Status:** Ready for Implementation +**Estimated Effort:** 15-21 hours \ No newline at end of file diff --git a/docs/specs/openraft/plan.json b/docs/specs/openraft/plan.json new file mode 100644 index 0000000..c399ecb --- /dev/null +++ b/docs/specs/openraft/plan.json @@ -0,0 +1,949 @@ +{ + "feature": "openraft-migration", + "description": "Migration from raft-rs to openraft library", + "type": "library_migration", + "estimated_effort_hours": "15-21", + "total_tasks": 24, + "total_phases": 6, + + "phases": { + "type_system": { + "name": "Type System & Configuration", + "description": "Define openraft type configuration and conversion utilities", + "order": 1, + "dependencies": [], + "estimated_hours": "2-3", + "tasks": ["define_type_config", "create_conversions", "test_conversions"], + "files_affected": [ + "crates/raft/src/types.rs", + "crates/raft/Cargo.toml" + ], + "success_criteria": [ + "RaftTypeConfig compiles with all required associated types", + "Conversion functions for Entry, HardState, ConfState work correctly", + "Property tests verify round-trip conversions" + ] + }, + + "storage_layer": { + "name": "Storage Layer Migration", + "description": "Adapt MemStorage to openraft storage traits", + "order": 2, + "dependencies": ["type_system"], + "estimated_hours": "4-5", + "tasks": [ + "implement_log_reader", + "implement_snapshot_builder", + "implement_storage_trait", 
+ "migrate_storage_tests" + ], + "files_affected": [ + "crates/storage/src/openraft_storage.rs", + "crates/storage/src/lib.rs", + "crates/storage/Cargo.toml" + ], + "success_criteria": [ + "RaftLogReader trait fully implemented", + "RaftSnapshotBuilder creates snapshots correctly", + "RaftStorage trait handles all mutations", + "85+ storage tests migrated and passing" + ] + }, + + "state_machine": { + "name": "State Machine Integration", + "description": "Wrap existing StateMachine with openraft trait", + "order": 3, + "dependencies": ["type_system"], + "estimated_hours": "2-3", + "tasks": [ + "create_wrapper", + "implement_apply", + "implement_snapshot", + "test_idempotency" + ], + "files_affected": [ + "crates/raft/src/state_machine_wrapper.rs", + "crates/raft/src/state_machine.rs" + ], + "success_criteria": [ + "OpenRaftStateMachine wrapper delegates correctly", + "apply() preserves idempotency checks (index > last_applied)", + "Snapshot creation/restoration works with bincode", + "All idempotency tests pass" + ] + }, + + "network_stub": { + "name": "Network Stub Implementation", + "description": "Create minimal RaftNetwork implementation for testing", + "order": 4, + "dependencies": ["type_system"], + "estimated_hours": "1-2", + "tasks": [ + "define_stub_network", + "implement_send_methods", + "test_stub" + ], + "files_affected": [ + "crates/raft/src/network_stub.rs" + ], + "success_criteria": [ + "StubNetwork implements RaftNetwork trait", + "All send methods return Ok() with logging", + "Network instantiation tests pass" + ] + }, + + "node_migration": { + "name": "RaftNode Migration", + "description": "Migrate from raft::RawNode to openraft::Raft", + "order": 5, + "dependencies": ["storage_layer", "state_machine", "network_stub"], + "estimated_hours": "4-5", + "tasks": [ + "update_dependencies", + "migrate_initialization", + "migrate_propose", + "migrate_api", + "migrate_node_tests" + ], + "files_affected": [ + "crates/raft/src/node.rs", + 
"crates/raft/src/lib.rs", + "crates/raft/Cargo.toml" + ], + "success_criteria": [ + "No prost version conflicts (cargo tree shows single prost 0.14)", + "RaftNode initializes with openraft::Raft", + "propose() works as async client_write()", + "is_leader(), leader_id() use openraft APIs", + "All node tests migrated and passing" + ] + }, + + "integration": { + "name": "Integration & Cleanup", + "description": "Final testing, verification, and code cleanup", + "order": 6, + "dependencies": ["node_migration"], + "estimated_hours": "2-3", + "tasks": [ + "integration_tests", + "verify_prost_conflict_resolved", + "cleanup_old_code", + "update_docs" + ], + "files_affected": [ + "crates/raft/tests/integration_tests.rs", + "crates/raft/README.md", + "crates/storage/README.md" + ], + "success_criteria": [ + "End-to-end integration tests pass", + "cargo tree shows no prost conflicts", + "No raft-rs references remain in codebase", + "Documentation updated to reflect openraft" + ] + } + }, + + "tasks": { + "define_type_config": { + "id": "type_system_1", + "name": "Define RaftTypeConfig", + "phase": "type_system", + "order": 1, + "estimated_hours": "0.5-1", + "description": "Define RaftTypeConfig struct with all required associated types for openraft", + "files": ["crates/raft/src/types.rs"], + "dependencies": [], + "tdd_steps": [ + "Write test for NodeId type (should be u64)", + "Write test for BasicNode struct construction", + "Write test for Request/Response types with serde", + "Implement RaftTypeConfig with all associated types", + "Verify compilation and type constraints" + ], + "acceptance_criteria": [ + "RaftTypeConfig implements openraft::RaftTypeConfig", + "All associated types compile correctly", + "Type construction tests pass" + ], + "implementation_notes": [ + "NodeId = u64 (matches existing raft-rs node IDs)", + "Node = BasicNode { addr: String }", + "Entry = LogEntry", + "SnapshotData = Vec", + "AsyncRuntime = TokioRuntime" + ] + }, + + "create_conversions": { + 
"id": "type_system_2", + "name": "Create Type Conversions", + "phase": "type_system", + "order": 2, + "estimated_hours": "1-1.5", + "description": "Create conversion functions between eraftpb and openraft types", + "files": ["crates/raft/src/types.rs"], + "dependencies": ["define_type_config"], + "tdd_steps": [ + "Write test for eraftpb::Entry → LogEntry conversion", + "Write test for eraftpb::HardState → Vote + LogId conversion", + "Write test for eraftpb::ConfState → Membership conversion", + "Implement From/Into traits for all conversions", + "Test edge cases (empty voters, max term values)" + ], + "acceptance_criteria": [ + "Entry conversion preserves index, term, data", + "HardState splits into Vote and commit index correctly", + "ConfState converts voters/learners to BTreeSet", + "All conversion tests pass" + ], + "implementation_notes": [ + "Use LogEntry::new(log_id, Request { data })", + "Extract Vote { term, node_id } from HardState", + "Map ConfState.voters/learners to Membership" + ] + }, + + "test_conversions": { + "id": "type_system_3", + "name": "Property Test Conversions", + "phase": "type_system", + "order": 3, + "estimated_hours": "0.5-1", + "description": "Add property tests for round-trip type conversions", + "files": ["crates/raft/src/types.rs"], + "dependencies": ["create_conversions"], + "tdd_steps": [ + "Add proptest dependency to Cargo.toml", + "Write property test for Entry round-trip (openraft → eraftpb → openraft)", + "Write property test for HardState/Vote round-trip", + "Write property test for ConfState/Membership round-trip", + "Verify no data loss in conversions" + ], + "acceptance_criteria": [ + "Property tests pass for 1000+ random inputs", + "Round-trip conversions preserve all data", + "Edge cases handled (empty sets, u64::MAX)" + ], + "implementation_notes": [ + "Use proptest for generating random valid types", + "Test boundary values (0, u64::MAX)", + "Verify no panics on malformed data" + ] + }, + + "implement_log_reader": { + 
"id": "storage_layer_1", + "name": "Implement RaftLogReader", + "phase": "storage_layer", + "order": 1, + "estimated_hours": "1.5-2", + "description": "Implement RaftLogReader trait for MemStorage to read log entries and vote state", + "files": ["crates/storage/src/openraft_storage.rs", "crates/storage/src/lib.rs"], + "dependencies": ["define_type_config"], + "tdd_steps": [ + "Write test for get_log_state() returning last_purged and last_log_id", + "Write test for try_get_log_entries() with range queries", + "Write test for read_vote() returning current vote state", + "Implement OpenRaftMemStorage struct with RwLock fields", + "Implement RaftLogReader trait methods", + "Test concurrent read access" + ], + "acceptance_criteria": [ + "get_log_state() returns correct LogState", + "try_get_log_entries() handles ranges correctly", + "read_vote() returns None initially, Some(vote) after save", + "Concurrent reads don't deadlock" + ], + "implementation_notes": [ + "Use RwLock>> for log", + "Calculate log state from BTreeMap keys/values", + "Use RwLock>> for vote storage" + ] + }, + + "implement_snapshot_builder": { + "id": "storage_layer_2", + "name": "Implement RaftSnapshotBuilder", + "phase": "storage_layer", + "order": 2, + "estimated_hours": "1-1.5", + "description": "Implement RaftSnapshotBuilder trait for creating snapshots", + "files": ["crates/storage/src/openraft_storage.rs"], + "dependencies": ["implement_log_reader"], + "tdd_steps": [ + "Write test for build_snapshot() creating valid Snapshot", + "Write test verifying snapshot includes state machine data", + "Write test for snapshot metadata (last_log_id, membership)", + "Implement build_snapshot() delegating to StateMachine::snapshot()", + "Wrap result in openraft Snapshot type", + "Test snapshot data integrity with bincode" + ], + "acceptance_criteria": [ + "build_snapshot() creates Snapshot with correct metadata", + "Snapshot data contains serialized state machine", + "Snapshot can be deserialized 
correctly", + "Multiple snapshots work correctly" + ], + "implementation_notes": [ + "Call self.state_machine.read().unwrap().snapshot()", + "Create SnapshotMeta with last_log_id and membership", + "Store snapshot in RwLock>>" + ] + }, + + "implement_storage_trait": { + "id": "storage_layer_3", + "name": "Implement RaftStorage Trait", + "phase": "storage_layer", + "order": 3, + "estimated_hours": "2-2.5", + "description": "Implement main RaftStorage trait with all mutation methods", + "files": ["crates/storage/src/openraft_storage.rs"], + "dependencies": ["implement_snapshot_builder"], + "tdd_steps": [ + "Write test for save_vote() persisting vote", + "Write test for append() adding entries to log", + "Write test for delete_conflict_logs_since() removing entries", + "Write test for purge_logs_upto() truncating old entries", + "Write test for apply_to_state_machine() applying entries", + "Write test for install_snapshot() restoring state", + "Implement all RaftStorage methods", + "Test atomicity of operations" + ], + "acceptance_criteria": [ + "save_vote() persists vote correctly", + "append() maintains log order", + "delete_conflict_logs_since() removes correct range", + "purge_logs_upto() keeps required entries", + "apply_to_state_machine() preserves idempotency", + "install_snapshot() restores state correctly" + ], + "implementation_notes": [ + "Maintain idempotency check: index > last_applied", + "Use BTreeMap::split_off for efficient range operations", + "Delegate state machine apply to StateMachine::apply()", + "Handle snapshot restoration via StateMachine::restore()" + ] + }, + + "migrate_storage_tests": { + "id": "storage_layer_4", + "name": "Migrate Storage Tests", + "phase": "storage_layer", + "order": 4, + "estimated_hours": "1-1.5", + "description": "Migrate existing 85+ MemStorage tests to openraft API", + "files": ["crates/storage/src/lib.rs"], + "dependencies": ["implement_storage_trait"], + "tdd_steps": [ + "Convert all sync tests to async using 
#[tokio::test]", + "Update MemStorage API calls to OpenRaftMemStorage", + "Replace raft::Storage trait calls with openraft traits", + "Update assertions for openraft types", + "Verify all 85+ tests pass" + ], + "acceptance_criteria": [ + "All storage tests converted to async", + "85+ tests passing with openraft", + "Test coverage maintained or improved", + "No flaky tests due to async timing" + ], + "implementation_notes": [ + "Use tokio::test macro for async tests", + "Update test helpers to be async fn", + "Replace eraftpb types with openraft types", + "Keep test logic/assertions identical" + ] + }, + + "create_wrapper": { + "id": "state_machine_1", + "name": "Create StateMachine Wrapper", + "phase": "state_machine", + "order": 1, + "estimated_hours": "0.5-1", + "description": "Create OpenRaftStateMachine wrapper around existing StateMachine", + "files": ["crates/raft/src/state_machine_wrapper.rs"], + "dependencies": ["define_type_config"], + "tdd_steps": [ + "Write test for OpenRaftStateMachine initialization", + "Write test for wrapper holding Arc>", + "Create OpenRaftStateMachine struct", + "Implement basic delegation methods", + "Test wrapper compiles and links correctly" + ], + "acceptance_criteria": [ + "OpenRaftStateMachine wraps existing StateMachine", + "Wrapper uses Arc> for thread safety", + "Initialization test passes", + "Compiles without errors" + ], + "implementation_notes": [ + "Store inner: Arc>", + "Prepare for async RaftStateMachine trait impl", + "Keep existing StateMachine untouched" + ] + }, + + "implement_apply": { + "id": "state_machine_2", + "name": "Implement apply() with Idempotency", + "phase": "state_machine", + "order": 2, + "estimated_hours": "1-1.5", + "description": "Implement RaftStateMachine::apply() delegating to StateMachine with idempotency preservation", + "files": ["crates/raft/src/state_machine_wrapper.rs"], + "dependencies": ["create_wrapper"], + "tdd_steps": [ + "Write test verifying apply() rejects entries with index <= 
last_applied", + "Write test for apply() accepting entries with index > last_applied", + "Write test for apply() processing multiple entries in order", + "Implement apply() iterating over entries and calling StateMachine::apply()", + "Verify idempotency check preserved (delegated to StateMachine)", + "Test response collection and error handling" + ], + "acceptance_criteria": [ + "apply() preserves idempotency (index > last_applied)", + "Entries applied in order", + "Responses collected correctly", + "Out-of-order entries rejected", + "Duplicate entries rejected" + ], + "implementation_notes": [ + "Iterate: for entry in entries { ... }", + "Call self.inner.write().unwrap().apply(entry.log_id.index, &entry.payload.data)", + "Idempotency check is inside StateMachine::apply()", + "Collect Response { result } for each entry" + ] + }, + + "implement_snapshot": { + "id": "state_machine_3", + "name": "Implement Snapshot Methods", + "phase": "state_machine", + "order": 3, + "estimated_hours": "0.5-1", + "description": "Implement snapshot creation and restoration for state machine", + "files": ["crates/raft/src/state_machine_wrapper.rs"], + "dependencies": ["implement_apply"], + "tdd_steps": [ + "Write test for get_current_snapshot() creating snapshot", + "Write test for install_snapshot() restoring state", + "Write test for round-trip snapshot/restore", + "Implement snapshot creation via StateMachine::snapshot()", + "Implement snapshot restoration via StateMachine::restore()", + "Test with bincode serialization" + ], + "acceptance_criteria": [ + "get_current_snapshot() creates valid snapshot", + "install_snapshot() restores state correctly", + "Round-trip preserves all state machine data", + "Bincode serialization works correctly" + ], + "implementation_notes": [ + "snapshot() returns self.inner.read().unwrap().snapshot()", + "restore() calls self.inner.write().unwrap().restore(snapshot)", + "Use existing bincode serialization from StateMachine" + ] + }, + + 
"test_idempotency": { + "id": "state_machine_4", + "name": "Comprehensive Idempotency Tests", + "phase": "state_machine", + "order": 4, + "estimated_hours": "0.5-1", + "description": "Add comprehensive tests verifying idempotency guarantees", + "files": ["crates/raft/src/state_machine_wrapper.rs"], + "dependencies": ["implement_snapshot"], + "tdd_steps": [ + "Write test applying same entry twice (should reject second)", + "Write test applying entries out of order (should reject)", + "Write test for gap in indices (should accept after gap)", + "Write test verifying last_applied tracking", + "Test idempotency after snapshot restoration" + ], + "acceptance_criteria": [ + "Duplicate entries rejected", + "Out-of-order entries rejected", + "last_applied tracked correctly", + "Idempotency preserved after snapshot restore", + "All idempotency guarantees verified" + ], + "implementation_notes": [ + "Test with sequential indices: 1, 2, 3", + "Test duplicate: 1, 2, 2 (reject third)", + "Test out-of-order: 1, 3, 2 (reject third)", + "Verify StateMachine::apply() logic enforces this" + ] + }, + + "define_stub_network": { + "id": "network_stub_1", + "name": "Define StubNetwork Struct", + "phase": "network_stub", + "order": 1, + "estimated_hours": "0.25-0.5", + "description": "Define StubNetwork struct for RaftNetwork trait implementation", + "files": ["crates/raft/src/network_stub.rs"], + "dependencies": ["define_type_config"], + "tdd_steps": [ + "Write test for StubNetwork creation", + "Create StubNetwork struct with node_id field", + "Add new() constructor", + "Add basic tracing instrumentation" + ], + "acceptance_criteria": [ + "StubNetwork compiles", + "new() constructor works", + "Basic logging in place" + ], + "implementation_notes": [ + "Simple struct: { node_id: u64 }", + "Add tracing::info in new()", + "Prepare for async RaftNetwork trait" + ] + }, + + "implement_send_methods": { + "id": "network_stub_2", + "name": "Implement RaftNetwork Trait", + "phase": 
"network_stub", + "order": 2, + "estimated_hours": "0.5-1", + "description": "Implement RaftNetwork trait with no-op methods returning Ok", + "files": ["crates/raft/src/network_stub.rs"], + "dependencies": ["define_stub_network"], + "tdd_steps": [ + "Write test for send_append_entries() returning Ok", + "Write test for send_vote() returning Ok", + "Write test for send_install_snapshot() returning Ok", + "Implement RaftNetwork trait with #[async_trait]", + "Add tracing to each method showing it's a stub", + "Return Ok(Default::default()) for all methods" + ], + "acceptance_criteria": [ + "RaftNetwork trait implemented", + "All methods return Ok without panic", + "Tracing shows stub calls", + "Tests verify no-op behavior" + ], + "implementation_notes": [ + "Use #[async_trait] for trait implementation", + "Log at debug level: tracing::debug!(\"StubNetwork: ...\")", + "Return Ok(AppendEntriesResponse::default()), etc.", + "Add TODO comments for future gRPC integration" + ] + }, + + "test_stub": { + "id": "network_stub_3", + "name": "Test Stub Network", + "phase": "network_stub", + "order": 3, + "estimated_hours": "0.25-0.5", + "description": "Test stub network behavior and tracing", + "files": ["crates/raft/src/network_stub.rs"], + "dependencies": ["implement_send_methods"], + "tdd_steps": [ + "Write test verifying no panics on send calls", + "Write test checking tracing output (using tracing-subscriber-test)", + "Write test for concurrent send calls", + "Verify all network methods callable" + ], + "acceptance_criteria": [ + "No panics on any send method", + "Tracing output verified", + "Concurrent calls work", + "All tests pass" + ], + "implementation_notes": [ + "Use tokio::test for async tests", + "Verify Ok() responses", + "Check tracing with subscriber test utilities" + ] + }, + + "update_dependencies": { + "id": "node_migration_1", + "name": "Update Cargo Dependencies", + "phase": "node_migration", + "order": 1, + "estimated_hours": "0.5-1", + "description": 
"Update Cargo.toml to remove raft-rs and add openraft", + "files": ["crates/raft/Cargo.toml", "crates/storage/Cargo.toml"], + "dependencies": ["storage_layer", "state_machine", "network_stub"], + "tdd_steps": [ + "Remove raft = \"0.7\" dependency", + "Remove prost-old dependency", + "Remove slog dependency", + "Add openraft = { version = \"0.10\", features = [\"tokio\"] }", + "Add async-trait = \"0.1\"", + "Add tracing = \"0.1\"", + "Run cargo tree | grep prost to verify conflict resolved", + "Run cargo build to verify compilation" + ], + "acceptance_criteria": [ + "cargo tree shows single prost version (0.14)", + "No prost version conflicts", + "cargo build succeeds", + "All dependencies compatible" + ], + "implementation_notes": [ + "Keep tokio, serde, bincode, tonic (0.14), prost (0.14)", + "Remove all raft-rs related dependencies", + "Verify openraft uses prost 0.14 (matching tonic 0.14)" + ] + }, + + "migrate_initialization": { + "id": "node_migration_2", + "name": "Migrate RaftNode Initialization", + "phase": "node_migration", + "order": 2, + "estimated_hours": "1-1.5", + "description": "Migrate RaftNode::new() to use openraft::Raft", + "files": ["crates/raft/src/node.rs"], + "dependencies": ["update_dependencies"], + "tdd_steps": [ + "Write async test for RaftNode::new() initialization", + "Update RaftNode struct to hold openraft::Raft", + "Create openraft Config with election/heartbeat timeouts", + "Implement async new() creating Raft instance", + "Test initialization with single node", + "Test initialization with multiple peers" + ], + "acceptance_criteria": [ + "RaftNode::new() is async", + "Creates openraft::Raft instance successfully", + "Config parameters match existing values", + "Initialization tests pass" + ], + "implementation_notes": [ + "Config { election_timeout_min: 150, election_timeout_max: 300, heartbeat_interval: 50 }", + "Use Raft::new(id, Arc::new(config), network, storage).await?", + "Store raft: Raft in struct" + ] + }, + + 
"migrate_propose": { + "id": "node_migration_3", + "name": "Migrate propose() to client_write()", + "phase": "node_migration", + "order": 3, + "estimated_hours": "1-1.5", + "description": "Migrate propose() method to use openraft client_write()", + "files": ["crates/raft/src/node.rs"], + "dependencies": ["migrate_initialization"], + "tdd_steps": [ + "Write async test for propose() submitting request", + "Write async test for propose() handling response", + "Update propose() to be async fn", + "Implement using raft.client_write(ClientWriteRequest::new(Request { data }))", + "Test successful proposal", + "Test proposal on non-leader (should fail or forward)" + ], + "acceptance_criteria": [ + "propose() is async", + "Uses client_write() correctly", + "Returns result properly", + "Tests verify leader handling" + ], + "implementation_notes": [ + "Signature: async fn propose(&self, data: Vec) -> Result<()>", + "Create request: ClientWriteRequest::new(Request { data })", + "Call: self.raft.client_write(request).await?", + "Handle ClientWriteResponse" + ] + }, + + "migrate_api": { + "id": "node_migration_4", + "name": "Migrate Remaining API Methods", + "phase": "node_migration", + "order": 4, + "estimated_hours": "1-1.5", + "description": "Migrate is_leader(), leader_id(), get() to openraft APIs", + "files": ["crates/raft/src/node.rs"], + "dependencies": ["migrate_propose"], + "tdd_steps": [ + "Write async test for is_leader() using metrics()", + "Write async test for leader_id() using current_leader()", + "Write test for get() direct state machine access", + "Implement is_leader() via self.raft.is_leader().await", + "Implement leader_id() via self.raft.current_leader().await", + "Update get() to access storage.state_machine directly", + "Remove tick() and handle_ready() methods (no longer needed)" + ], + "acceptance_criteria": [ + "is_leader() works correctly", + "leader_id() returns correct node ID or None", + "get() reads from state machine", + "tick() and 
handle_ready() removed", + "All API tests pass" + ], + "implementation_notes": [ + "is_leader(): self.raft.is_leader().await", + "leader_id(): self.raft.current_leader().await", + "get(): self.storage.state_machine.read().unwrap().get(key)", + "Remove tick/handle_ready logic - openraft handles internally" + ] + }, + + "migrate_node_tests": { + "id": "node_migration_5", + "name": "Migrate RaftNode Tests", + "phase": "node_migration", + "order": 5, + "estimated_hours": "1-1.5", + "description": "Migrate existing RaftNode tests to async openraft API", + "files": ["crates/raft/src/node.rs"], + "dependencies": ["migrate_api"], + "tdd_steps": [ + "Convert all node tests to async using #[tokio::test]", + "Remove tests for tick() and handle_ready()", + "Update propose tests to use client_write", + "Update leader election tests for openraft behavior", + "Fix any timing-related test issues", + "Verify all remaining tests pass" + ], + "acceptance_criteria": [ + "All node tests converted to async", + "Obsolete tests removed (tick, handle_ready)", + "propose → client_write tests working", + "All tests pass consistently" + ], + "implementation_notes": [ + "Use #[tokio::test] macro", + "Add .await to all async calls", + "Update test helpers to async fn", + "Remove synchronous tick/ready loop tests" + ] + }, + + "integration_tests": { + "id": "integration_1", + "name": "End-to-End Integration Tests", + "phase": "integration", + "order": 1, + "estimated_hours": "1-1.5", + "description": "Create comprehensive integration tests for full flow", + "files": ["crates/raft/tests/integration_tests.rs"], + "dependencies": ["migrate_node_tests"], + "tdd_steps": [ + "Write test for full propose → apply → get flow", + "Write test for snapshot creation and restoration", + "Write test for idempotency end-to-end", + "Write test for multiple proposals in sequence", + "Test state machine consistency after operations", + "Test error handling paths" + ], + "acceptance_criteria": [ + "Full flow tests 
pass", + "Snapshot round-trip works", + "Idempotency verified end-to-end", + "Error cases handled correctly", + "Integration tests stable and repeatable" + ], + "implementation_notes": [ + "Create test helpers: setup_test_node(), propose_and_verify()", + "Test with realistic data patterns", + "Verify state machine state matches expectations", + "Test concurrent operations if possible" + ] + }, + + "verify_prost_conflict_resolved": { + "id": "integration_2", + "name": "Verify Prost Conflict Resolved", + "phase": "integration", + "order": 2, + "estimated_hours": "0.25-0.5", + "description": "Verify that prost version conflict is completely resolved", + "files": [], + "dependencies": ["integration_tests"], + "tdd_steps": [ + "Run cargo tree | grep prost", + "Verify only prost 0.14 appears in tree", + "Check tonic compatibility (should use prost 0.14)", + "Verify openraft compatibility (should use prost 0.14)", + "Run cargo build --all-features to verify", + "Check for any warning about multiple prost versions" + ], + "acceptance_criteria": [ + "cargo tree shows single prost version (0.14)", + "No version conflict warnings", + "All dependencies use same prost version", + "Clean build with no conflicts" + ], + "implementation_notes": [ + "Document prost version in plan", + "Verify with: cargo tree | grep prost | sort | uniq", + "Check openraft's prost dependency matches tonic's" + ] + }, + + "cleanup_old_code": { + "id": "integration_3", + "name": "Remove raft-rs Code", + "phase": "integration", + "order": 3, + "estimated_hours": "0.5-1", + "description": "Remove all raft-rs specific code and references", + "files": ["crates/raft/src/", "crates/storage/src/"], + "dependencies": ["verify_prost_conflict_resolved"], + "tdd_steps": [ + "Search codebase for 'use raft::' imports", + "Remove old raft::Storage trait implementation", + "Remove eraftpb imports and conversions (if any remain)", + "Remove slog-related code", + "Search for RawNode references", + "Remove any dead 
code from migration", + "Run cargo clippy to find unused imports" + ], + "acceptance_criteria": [ + "No raft-rs references in code", + "No eraftpb imports", + "No slog imports", + "No unused imports or dead code", + "cargo clippy passes cleanly" + ], + "implementation_notes": [ + "Search: rg 'use raft::' --type rust", + "Search: rg 'eraftpb' --type rust", + "Search: rg 'RawNode' --type rust", + "Remove old MemStorage raft::Storage impl if separate file" + ] + }, + + "update_docs": { + "id": "integration_4", + "name": "Update Documentation", + "phase": "integration", + "order": 4, + "estimated_hours": "0.5-1", + "description": "Update all documentation to reflect openraft migration", + "files": [ + "crates/raft/README.md", + "crates/storage/README.md", + "docs/architecture/crates.md" + ], + "dependencies": ["cleanup_old_code"], + "tdd_steps": [ + "Update crates/raft/README.md to mention openraft", + "Update crates/storage/README.md with OpenRaftMemStorage", + "Update module-level doc comments in lib.rs files", + "Update examples if any exist", + "Update docs/architecture/crates.md if needed", + "Remove references to raft-rs from comments" + ], + "acceptance_criteria": [ + "All README files updated", + "Module docs mention openraft", + "No raft-rs references in docs", + "Examples (if any) work with openraft", + "Architecture docs reflect new structure" + ], + "implementation_notes": [ + "Update dependency list in README", + "Update code examples to show async usage", + "Document breaking changes (async APIs)", + "Note prost conflict resolution" + ] + } + }, + + "dependency_graph": { + "description": "Task dependencies showing execution order", + "parallel_phases": { + "after_type_system": [ + "storage_layer (tasks 4-7)", + "state_machine (tasks 8-11)", + "network_stub (tasks 12-14)" + ], + "note": "Storage, state machine, and network can be developed in parallel after type system is complete" + }, + "critical_path": [ + "type_system (tasks 1-3)", + "storage_layer 
(tasks 4-7)", + "node_migration (tasks 15-19)", + "integration (tasks 20-24)" + ] + }, + + "validation_checklist": { + "functional": [ + "Single-node cluster starts successfully", + "Proposals accepted and applied via client_write()", + "State machine maintains consistency", + "Get operations return correct values", + "Leader election works automatically", + "85+ tests passing" + ], + "technical": [ + "No prost version conflicts (cargo tree verified)", + "Clean compilation with zero warnings", + "All crates compile together", + "No runtime panics in tests", + "Async runtime stable (no deadlocks)" + ], + "migration_specific": [ + "MemStorage remains in-memory only", + "No RocksDB dependencies added", + "StateMachine idempotency preserved", + "Network transport properly stubbed for future gRPC", + "All raft-rs code removed" + ] + }, + + "risk_mitigation": { + "async_complexity": { + "risk": "Converting sync code to async increases complexity", + "impact": "Medium", + "mitigation": "Use tokio::runtime::Handle for sync contexts, avoid blocking", + "affected_tasks": ["migrate_initialization", "migrate_propose", "migrate_api"] + }, + "api_compatibility": { + "risk": "Breaking changes affect dependent crates", + "impact": "High", + "mitigation": "Document all breaking changes, create migration guide", + "affected_tasks": ["node_migration"] + }, + "test_coverage": { + "risk": "Lost test coverage during migration", + "impact": "Medium", + "mitigation": "Track test count per phase, verify 85+ tests remain", + "affected_tasks": ["migrate_storage_tests", "migrate_node_tests"] + }, + "idempotency": { + "risk": "Loss of idempotency guarantees in state machine", + "impact": "High", + "mitigation": "Keep existing StateMachine::apply() logic unchanged", + "affected_tasks": ["implement_apply", "test_idempotency"] + } + }, + + "success_metrics": { + "completion": { + "tasks_completed": 0, + "tasks_total": 24, + "phases_completed": 0, + "phases_total": 6 + }, + "quality": { + 
"tests_passing": "TBD (target: 85+)", + "prost_versions": "TBD (target: 1)", + "clippy_warnings": "TBD (target: 0)", + "compilation_errors": "TBD (target: 0)" + }, + "effort": { + "estimated_hours_total": "15-21", + "actual_hours": "TBD" + } + } +} diff --git a/docs/specs/openraft/requirements.json b/docs/specs/openraft/requirements.json new file mode 100644 index 0000000..bac5535 --- /dev/null +++ b/docs/specs/openraft/requirements.json @@ -0,0 +1,29 @@ +{ + "raw_user_story": "As a Seshat developer, I want to migrate from raft-rs to openraft so that we eliminate outdated transitive prost dependencies and gain a better-maintained Raft implementation", + "raw_criteria": [ + "openraft replaces raft-rs removing the outdated transitive prost dependency", + "All existing Raft functionality (leader election, log replication) works with openraft", + "RocksDB storage backend integrates with openraft's storage trait" + ], + "raw_rules": [ + "Must maintain compatibility with existing storage layer (6 RocksDB column families)", + "Must support gRPC transport for inter-node communication", + "Must preserve log compaction and snapshot capabilities" + ], + "raw_scope": { + "included": [ + "Replace raft-rs dependency with openraft", + "Implement openraft storage trait using RocksDB", + "Implement gRPC transport layer for inter-node communication", + "Update raft/ crate implementation", + "Replace in-memory KV with RocksDB-backed storage" + ], + "excluded": [ + "Changes to KV service layer or RESP protocol", + "New Raft features beyond current raft-rs functionality", + "Performance optimizations beyond maintaining current benchmarks", + "Storage schema changes" + ] + }, + "context_notes": "Current implementation uses in-memory KV store. RESP protocol not yet connected to Raft. Need to implement both gRPC inter-node communication and RocksDB persistence as part of migration." 
+} diff --git a/docs/specs/openraft/spec-lite.md b/docs/specs/openraft/spec-lite.md new file mode 100644 index 0000000..f79882f --- /dev/null +++ b/docs/specs/openraft/spec-lite.md @@ -0,0 +1,87 @@ +# OpenRaft Migration - Specification Summary + +## Overview +Replace raft-rs 0.7 with openraft to eliminate prost dependency conflicts (0.11→0.14) and modernize to a better-maintained Raft implementation. Keep MemStorage (in-memory), stub network transport, no KV integration yet. + +## User Story +As a **Seshat developer**, I want to **migrate from raft-rs to openraft** so that **we eliminate prost dependency conflicts and gain a better-maintained Raft implementation with cleaner APIs**. + +## Key Acceptance Criteria +1. **Dependency Resolution** - Eliminate prost 0.11, unify on prost 0.14 throughout codebase +2. **Storage Integration** - MemStorage works with openraft storage traits (in-memory only) +3. **State Machine** - Operations applied in correct order with strong consistency +4. **Test Migration** - All existing unit tests pass with openraft + +## Critical Business Rules +1. Eliminate prost version conflicts (primary motivation) +2. Keep MemStorage in-memory design (no RocksDB) +3. Stub network transport for future gRPC +4. No KV service integration in this phase +5. 
Maintain tracing/observability patterns + +## Scope + +**Included:** +- Replace raft-rs dependency with openraft +- Adapt MemStorage to implement openraft storage traits (in-memory) +- Migrate state machine to openraft API +- Update RaftNode wrapper (raft::RawNode → openraft::Raft) +- Remove prost 0.11, standardize on prost 0.14 +- Migrate unit tests to openraft +- Stub RaftNetwork trait for future gRPC + +**Excluded:** +- RocksDB persistent storage (future phase) +- Full gRPC transport implementation (future phase) +- KV service integration (future phase) +- RESP protocol integration (future phase) +- Integration/chaos tests (future phase) +- Performance benchmarking (future phase) +- Cluster formation modes (future phase) + +## Major Technical Changes + +### Interfaces +- **openraft storage traits** - Implement for MemStorage (in-memory) +- **openraft::RaftStateMachine** - Apply operations to in-memory state +- **openraft::RaftNetwork** - Stub for future gRPC transport +- **RaftNode wrapper** - Migrate from raft::RawNode to openraft::Raft + +### Integration Points +- raft → storage (MemStorage for in-memory log/state) +- raft → common (shared types: NodeId, Error) + +### Implementation Phases (10-15 hours) +1. Dependency replacement (1-2h) +2. MemStorage adaptation to openraft storage traits (2-3h) +3. RaftStateMachine trait implementation (2-3h) +4. Stub RaftNetwork (1h) +5. RaftNode wrapper migration (2-3h) +6. 
Unit test migration (2-3h) + +## Dependencies & Conflicts + +**Dependencies:** +- seshat-storage (MemStorage in-memory) +- seshat-common (NodeId, Error types) +- openraft (external), tokio, tracing + +**Conflicts:** +- raft::RawNode API → openraft::Raft API (wrapper updates) +- Prost 0.11 vs 0.14 (resolved by migration) +- MemStorage needs adaptation to openraft traits +- Test fixtures need migration + +## Success Metrics +- [ ] Zero prost conflicts in `cargo tree` +- [ ] All unit tests pass +- [ ] MemStorage works with openraft +- [ ] RaftNode wrapper functional +- [ ] Code compiles without raft-rs + +## Next Action +Run `/spec:design openraft` to create detailed technical design. + +--- +**Feature:** openraft | **Phase:** 1 (MVP Preparation) | **Priority:** HIGH +**Effort:** 10-15 hours | **Focus:** Dependency swap + MemStorage adaptation diff --git a/docs/specs/openraft/spec.json b/docs/specs/openraft/spec.json new file mode 100644 index 0000000..f329c39 --- /dev/null +++ b/docs/specs/openraft/spec.json @@ -0,0 +1,105 @@ +{ + "feature": "openraft", + "user_story": "As a Seshat developer, I want to migrate from raft-rs to openraft so that we eliminate outdated transitive prost dependencies and gain a better-maintained Raft implementation with cleaner APIs", + "acceptance_criteria": [ + "GIVEN existing raft-rs 0.7 dependency with prost-codec feature WHEN replaced with openraft THEN transitive prost 0.11 dependency is eliminated and unified prost 0.14 is used throughout", + "GIVEN openraft storage trait implementation WHEN integrated with MemStorage backend THEN storage operations (log entries, hard state, snapshots) work correctly in-memory", + "GIVEN openraft state machine implementation WHEN operations are proposed THEN operations are applied in correct order with strong consistency guarantees", + "GIVEN existing unit tests WHEN migrated to openraft THEN all tests pass with equivalent or better coverage" + ], + "business_rules": [ + "Must eliminate prost version 
conflicts between raft library (0.11) and transport layer (0.14) - this is the primary motivation for the migration", + "Must maintain existing MemStorage in-memory design (no persistent storage implementation)", + "Must maintain existing logging and observability patterns using tracing crate with structured logging", + "Transport layer should have stub/placeholder implementation for future gRPC integration", + "No integration with KV service layer in this phase - focus on core migration only" + ], + "scope": { + "included": [ + "Replace raft-rs 0.7 dependency with openraft in Cargo.toml (workspace-level change)", + "Implement openraft storage traits using existing MemStorage (in-memory)", + "Migrate state machine from raft-rs RawNode API to openraft API", + "Update RaftNode wrapper to use openraft::Raft instead of raft::RawNode", + "Remove prost 0.11 dependency and standardize on prost 0.14 throughout codebase", + "Update all unit tests in raft crate to work with openraft APIs", + "Add stub/placeholder for network transport (RaftNetwork trait)", + "Add tracing instrumentation for openraft operations (leader election, log replication)" + ], + "excluded": [ + "RocksDB persistent storage implementation (future phase)", + "Full gRPC transport layer implementation (future phase)", + "Connection pooling and retry logic for network transport (future phase)", + "Integration with KV service layer (future phase)", + "RESP protocol integration (future phase)", + "Snapshot creation and restoration with RocksDB checkpoints (future phase)", + "Integration tests for 2-node and 3-node clusters (future phase)", + "Chaos testing implementation (future phase)", + "Performance benchmarking and optimization (future phase)", + "Changes to seshat main binary orchestration (separate task)", + "Bootstrap/join cluster formation modes (future phase)", + "Multi-shard cluster support (Phase 2 feature)", + "Dynamic cluster membership changes (Phase 3 feature)", + "Advanced observability 
features like OpenTelemetry (Phase 4 feature)", + "SQL interface support (Phase 5 feature)" + ] + }, + "aligns_with": "Phase 1 MVP preparation - eliminates technical debt (prost version conflicts) and modernizes to better-maintained Raft library as foundation for future persistent storage and network transport implementation.", + "dependencies": [ + "storage crate (seshat-storage) - MemStorage in-memory implementation", + "common crate (seshat-common) - shared types (NodeId, Error)", + "openraft crate (external dependency) - Raft consensus library to replace raft-rs", + "tokio 1.x - Async runtime", + "tracing crate - Structured logging for observability" + ], + "conflicts": [ + "Existing raft-rs RawNode API differs from openraft::Raft API - requires wrapper updates in RaftNode", + "Current prost 0.11 dependency (from raft-rs eraftpb) conflicts with tonic 0.14 requiring prost 0.14 - this migration resolves the conflict", + "Test mocks and fixtures using raft-rs types need migration to openraft equivalents", + "MemStorage will need adaptation to openraft's storage trait interface" + ], + "technical_details": { + "interfaces_affected": [ + "openraft storage traits - must be implemented for MemStorage backend", + "openraft::RaftStateMachine trait - applies committed operations to in-memory state", + "openraft::RaftNetwork trait - stub implementation for future gRPC transport", + "RaftNode wrapper struct - changes from raft::RawNode to openraft::Raft", + "Storage trait methods - must map to openraft storage requirements" + ], + "integration_points": [ + "raft crate → storage crate: MemStorage for in-memory log and state storage", + "raft crate → common crate: Use shared types (NodeId, Error) throughout" + ], + "testing_requirements": [ + "Unit tests for openraft storage trait implementation with MemStorage", + "Unit tests for state machine applying operations correctly", + "Unit tests for RaftNode wrapper with openraft::Raft", + "Property tests for entry 
serialization/deserialization round-trips" + ], + "data_migrations": [ + "No data migrations required - keeping in-memory MemStorage", + "MemStorage API needs adaptation to openraft traits" + ], + "observability_requirements": [ + "Add tracing spans for leader election with node_id and term fields", + "Add tracing spans for log replication with entry count and commit index", + "Log state machine operations at DEBUG level with operation type", + "Use tracing::instrument macro on key RaftNode methods", + "Ensure all errors include context for debugging (use thiserror with context)" + ], + "implementation_phases": [ + "Phase 1: Replace raft-rs dependency, update Cargo.toml, resolve prost conflicts (1-2 hours)", + "Phase 2: Adapt MemStorage to implement openraft storage traits (2-3 hours)", + "Phase 3: Implement openraft::RaftStateMachine trait for in-memory operations (2-3 hours)", + "Phase 4: Create stub RaftNetwork implementation (1 hour)", + "Phase 5: Update RaftNode wrapper to use openraft::Raft API (2-3 hours)", + "Phase 6: Migrate unit tests to openraft equivalents, ensure all pass (2-3 hours)", + "Total estimated effort: 10-15 hours" + ], + "risk_mitigation": [ + "Risk: openraft API significantly different from raft-rs → Mitigation: Review openraft examples and docs before implementation, create prototype wrapper", + "Risk: MemStorage incompatible with openraft traits → Mitigation: Study openraft storage trait requirements, adapt incrementally with tests", + "Risk: Prost version conflicts persist → Mitigation: Verify openraft uses prost 0.12+ and is compatible with tonic 0.14", + "Risk: Tests fail after migration → Mitigation: Migrate tests incrementally, maintain test coverage throughout" + ] + } +} diff --git a/docs/specs/openraft/spec.md b/docs/specs/openraft/spec.md new file mode 100644 index 0000000..852d244 --- /dev/null +++ b/docs/specs/openraft/spec.md @@ -0,0 +1,167 @@ +# Feature Specification: OpenRaft Migration + +## Overview + +Migrate Seshat's 
consensus layer from `raft-rs 0.7` to `openraft` to eliminate transitive prost dependency conflicts (0.11 vs 0.14) and gain a better-maintained Raft implementation with cleaner trait APIs. + +**Current State:** Phase 1 has MemStorage implementation complete (storage layer) using raft-rs with in-memory storage. RaftNode wrapper and StateMachine exist but use synchronous raft-rs APIs. RESP protocol (100% complete) not yet connected to Raft. + +**Target State:** Fully functional openraft-based consensus with MemStorage (in-memory), stub inter-node communication, and no integration with KV service layer. + +## User Story + +As a **Seshat developer**, I want to **migrate from raft-rs to openraft** so that **we eliminate outdated transitive prost dependencies and gain a better-maintained Raft implementation with cleaner APIs**. + +## Acceptance Criteria + +- [ ] **AC1: Dependency Resolution** - GIVEN existing raft-rs 0.7 dependency with prost-codec feature WHEN replaced with openraft THEN transitive prost 0.11 dependency is eliminated and unified prost 0.14 is used throughout +- [ ] **AC2: Storage Integration** - GIVEN openraft storage trait implementation WHEN integrated with MemStorage backend THEN storage operations (log entries, hard state, snapshots) work correctly in-memory +- [ ] **AC3: State Machine Operations** - GIVEN openraft state machine implementation WHEN operations are proposed THEN operations are applied in correct order with strong consistency guarantees +- [ ] **AC4: Test Migration** - GIVEN existing unit tests WHEN migrated to openraft THEN all tests pass with equivalent or better coverage + +## Business Rules + +1. **Primary Migration Motivation** - Must eliminate prost version conflicts between raft library (0.11) and transport layer (0.14) +2. **Storage Design** - Must maintain existing MemStorage in-memory design (no persistent storage implementation) +3. 
**Serialization Strategy** - Use protobuf (prost) for all serialization (storage + network) - single format throughout, same as network layer +4. **Observability Continuity** - Must maintain existing logging and observability patterns using tracing crate with structured logging +5. **Transport Stub** - Transport layer should have stub/placeholder implementation for future gRPC integration +6. **No KV Integration** - No integration with KV service layer in this phase - focus on core migration only + +## Scope + +### Included + +1. Replace raft-rs 0.7 dependency with openraft in Cargo.toml (workspace-level change) +2. Implement openraft storage traits using existing MemStorage (in-memory) +3. Migrate state machine from raft-rs RawNode API to openraft API +4. Update RaftNode wrapper to use `openraft::Raft` instead of `raft::RawNode` +5. Remove prost 0.11 dependency and standardize on prost 0.14 throughout codebase +6. **Define protobuf schemas** for storage types (LogEntry, HardState, Snapshot metadata) - use prost for encoding/decoding +7. **Remove bincode dependency** entirely - replaced by protobuf for all serialization +8. Update all unit tests in raft crate to work with openraft APIs +9. Add stub/placeholder for network transport (RaftNetwork trait) +10. Add tracing instrumentation for openraft operations (leader election, log replication) + +### Excluded + +1. RocksDB persistent storage implementation (future phase) +2. Full gRPC transport layer implementation (future phase) +3. Connection pooling and retry logic for network transport (future phase) +4. Integration with KV service layer (future phase) +5. RESP protocol integration (future phase) +6. Snapshot creation and restoration with RocksDB checkpoints (future phase) +7. Integration tests for 2-node and 3-node clusters (future phase) +8. Chaos testing implementation (future phase) +9. Performance benchmarking and optimization (future phase) +10. 
Changes to seshat main binary orchestration (separate task) +11. Bootstrap/join cluster formation modes (future phase) +12. Multi-shard cluster support (Phase 2 feature) +13. Dynamic cluster membership changes (Phase 3 feature) +14. Advanced observability features like OpenTelemetry (Phase 4 feature) +15. SQL interface support (Phase 5 feature) + +## Technical Details + +### Interfaces Affected + +1. **openraft storage traits** - Must be implemented for MemStorage backend (in-memory) +2. **`openraft::RaftStateMachine` trait** - Applies committed operations to in-memory state +3. **`openraft::RaftNetwork` trait** - Stub implementation for future gRPC transport +4. **`RaftNode` wrapper struct** - Changes from `raft::RawNode` to `openraft::Raft` +5. **Storage trait methods** - Must map to openraft storage requirements + +### Integration Points + +1. **raft crate → storage crate** - MemStorage for in-memory log and state storage +2. **raft crate → common crate** - Use shared types (NodeId, Error) throughout + +### Testing Requirements + +1. Unit tests for openraft storage trait implementation with MemStorage +2. Unit tests for state machine applying operations correctly +3. Unit tests for RaftNode wrapper with openraft::Raft +4. 
Property tests for entry serialization/deserialization round-trips + +### Implementation Phases + +| Phase | Description | Estimated Time | +|-------|-------------|----------------| +| 1 | Replace raft-rs dependency, update Cargo.toml, resolve prost conflicts | 1-2 hours | +| 2 | Adapt MemStorage to implement openraft storage traits | 2-3 hours | +| 3 | Implement `openraft::RaftStateMachine` trait for in-memory operations | 2-3 hours | +| 4 | Create stub RaftNetwork implementation | 1 hour | +| 5 | Update RaftNode wrapper to use `openraft::Raft` API | 2-3 hours | +| 6 | Migrate unit tests to openraft equivalents, ensure all pass | 2-3 hours | +| **Total** | | **10-15 hours** | + +### Risk Mitigation + +| Risk | Mitigation | +|------|------------| +| openraft API significantly different from raft-rs | Review openraft examples and docs before implementation, create prototype wrapper | +| MemStorage incompatible with openraft traits | Study openraft storage trait requirements, adapt incrementally with tests | +| Prost version conflicts persist | Verify openraft uses prost 0.12+ and is compatible with tonic 0.14 | +| Tests fail after migration | Migrate tests incrementally, maintain test coverage throughout | + +### Observability Requirements + +- Add tracing spans for leader election with node_id and term fields +- Add tracing spans for log replication with entry count and commit index +- Log state machine operations at DEBUG level with operation type +- Use `tracing::instrument` macro on key RaftNode methods +- Ensure all errors include context for debugging (use `thiserror` with context) + +## Dependencies + +1. **seshat-storage** - MemStorage in-memory implementation +2. **seshat-common** - Shared types (NodeId, Error) +3. **openraft** (external) - Raft consensus library to replace raft-rs +4. **prost 0.14** - Protobuf serialization for storage and network (unified format) +5. **tokio 1.x** - Async runtime +6. 
**tracing** - Structured logging for observability + +**Dependencies to REMOVE:** +- **bincode** - Replaced by protobuf for all serialization +- **raft-rs 0.7** - Replaced by openraft + +## Conflicts & Resolution + +| Conflict | Resolution | +|----------|-----------| +| raft-rs RawNode API differs from openraft::Raft API | Update RaftNode wrapper to adapt between openraft and existing interfaces | +| Prost 0.11 (raft-rs) conflicts with prost 0.14 (tonic) | Migration to openraft eliminates this conflict - openraft uses compatible prost version | +| Test mocks using raft-rs types | Migrate to openraft equivalents, may require new test fixtures | +| MemStorage needs adaptation | Implement openraft storage traits for MemStorage | + +## Alignment + +This feature aligns with **Phase 1 MVP preparation** by eliminating technical debt (prost version conflicts) and modernizing to a better-maintained Raft library. This establishes the foundation for future persistent storage and network transport implementation. + +**Addresses immediate technical debt:** Prost version conflicts blocking modern dependency usage + +**Establishes foundation for:** RocksDB persistence (future), gRPC transport (future), KV integration (future), Phase 2+ features + +## Success Metrics + +- [ ] Zero prost dependency conflicts in `cargo tree` +- [ ] All existing raft unit tests pass with openraft +- [ ] MemStorage works correctly with openraft storage traits +- [ ] RaftNode wrapper functions with openraft::Raft +- [ ] Code compiles without raft-rs dependency + +## Next Steps + +1. **Review this specification** - Ensure simplified scope is aligned with goals +2. **Create technical design** - Run `/spec:design openraft` to generate detailed architecture +3. **Generate implementation tasks** - Run `/spec:plan openraft` to break down work into dependency-ordered tasks +4. **Begin implementation** - Run `/spec:implement openraft` to start TDD-based development +5. 
**Track progress** - Use `/spec:progress openraft` to monitor task completion + +--- + +**Created:** 2025-10-25 +**Feature:** openraft +**Phase:** 1 (MVP Preparation) +**Priority:** HIGH (eliminates technical debt) +**Estimated Effort:** 10-15 hours diff --git a/docs/specs/openraft/tasks.md b/docs/specs/openraft/tasks.md new file mode 100644 index 0000000..ded4cf5 --- /dev/null +++ b/docs/specs/openraft/tasks.md @@ -0,0 +1,995 @@ +# Implementation Tasks: OpenRaft Migration + +## Overview + +This migration replaces `raft-rs` with `openraft` in the Seshat distributed key-value store. The implementation is **in-memory only** - no RocksDB integration, no full gRPC transport implementation. The network layer will be a stub for future development. + +**Scope:** +- Migrate from `raft-rs` (0.7) to `openraft` (0.10) +- Resolve prost version conflict (0.11 vs 0.14) +- Preserve existing StateMachine idempotency guarantees +- Maintain the existing 85+ tests passing +- Convert synchronous API to async + +**Estimated Effort:** +- Single-agent: 15-21 hours +- Multi-agent (3 agents): 12-16 hours with parallel execution + +**Total:** 6 phases, 24 tasks + +--- + +## Execution Strategy + +### Critical Path +The minimum sequential path through the migration: + +1. **Phase 1: Type System** (2-3 hours) - Foundation for all other work +2. **Phase 2: Storage Layer** (4-5 hours) - Required by Node Migration +3. **Phase 5: Node Migration** (4-5 hours) - Core migration work +4. 
**Phase 6: Integration** (2-3 hours) - Final validation and cleanup + +**Total critical path:** 12-16 hours + +### Parallel Execution +After Phase 1 completes, three independent tracks can run concurrently: + +**Agent 1: Storage Layer** (Phase 2) +- Task 2.1-2.4: Implement RaftLogReader, RaftSnapshotBuilder, RaftStorage +- Duration: 4-5 hours + +**Agent 2: State Machine** (Phase 3) +- Task 3.1-3.4: Wrap StateMachine, implement apply(), snapshot methods +- Duration: 2-3 hours + +**Agent 3: Network Stub** (Phase 4) +- Task 4.1-4.3: Create minimal RaftNetwork implementation +- Duration: 1-2 hours + +After these converge, all agents work on Phase 5 (Node Migration) and Phase 6 (Integration). + +### Multi-Agent Workflow +**Optimal 3-agent approach:** + +``` +Hour 0-3: All → Phase 1 (Type System) +Hour 3-8: Agent 1 → Phase 2 (Storage) + Agent 2 → Phase 3 (State Machine) + Agent 3 → Phase 4 (Network) → Wait for Phase 2/3 +Hour 8-13: All → Phase 5 (Node Migration) +Hour 13-16: All → Phase 6 (Integration) +``` + +This reduces total time from ~18 hours (sequential) to ~13-16 hours (parallel). + +--- + +## Phases + +### Phase 1: Type System & Configuration (2-3 hours) +**Dependencies:** None (start here!) +**Can run in parallel with:** Nothing (foundation for all other phases) + +#### Task 1.1: Define RaftTypeConfig +**ID:** type_system_1 +**Estimated Time:** 0.5-1 hour + +- [ ] **Test**: Write test for NodeId type (should be u64) +- [ ] **Test**: Write test for BasicNode struct construction +- [ ] **Test**: Write test for Request/Response types with serde +- [ ] **Implement**: Create RaftTypeConfig struct with all associated types +- [ ] **Refactor**: Verify compilation and type constraints + +**Files:** `crates/raft/src/types.rs`, `crates/raft/Cargo.toml` + +**Acceptance:** +- RaftTypeConfig implements openraft::RaftTypeConfig +- All associated types compile correctly (NodeId=u64, Node=BasicNode, etc.) 
+- Type construction tests pass + +**Notes:** +- NodeId = u64 (matches existing raft-rs) +- Node = BasicNode { addr: String } +- Entry = LogEntry\ +- SnapshotData = Vec\ +- AsyncRuntime = TokioRuntime + +--- + +#### Task 1.2: Create Type Conversions +**ID:** type_system_2 +**Estimated Time:** 1-1.5 hours + +- [ ] **Test**: Write test for eraftpb::Entry → LogEntry\ conversion +- [ ] **Test**: Write test for eraftpb::HardState → Vote + LogId conversion +- [ ] **Test**: Write test for eraftpb::ConfState → Membership conversion +- [ ] **Implement**: Implement From/Into traits for all conversions +- [ ] **Refactor**: Test edge cases (empty voters, max term values) + +**Files:** `crates/raft/src/types.rs` + +**Acceptance:** +- Entry conversion preserves index, term, data +- HardState splits into Vote and commit index correctly +- ConfState converts voters/learners to BTreeSet +- All conversion tests pass + +**Notes:** +- Use LogEntry::new(log_id, Request { data }) +- Extract Vote { term, node_id } from HardState +- Map ConfState.voters/learners to Membership + +--- + +#### Task 1.3: Property Test Conversions +**ID:** type_system_3 +**Estimated Time:** 0.5-1 hour + +- [ ] **Test**: Add proptest dependency to Cargo.toml +- [ ] **Test**: Write property test for Entry round-trip (openraft → eraftpb → openraft) +- [ ] **Test**: Write property test for HardState/Vote round-trip +- [ ] **Test**: Write property test for ConfState/Membership round-trip +- [ ] **Refactor**: Verify no data loss in conversions + +**Files:** `crates/raft/src/types.rs` + +**Acceptance:** +- Property tests pass for 1000+ random inputs +- Round-trip conversions preserve all data +- Edge cases handled (empty sets, u64::MAX) + +**Notes:** +- Use proptest for generating random valid types +- Test boundary values (0, u64::MAX) +- Verify no panics on malformed data + +--- + +### Phase 2: Storage Layer Migration (4-5 hours) +**Dependencies:** Phase 1 (Type System) +**Can run in parallel with:** Phase 3 (State 
Machine), Phase 4 (Network Stub) + +#### Task 2.1: Implement RaftLogReader +**ID:** storage_layer_1 +**Estimated Time:** 1.5-2 hours + +- [ ] **Test**: Write test for get_log_state() returning last_purged and last_log_id +- [ ] **Test**: Write test for try_get_log_entries() with range queries +- [ ] **Test**: Write test for read_vote() returning current vote state +- [ ] **Implement**: Create OpenRaftMemStorage struct with RwLock fields +- [ ] **Implement**: Implement RaftLogReader trait methods +- [ ] **Refactor**: Test concurrent read access + +**Files:** `crates/storage/src/openraft_storage.rs`, `crates/storage/src/lib.rs` + +**Acceptance:** +- get_log_state() returns correct LogState +- try_get_log_entries() handles ranges correctly +- read_vote() returns None initially, Some(vote) after save +- Concurrent reads don't deadlock + +**Notes:** +- Use RwLock\\>\> for log +- Calculate log state from BTreeMap keys/values +- Use RwLock\\>\> for vote storage + +--- + +#### Task 2.2: Implement RaftSnapshotBuilder +**ID:** storage_layer_2 +**Estimated Time:** 1-1.5 hours + +- [ ] **Test**: Write test for build_snapshot() creating valid Snapshot +- [ ] **Test**: Write test verifying snapshot includes state machine data +- [ ] **Test**: Write test for snapshot metadata (last_log_id, membership) +- [ ] **Implement**: Implement build_snapshot() delegating to StateMachine::snapshot() +- [ ] **Implement**: Wrap result in openraft Snapshot type +- [ ] **Refactor**: Test snapshot data integrity with bincode + +**Files:** `crates/storage/src/openraft_storage.rs` + +**Acceptance:** +- build_snapshot() creates Snapshot with correct metadata +- Snapshot data contains serialized state machine +- Snapshot can be deserialized correctly +- Multiple snapshots work correctly + +**Notes:** +- Call self.state_machine.read().unwrap().snapshot() +- Create SnapshotMeta with last_log_id and membership +- Store snapshot in RwLock\\>\> + +--- + +#### Task 2.3: Implement RaftStorage Trait +**ID:** 
storage_layer_3 +**Estimated Time:** 2-2.5 hours + +- [ ] **Test**: Write test for save_vote() persisting vote +- [ ] **Test**: Write test for append() adding entries to log +- [ ] **Test**: Write test for delete_conflict_logs_since() removing entries +- [ ] **Test**: Write test for purge_logs_upto() truncating old entries +- [ ] **Test**: Write test for apply_to_state_machine() applying entries +- [ ] **Test**: Write test for install_snapshot() restoring state +- [ ] **Implement**: Implement all RaftStorage methods +- [ ] **Refactor**: Test atomicity of operations + +**Files:** `crates/storage/src/openraft_storage.rs` + +**Acceptance:** +- save_vote() persists vote correctly +- append() maintains log order +- delete_conflict_logs_since() removes correct range +- purge_logs_upto() keeps required entries +- apply_to_state_machine() preserves idempotency +- install_snapshot() restores state correctly + +**Notes:** +- Maintain idempotency check: index > last_applied +- Use BTreeMap::split_off for efficient range operations +- Delegate state machine apply to StateMachine::apply() +- Handle snapshot restoration via StateMachine::restore() + +--- + +#### Task 2.4: Migrate Storage Tests +**ID:** storage_layer_4 +**Estimated Time:** 1-1.5 hours + +- [ ] **Test**: Convert all sync tests to async using #[tokio::test] +- [ ] **Test**: Update MemStorage API calls to OpenRaftMemStorage +- [ ] **Test**: Replace raft::Storage trait calls with openraft traits +- [ ] **Test**: Update assertions for openraft types +- [ ] **Refactor**: Verify all 85+ tests pass + +**Files:** `crates/storage/src/lib.rs` + +**Acceptance:** +- All storage tests converted to async +- 85+ tests passing with openraft +- Test coverage maintained or improved +- No flaky tests due to async timing + +**Notes:** +- Use tokio::test macro for async tests +- Update test helpers to be async fn +- Replace eraftpb types with openraft types +- Keep test logic/assertions identical + +--- + +### Phase 3: State Machine 
Integration (2-3 hours) +**Dependencies:** Phase 1 (Type System) +**Can run in parallel with:** Phase 2 (Storage Layer), Phase 4 (Network Stub) + +#### Task 3.1: Create StateMachine Wrapper +**ID:** state_machine_1 +**Estimated Time:** 0.5-1 hour + +- [ ] **Test**: Write test for OpenRaftStateMachine initialization +- [ ] **Test**: Write test for wrapper holding Arc\\> +- [ ] **Implement**: Create OpenRaftStateMachine struct +- [ ] **Implement**: Implement basic delegation methods +- [ ] **Refactor**: Test wrapper compiles and links correctly + +**Files:** `crates/raft/src/state_machine_wrapper.rs` + +**Acceptance:** +- OpenRaftStateMachine wraps existing StateMachine +- Wrapper uses Arc\\> for thread safety +- Initialization test passes +- Compiles without errors + +**Notes:** +- Store inner: Arc\\> +- Prepare for async RaftStateMachine trait impl +- Keep existing StateMachine untouched + +--- + +#### Task 3.2: Implement apply() with Idempotency +**ID:** state_machine_2 +**Estimated Time:** 1-1.5 hours + +- [ ] **Test**: Write test verifying apply() rejects entries with index <= last_applied +- [ ] **Test**: Write test for apply() accepting entries with index > last_applied +- [ ] **Test**: Write test for apply() processing multiple entries in order +- [ ] **Implement**: Implement apply() iterating over entries and calling StateMachine::apply() +- [ ] **Implement**: Verify idempotency check preserved (delegated to StateMachine) +- [ ] **Refactor**: Test response collection and error handling + +**Files:** `crates/raft/src/state_machine_wrapper.rs` + +**Acceptance:** +- apply() preserves idempotency (index > last_applied) +- Entries applied in order +- Responses collected correctly +- Out-of-order entries rejected +- Duplicate entries rejected + +**Notes:** +- Iterate: for entry in entries { ... 
} +- Call self.inner.write().unwrap().apply(entry.log_id.index, &entry.payload.data) +- Idempotency check is inside StateMachine::apply() +- Collect Response { result } for each entry + +--- + +#### Task 3.3: Implement Snapshot Methods +**ID:** state_machine_3 +**Estimated Time:** 0.5-1 hour + +- [ ] **Test**: Write test for get_current_snapshot() creating snapshot +- [ ] **Test**: Write test for install_snapshot() restoring state +- [ ] **Test**: Write test for round-trip snapshot/restore +- [ ] **Implement**: Implement snapshot creation via StateMachine::snapshot() +- [ ] **Implement**: Implement snapshot restoration via StateMachine::restore() +- [ ] **Refactor**: Test with bincode serialization + +**Files:** `crates/raft/src/state_machine_wrapper.rs` + +**Acceptance:** +- get_current_snapshot() creates valid snapshot +- install_snapshot() restores state correctly +- Round-trip preserves all state machine data +- Bincode serialization works correctly + +**Notes:** +- snapshot() returns self.inner.read().unwrap().snapshot() +- restore() calls self.inner.write().unwrap().restore(snapshot) +- Use existing bincode serialization from StateMachine + +--- + +#### Task 3.4: Comprehensive Idempotency Tests +**ID:** state_machine_4 +**Estimated Time:** 0.5-1 hour + +- [ ] **Test**: Write test applying same entry twice (should reject second) +- [ ] **Test**: Write test applying entries out of order (should reject) +- [ ] **Test**: Write test for gap in indices (should accept after gap) +- [ ] **Test**: Write test verifying last_applied tracking +- [ ] **Test**: Test idempotency after snapshot restoration + +**Files:** `crates/raft/src/state_machine_wrapper.rs` + +**Acceptance:** +- Duplicate entries rejected +- Out-of-order entries rejected +- last_applied tracked correctly +- Idempotency preserved after snapshot restore +- All idempotency guarantees verified + +**Notes:** +- Test with sequential indices: 1, 2, 3 +- Test duplicate: 1, 2, 2 (reject third) +- Test 
out-of-order: 1, 3, 2 (reject third) +- Verify StateMachine::apply() logic enforces this + +--- + +### Phase 4: Network Stub Implementation (1-2 hours) +**Dependencies:** Phase 1 (Type System) +**Can run in parallel with:** Phase 2 (Storage Layer), Phase 3 (State Machine) + +#### Task 4.1: Define StubNetwork Struct +**ID:** network_stub_1 +**Estimated Time:** 0.25-0.5 hour + +- [ ] **Test**: Write test for StubNetwork creation +- [ ] **Implement**: Create StubNetwork struct with node_id field +- [ ] **Implement**: Add new() constructor +- [ ] **Refactor**: Add basic tracing instrumentation + +**Files:** `crates/raft/src/network_stub.rs` + +**Acceptance:** +- StubNetwork compiles +- new() constructor works +- Basic logging in place + +**Notes:** +- Simple struct: { node_id: u64 } +- Add tracing::info in new() +- Prepare for async RaftNetwork trait + +--- + +#### Task 4.2: Implement RaftNetwork Trait +**ID:** network_stub_2 +**Estimated Time:** 0.5-1 hour + +- [ ] **Test**: Write test for send_append_entries() returning Ok +- [ ] **Test**: Write test for send_vote() returning Ok +- [ ] **Test**: Write test for send_install_snapshot() returning Ok +- [ ] **Implement**: Implement RaftNetwork trait with #[async_trait] +- [ ] **Implement**: Add tracing to each method showing it's a stub +- [ ] **Refactor**: Return Ok(Default::default()) for all methods + +**Files:** `crates/raft/src/network_stub.rs` + +**Acceptance:** +- RaftNetwork trait implemented +- All methods return Ok without panic +- Tracing shows stub calls +- Tests verify no-op behavior + +**Notes:** +- Use #[async_trait] for trait implementation +- Log at debug level: tracing::debug!("StubNetwork: ...") +- Return Ok(AppendEntriesResponse::default()), etc. 
+- Add TODO comments for future gRPC integration + +--- + +#### Task 4.3: Test Stub Network +**ID:** network_stub_3 +**Estimated Time:** 0.25-0.5 hour + +- [ ] **Test**: Write test verifying no panics on send calls +- [ ] **Test**: Write test checking tracing output (using tracing-subscriber-test) +- [ ] **Test**: Write test for concurrent send calls +- [ ] **Refactor**: Verify all network methods callable + +**Files:** `crates/raft/src/network_stub.rs` + +**Acceptance:** +- No panics on any send method +- Tracing output verified +- Concurrent calls work +- All tests pass + +**Notes:** +- Use tokio::test for async tests +- Verify Ok() responses +- Check tracing with subscriber test utilities + +--- + +### Phase 5: RaftNode Migration (4-5 hours) +**Dependencies:** Phase 2 (Storage Layer), Phase 3 (State Machine), Phase 4 (Network Stub) +**Can run in parallel with:** Nothing (requires all previous phases) + +#### Task 5.1: Update Cargo Dependencies +**ID:** node_migration_1 +**Estimated Time:** 0.5-1 hour + +- [ ] **Implement**: Remove raft = "0.7" dependency +- [ ] **Implement**: Remove prost-old dependency +- [ ] **Implement**: Remove slog dependency +- [ ] **Implement**: Add openraft = { version = "0.10", features = ["tokio"] } +- [ ] **Implement**: Add async-trait = "0.1" +- [ ] **Implement**: Add tracing = "0.1" +- [ ] **Test**: Run cargo tree | grep prost to verify conflict resolved +- [ ] **Refactor**: Run cargo build to verify compilation + +**Files:** `crates/raft/Cargo.toml`, `crates/storage/Cargo.toml` + +**Acceptance:** +- cargo tree shows single prost version (0.14) +- No prost version conflicts +- cargo build succeeds +- All dependencies compatible + +**Notes:** +- Keep tokio, serde, bincode, tonic (0.14), prost (0.14) +- Remove all raft-rs related dependencies +- Verify openraft uses prost 0.14 (matching tonic 0.14) + +--- + +#### Task 5.2: Migrate RaftNode Initialization +**ID:** node_migration_2 +**Estimated Time:** 1-1.5 hours + +- [ ] **Test**: 
Write async test for RaftNode::new() initialization +- [ ] **Implement**: Update RaftNode struct to hold openraft::Raft\ +- [ ] **Implement**: Create openraft Config with election/heartbeat timeouts +- [ ] **Implement**: Implement async new() creating Raft instance +- [ ] **Test**: Test initialization with single node +- [ ] **Refactor**: Test initialization with multiple peers + +**Files:** `crates/raft/src/node.rs` + +**Acceptance:** +- RaftNode::new() is async +- Creates openraft::Raft instance successfully +- Config parameters match existing values +- Initialization tests pass + +**Notes:** +- Config { election_timeout_min: 150, election_timeout_max: 300, heartbeat_interval: 50 } +- Use Raft::new(id, Arc::new(config), network, storage).await? +- Store raft: Raft\ in struct + +--- + +#### Task 5.3: Migrate propose() to client_write() +**ID:** node_migration_3 +**Estimated Time:** 1-1.5 hours + +- [ ] **Test**: Write async test for propose() submitting request +- [ ] **Test**: Write async test for propose() handling response +- [ ] **Implement**: Update propose() to be async fn +- [ ] **Implement**: Implement using raft.client_write(ClientWriteRequest::new(Request { data })) +- [ ] **Test**: Test successful proposal +- [ ] **Refactor**: Test proposal on non-leader (should fail or forward) + +**Files:** `crates/raft/src/node.rs` + +**Acceptance:** +- propose() is async +- Uses client_write() correctly +- Returns result properly +- Tests verify leader handling + +**Notes:** +- Signature: async fn propose(&self, data: Vec\) -> Result\<()\> +- Create request: ClientWriteRequest::new(Request { data }) +- Call: self.raft.client_write(request).await? 
+- Handle ClientWriteResponse + +--- + +#### Task 5.4: Migrate Remaining API Methods +**ID:** node_migration_4 +**Estimated Time:** 1-1.5 hours + +- [ ] **Test**: Write async test for is_leader() using metrics() +- [ ] **Test**: Write async test for leader_id() using current_leader() +- [ ] **Test**: Write test for get() direct state machine access +- [ ] **Implement**: Implement is_leader() via self.raft.is_leader().await +- [ ] **Implement**: Implement leader_id() via self.raft.current_leader().await +- [ ] **Implement**: Update get() to access storage.state_machine directly +- [ ] **Refactor**: Remove tick() and handle_ready() methods (no longer needed) + +**Files:** `crates/raft/src/node.rs` + +**Acceptance:** +- is_leader() works correctly +- leader_id() returns correct node ID or None +- get() reads from state machine +- tick() and handle_ready() removed +- All API tests pass + +**Notes:** +- is_leader(): self.raft.is_leader().await +- leader_id(): self.raft.current_leader().await +- get(): self.storage.state_machine.read().unwrap().get(key) +- Remove tick/handle_ready logic - openraft handles internally + +--- + +#### Task 5.5: Migrate RaftNode Tests +**ID:** node_migration_5 +**Estimated Time:** 1-1.5 hours + +- [ ] **Test**: Convert all node tests to async using #[tokio::test] +- [ ] **Test**: Remove tests for tick() and handle_ready() +- [ ] **Test**: Update propose tests to use client_write +- [ ] **Test**: Update leader election tests for openraft behavior +- [ ] **Test**: Fix any timing-related test issues +- [ ] **Refactor**: Verify all remaining tests pass + +**Files:** `crates/raft/src/node.rs` + +**Acceptance:** +- All node tests converted to async +- Obsolete tests removed (tick, handle_ready) +- propose → client_write tests working +- All tests pass consistently + +**Notes:** +- Use #[tokio::test] macro +- Add .await to all async calls +- Update test helpers to async fn +- Remove synchronous tick/ready loop tests + +--- + +### Phase 6: 
Integration & Cleanup (2-3 hours) +**Dependencies:** Phase 5 (Node Migration) +**Can run in parallel with:** Nothing (final validation phase) + +#### Task 6.1: End-to-End Integration Tests +**ID:** integration_1 +**Estimated Time:** 1-1.5 hours + +- [ ] **Test**: Write test for full propose → apply → get flow +- [ ] **Test**: Write test for snapshot creation and restoration +- [ ] **Test**: Write test for idempotency end-to-end +- [ ] **Test**: Write test for multiple proposals in sequence +- [ ] **Test**: Test state machine consistency after operations +- [ ] **Refactor**: Test error handling paths + +**Files:** `crates/raft/tests/integration_tests.rs` + +**Acceptance:** +- Full flow tests pass +- Snapshot round-trip works +- Idempotency verified end-to-end +- Error cases handled correctly +- Integration tests stable and repeatable + +**Notes:** +- Create test helpers: setup_test_node(), propose_and_verify() +- Test with realistic data patterns +- Verify state machine state matches expectations +- Test concurrent operations if possible + +--- + +#### Task 6.2: Verify Prost Conflict Resolved +**ID:** integration_2 +**Estimated Time:** 0.25-0.5 hour + +- [ ] **Test**: Run cargo tree | grep prost +- [ ] **Test**: Verify only prost 0.14 appears in tree +- [ ] **Test**: Check tonic compatibility (should use prost 0.14) +- [ ] **Test**: Verify openraft compatibility (should use prost 0.14) +- [ ] **Test**: Run cargo build --all-features to verify +- [ ] **Refactor**: Check for any warning about multiple prost versions + +**Files:** (No files - command line validation) + +**Acceptance:** +- cargo tree shows single prost version (0.14) +- No version conflict warnings +- All dependencies use same prost version +- Clean build with no conflicts + +**Notes:** +- Document prost version in plan +- Verify with: cargo tree | grep prost | sort | uniq +- Check openraft's prost dependency matches tonic's + +--- + +#### Task 6.3: Remove raft-rs Code +**ID:** integration_3 
+**Estimated Time:** 0.5-1 hour + +- [ ] **Implement**: Search codebase for 'use raft::' imports +- [ ] **Implement**: Remove old raft::Storage trait implementation +- [ ] **Implement**: Remove eraftpb imports and conversions (if any remain) +- [ ] **Implement**: Remove slog-related code +- [ ] **Implement**: Search for RawNode references +- [ ] **Implement**: Remove any dead code from migration +- [ ] **Refactor**: Run cargo clippy to find unused imports + +**Files:** `crates/raft/src/`, `crates/storage/src/` + +**Acceptance:** +- No raft-rs references in code +- No eraftpb imports +- No slog imports +- No unused imports or dead code +- cargo clippy passes cleanly + +**Notes:** +- Search: rg 'use raft::' --type rust +- Search: rg 'eraftpb' --type rust +- Search: rg 'RawNode' --type rust +- Remove old MemStorage raft::Storage impl if separate file + +--- + +#### Task 6.4: Update Documentation +**ID:** integration_4 +**Estimated Time:** 0.5-1 hour + +- [ ] **Implement**: Update crates/raft/README.md to mention openraft +- [ ] **Implement**: Update crates/storage/README.md with OpenRaftMemStorage +- [ ] **Implement**: Update module-level doc comments in lib.rs files +- [ ] **Implement**: Update examples if any exist +- [ ] **Implement**: Update docs/architecture/crates.md if needed +- [ ] **Refactor**: Remove references to raft-rs from comments + +**Files:** `crates/raft/README.md`, `crates/storage/README.md`, `docs/architecture/crates.md` + +**Acceptance:** +- All README files updated +- Module docs mention openraft +- No raft-rs references in docs +- Examples (if any) work with openraft +- Architecture docs reflect new structure + +**Notes:** +- Update dependency list in README +- Update code examples to show async usage +- Document breaking changes (async APIs) +- Note prost conflict resolution + +--- + +## Progress Tracking + +### Completed Tasks by Phase + +- [ ] **Phase 1: Type System & Configuration** (0/3 complete) + - [ ] Task 1.1: Define RaftTypeConfig + - 
[ ] Task 1.2: Create Type Conversions + - [ ] Task 1.3: Property Test Conversions + +- [ ] **Phase 2: Storage Layer Migration** (0/4 complete) + - [ ] Task 2.1: Implement RaftLogReader + - [ ] Task 2.2: Implement RaftSnapshotBuilder + - [ ] Task 2.3: Implement RaftStorage Trait + - [ ] Task 2.4: Migrate Storage Tests + +- [ ] **Phase 3: State Machine Integration** (0/4 complete) + - [ ] Task 3.1: Create StateMachine Wrapper + - [ ] Task 3.2: Implement apply() with Idempotency + - [ ] Task 3.3: Implement Snapshot Methods + - [ ] Task 3.4: Comprehensive Idempotency Tests + +- [ ] **Phase 4: Network Stub Implementation** (0/3 complete) + - [ ] Task 4.1: Define StubNetwork Struct + - [ ] Task 4.2: Implement RaftNetwork Trait + - [ ] Task 4.3: Test Stub Network + +- [ ] **Phase 5: RaftNode Migration** (0/5 complete) + - [ ] Task 5.1: Update Cargo Dependencies + - [ ] Task 5.2: Migrate RaftNode Initialization + - [ ] Task 5.3: Migrate propose() to client_write() + - [ ] Task 5.4: Migrate Remaining API Methods + - [ ] Task 5.5: Migrate RaftNode Tests + +- [ ] **Phase 6: Integration & Cleanup** (0/4 complete) + - [ ] Task 6.1: End-to-End Integration Tests + - [ ] Task 6.2: Verify Prost Conflict Resolved + - [ ] Task 6.3: Remove raft-rs Code + - [ ] Task 6.4: Update Documentation + +**Total Progress**: 0/24 tasks (0%) + +### Milestones + +- [ ] **Type system complete** → Foundation for parallel work (Phases 2-4) +- [ ] **Storage layer complete** → 85+ tests passing with openraft +- [ ] **State machine complete** → Idempotency validated end-to-end +- [ ] **Network stub complete** → Ready for future gRPC transport +- [ ] **Node migration complete** → No prost conflicts, all APIs async +- [ ] **Integration complete** → End-to-end tests passing, docs updated + +--- + +## Risk Mitigation + +### High-Risk Tasks + +#### 1. 
Task 3.2: Implement apply() - Idempotency Preservation +**Risk:** Loss of idempotency guarantees during state machine migration +**Impact:** HIGH - Could allow duplicate entries to corrupt state +**Mitigation:** +- Keep existing StateMachine::apply() logic unchanged +- Wrapper only delegates, doesn't modify behavior +- Add comprehensive idempotency tests before proceeding + +**Validation Gate:** +- All idempotency tests (Task 3.4) must pass before Phase 5 +- Test: duplicate entries rejected +- Test: out-of-order entries rejected +- Test: last_applied tracked correctly + +--- + +#### 2. Task 5.1: Update Dependencies - Prost Conflict Resolution +**Risk:** Prost version conflict (0.12 vs 0.14) blocks compilation +**Impact:** HIGH - Blocks entire Node Migration phase +**Mitigation:** +- Verify openraft 0.10 uses prost 0.14 +- Check tonic 0.14 compatibility +- Run `cargo tree | grep prost` immediately after dependency update + +**Validation Gate:** +- GO/NO-GO decision point: Single prost version in cargo tree +- If conflict persists: investigate openraft version or tonic downgrade +- Must resolve before Task 5.2 (Node Initialization) + +--- + +#### 3. Task 5.4: Migrate API - Public API Changes +**Risk:** Breaking changes to RaftNode API affect dependent crates +**Impact:** MEDIUM - Requires updates in kv/, sql/, seshat/ crates +**Mitigation:** +- Document all async signature changes +- Create migration guide for async API usage +- Update dependent crates in same commit + +**Validation Gate:** +- Integration tests verify API contracts unchanged (semantically) +- All async conversions use proper error handling +- No blocking calls in async contexts + +--- + +#### 4. 
Task 2.4: Migrate Storage Tests - Test Coverage Loss +**Risk:** Lost test coverage during async migration +**Impact:** MEDIUM - Reduced confidence in storage correctness +**Mitigation:** +- Track test count before/after: must maintain 85+ tests +- Convert tests incrementally, verify each batch passes +- Add new async-specific tests (race conditions, deadlocks) + +**Validation Gate:** +- Minimum 85 tests passing +- No flaky tests due to async timing +- Coverage report shows maintained or improved coverage + +--- + +### Validation Gates Summary + +**After Phase 1:** +- [ ] All type conversions compile +- [ ] Property tests pass (1000+ random inputs) +- [ ] No panics in type conversion tests + +**After Phase 2:** +- [ ] 85+ storage tests passing +- [ ] No async deadlocks or race conditions +- [ ] RaftLogReader, RaftSnapshotBuilder, RaftStorage fully implemented + +**After Phase 3:** +- [ ] All idempotency tests pass +- [ ] StateMachine wrapper preserves existing behavior +- [ ] Snapshot round-trip verified + +**After Phase 4:** +- [ ] Network stub compiles and links +- [ ] All send methods return Ok without panic +- [ ] Ready for future gRPC integration + +**After Phase 5:** +- [ ] Single prost version (0.14) in cargo tree +- [ ] All RaftNode tests passing (async) +- [ ] No raft-rs imports remain + +**After Phase 6:** +- [ ] End-to-end integration tests pass +- [ ] Zero clippy warnings +- [ ] Documentation updated +- [ ] Migration complete + +--- + +## Fast Feedback Loops + +### After Each Task +Run quick unit tests to verify immediate correctness: + +```bash +# Fast feedback (<30 seconds) +cargo test --lib --package raft +cargo test --lib --package storage +``` + +### After Each Phase +Run full test suite to ensure integration correctness: + +```bash +# Full validation (1-2 minutes) +cargo test --all + +# Check for unused imports and dead code +cargo clippy --all-targets +``` + +### After Dependency Updates (Task 5.1) +Critical validation before proceeding: + 
+```bash +# Verify prost conflict resolution (<5 seconds) +cargo tree | grep prost | sort | uniq + +# Expected output: Single line with prost v0.14.x +# If multiple versions appear: STOP and investigate +``` + +### Continuous Validation +Run on every commit: + +```bash +# Standard validation pipeline +cargo build --all-features +cargo test --all +cargo clippy --all-targets -- -D warnings +cargo fmt -- --check +``` + +--- + +## Next Steps + +### Getting Started + +**1. Begin with Phase 1 (Type System)** +This is the foundation for all other work. No other phase can start until Phase 1 completes. + +```bash +# Command to begin +/spec:implement openraft type_system + +# Or start with first task +/spec:implement openraft 1.1 +``` + +**2. After Phase 1: Launch Parallel Tracks** +Once type system is complete, three agents can work concurrently: + +```bash +# Agent 1: Storage Layer +/spec:implement openraft storage_layer + +# Agent 2: State Machine (parallel) +/spec:implement openraft state_machine + +# Agent 3: Network Stub (parallel) +/spec:implement openraft network_stub +``` + +**3. Converge on Node Migration** +After Phases 2-4 complete, all agents work on Phase 5: + +```bash +# All agents: Node Migration +/spec:implement openraft node_migration +``` + +**4. 
Final Integration** +Complete with Phase 6 validation and cleanup: + +```bash +# All agents: Integration & Cleanup +/spec:implement openraft integration +``` + +### Tracking Progress + +Update this file as you complete tasks: + +```bash +# After each task, mark as complete +/spec:progress openraft + +# View overall feature progress +/spec:progress openraft verbose +``` + +--- + +## Appendix: Quick Reference + +### Key Files Modified +- `crates/raft/src/types.rs` - Type definitions and conversions +- `crates/storage/src/openraft_storage.rs` - Storage trait implementation +- `crates/raft/src/state_machine_wrapper.rs` - State machine wrapper +- `crates/raft/src/network_stub.rs` - Stub network implementation +- `crates/raft/src/node.rs` - RaftNode migration +- `crates/raft/Cargo.toml` - Dependency updates +- `crates/storage/Cargo.toml` - Dependency updates + +### Critical Dependencies +- openraft = "0.10" (with tokio feature) +- async-trait = "0.1" +- tracing = "0.1" +- tokio (existing, for async runtime) +- prost = "0.14" (must match tonic 0.14) + +### Success Criteria +- [ ] 85+ tests passing +- [ ] Single prost version (0.14) +- [ ] Zero clippy warnings +- [ ] Zero compilation errors +- [ ] All idempotency tests pass +- [ ] End-to-end integration tests pass +- [ ] Documentation updated + +--- + +**Created:** 2025-10-26 +**Feature:** openraft-migration +**Estimated Single-Agent Time:** 15-21 hours +**Estimated Multi-Agent Time:** 12-16 hours (3 agents) +**Current Status:** Ready to begin diff --git a/docs/specs/rocksdb/architectural-solutions.md b/docs/specs/rocksdb/architectural-solutions.md new file mode 100644 index 0000000..b3bf341 --- /dev/null +++ b/docs/specs/rocksdb/architectural-solutions.md @@ -0,0 +1,1490 @@ +# RocksDB Storage Layer: Architectural Solutions + +**Date**: 2025-10-26 +**Author**: Principal Engineer (Architect Agent) +**Purpose**: Address critical architectural issues in RocksDB specification + +--- + +## Executive Summary + +This document 
provides implementation-ready solutions for two critical architectural issues identified in the RocksDB storage layer specification review: + +1. **Issue 2**: Inconsistent data structure dependencies and unclear serialization boundaries +2. **Issue 3**: Missing atomic log index management and validation logic + +Both issues are resolved with clear crate boundaries, explicit API contracts, and detailed implementation strategies compatible with openraft's storage traits. + +--- + +## Issue 2: Crate Boundary Clarification + +### Problem Statement + +The current design creates ambiguity by: +- Stating storage "stores bytes, no knowledge of data structures" (line 599 in design.md) +- Referencing specific types like `VersionedLogEntry`, `RaftHardState` throughout storage APIs +- Unclear WHERE serialization/deserialization happens +- Mixed responsibilities between storage and raft crates + +### Root Cause Analysis + +The confusion stems from documenting storage operations with high-level type names (e.g., `VersionedLogEntry`) when the storage crate should only deal with raw bytes. This makes it unclear: + +1. Does storage crate depend on `common` crate for type definitions? +2. Who is responsible for serialization: storage or raft? +3. What happens if storage needs to inspect data (e.g., for validation)? + +### Architectural Solution + +#### 1. Strict Crate Boundary Definition + +**The Golden Rule**: Storage crate operates exclusively on `&[u8]` and `Vec`. Zero knowledge of domain types. 
+ +``` +┌─────────────────────────────────────────────────────────────┐ +│ Raft Crate (seshat-raft) │ +│ - Owns: VersionedLogEntry, RaftHardState, SnapshotMetadata │ +│ - Responsibility: Serialize/deserialize using bincode │ +│ - Calls: storage.put(cf, key, &serialized_bytes) │ +└─────────────────────────────────────────────────────────────┘ + │ + │ Pure byte interface + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ Storage Crate (seshat-storage) │ +│ - Owns: Storage, ColumnFamily, WriteBatch, StorageIterator │ +│ - Responsibility: RocksDB operations on bytes │ +│ - NO dependency on common crate types │ +│ - API accepts only: &[u8], Vec │ +└─────────────────────────────────────────────────────────────┘ + │ + │ RocksDB API + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ RocksDB (rocksdb crate) │ +└─────────────────────────────────────────────────────────────┘ +``` + +#### 2. Revised Storage API (Purely Bytes) + +**File**: `crates/storage/src/lib.rs` + +```rust +/// Pure persistence layer - operates only on bytes +pub struct Storage { + db: Arc, + cf_handles: HashMap>>, + config: StorageOptions, +} + +impl Storage { + // ============================================================ + // BASIC OPERATIONS - Pure byte interface + // ============================================================ + + /// Get value by key. Returns None if key doesn't exist. + /// + /// # Arguments + /// * `cf` - Column family to read from + /// * `key` - Raw key bytes + /// + /// # Returns + /// * `Ok(Some(value))` - Key exists, returns value bytes + /// * `Ok(None)` - Key does not exist + /// * `Err(_)` - RocksDB error + pub fn get(&self, cf: ColumnFamily, key: &[u8]) -> Result>>; + + /// Put key-value pair. + /// + /// # Durability + /// If `cf.requires_fsync()` is true (raft_state CFs), this call will + /// not return until data is synced to disk. 
+ pub fn put(&self, cf: ColumnFamily, key: &[u8], value: &[u8]) -> Result<()>; + + /// Delete key. + pub fn delete(&self, cf: ColumnFamily, key: &[u8]) -> Result<()>; + + /// Check if key exists (uses bloom filter optimization). + pub fn exists(&self, cf: ColumnFamily, key: &[u8]) -> Result; + + // ============================================================ + // BATCH OPERATIONS - Atomic multi-operation commits + // ============================================================ + + /// Execute atomic batch write across multiple column families. + /// + /// All operations succeed or all fail. No partial writes visible. + /// If any CF in batch requires fsync, entire batch is synced. + pub fn batch_write(&self, batch: WriteBatch) -> Result<()>; + + // ============================================================ + // LOG OPERATIONS - Sequential index management + // ============================================================ + + /// Append log entry with automatic index validation. + /// + /// # Index Validation + /// - First entry: `index` must be 1 + /// - Subsequent entries: `index` must be `last_index + 1` + /// - Gaps or duplicates return `Err(StorageError::InvalidLogIndex)` + /// + /// # Atomicity + /// Validation and write are NOT atomic. Concurrent appends may cause + /// race conditions. Caller (raft crate) must serialize append calls. + /// + /// # Arguments + /// * `cf` - Log column family (system_raft_log or data_raft_log) + /// * `index` - Log index (must be sequential, starts at 1) + /// * `entry_bytes` - Serialized log entry (caller handles serialization) + /// + /// # Returns + /// * `Ok(())` - Entry appended successfully + /// * `Err(InvalidLogIndex)` - Index validation failed + /// * `Err(RocksDb(_))` - Underlying storage error + pub fn append_log_entry( + &self, + cf: ColumnFamily, + index: u64, + entry_bytes: &[u8], + ) -> Result<()>; + + /// Get range of log entries [start, end) (end exclusive). 
+    ///
+    /// # Returns
+    /// Vec of serialized entries in index order. Empty vec if no entries.
+    /// Missing indices in range are NOT included (returns only existing entries).
+    pub fn get_log_range(
+        &self,
+        cf: ColumnFamily,
+        start: u64,
+        end: u64,
+    ) -> Result<Vec<Vec<u8>>>;
+
+    /// Delete all log entries with index < truncate_index.
+    ///
+    /// Used for log compaction after snapshot. Atomic operation.
+    pub fn truncate_log_before(&self, cf: ColumnFamily, truncate_index: u64) -> Result<()>;
+
+    /// Get highest log index in CF.
+    ///
+    /// # Returns
+    /// * `Ok(Some(index))` - Highest index found
+    /// * `Ok(None)` - No entries in log (empty log)
+    pub fn get_last_log_index(&self, cf: ColumnFamily) -> Result<Option<u64>>;
+
+    // ============================================================
+    // SNAPSHOT OPERATIONS
+    // ============================================================
+
+    /// Create RocksDB checkpoint at path.
+    ///
+    /// Uses hard links - O(1) time, initially zero additional space.
+    pub fn create_snapshot(&self, path: &Path) -> Result<()>;
+
+    /// Restore from checkpoint directory.
+    ///
+    /// CAUTION: Replaces current DB state. Backup before calling.
+    pub fn restore_snapshot(&self, path: &Path) -> Result<()>;
+
+    // ============================================================
+    // UTILITIES
+    // ============================================================
+
+    /// Create iterator for range scans within CF.
+    pub fn iterator(&self, cf: ColumnFamily, mode: IteratorMode) -> Result<StorageIterator>;
+
+    /// Force fsync all pending writes.
+    pub fn sync(&self) -> Result<()>;
+
+    /// Manual compaction for CF in range [start, end).
+    pub fn compact_range(
+        &self,
+        cf: ColumnFamily,
+        start: Option<&[u8]>,
+        end: Option<&[u8]>,
+    ) -> Result<()>;
+}
+```
+
+**Key Changes**:
+1. All comments reference "bytes" not specific types
+2. `append_log_entry` takes `entry_bytes: &[u8]` not `entry: &VersionedLogEntry`
+3. `get_log_range` returns `Vec<Vec<u8>>` not `Vec<VersionedLogEntry>`
+4. 
Zero mention of domain types in API surface + +#### 3. Raft Crate Integration Pattern + +**File**: `crates/raft/src/storage_adapter.rs` + +```rust +use seshat_common::{VersionedLogEntry, RaftHardState, SnapshotMetadata}; +use seshat_storage::{Storage, ColumnFamily, WriteBatch, StorageError}; + +/// Adapter that implements openraft storage traits using seshat-storage. +/// +/// This struct OWNS serialization/deserialization logic. +pub struct RaftStorageAdapter { + storage: Storage, + shard_id: u64, +} + +impl RaftStorageAdapter { + pub fn new(storage: Storage, shard_id: u64) -> Self { + Self { storage, shard_id } + } + + /// Select appropriate log CF based on shard_id. + fn log_cf(&self) -> ColumnFamily { + if self.shard_id == 0 { + ColumnFamily::SystemRaftLog + } else { + ColumnFamily::DataRaftLog + } + } + + /// Select appropriate state CF based on shard_id. + fn state_cf(&self) -> ColumnFamily { + if self.shard_id == 0 { + ColumnFamily::SystemRaftState + } else { + ColumnFamily::DataRaftState + } + } + + /// Append log entries with serialization. + /// + /// Serializes entries and delegates to storage layer. + pub async fn append_entries(&self, entries: Vec) -> Result<()> { + let cf = self.log_cf(); + + for entry in entries { + // SERIALIZATION HAPPENS HERE in raft crate using protobuf + let entry_bytes = entry.encode_to_vec(); + + // Storage receives pure bytes + self.storage.append_log_entry(cf, entry.index, &entry_bytes)?; + } + + Ok(()) + } + + /// Get log entries with deserialization. 
+ pub async fn get_entries(&self, start: u64, end: u64) -> Result> { + let cf = self.log_cf(); + + // Storage returns pure bytes + let entry_bytes_vec = self.storage.get_log_range(cf, start, end)?; + + // DESERIALIZATION HAPPENS HERE in raft crate using protobuf + let entries = entry_bytes_vec + .into_iter() + .map(|bytes| { + VersionedLogEntry::decode(&bytes[..]) + .map_err(|e| RaftError::Deserialization(e.to_string())) + }) + .collect::>>()?; + + Ok(entries) + } + + /// Save hard state with serialization. + pub async fn save_hard_state(&self, hard_state: RaftHardState) -> Result<()> { + let cf = self.state_cf(); + + // SERIALIZATION HAPPENS HERE using protobuf + let bytes = hard_state.encode_to_vec(); + + // Storage receives pure bytes, handles fsync automatically + self.storage.put(cf, b"state", &bytes)?; + + Ok(()) + } + + /// Load hard state with deserialization. + pub async fn load_hard_state(&self) -> Result> { + let cf = self.state_cf(); + + // Storage returns pure bytes + let bytes = self.storage.get(cf, b"state")?; + + match bytes { + Some(b) => { + // DESERIALIZATION HAPPENS HERE using protobuf + let state = RaftHardState::decode(&b[..]) + .map_err(|e| RaftError::Deserialization(e.to_string()))?; + Ok(Some(state)) + } + None => Ok(None), + } + } +} +``` + +**Call Flow Example - Log Append**: + +``` +┌────────────────────────────────────────────────────────────────────┐ +│ 1. openraft::Raft │ +│ raft.append_entries(entries: Vec) │ +└────────────────────────────────────────────────────────────────────┘ + │ + │ Calls storage trait method + ▼ +┌────────────────────────────────────────────────────────────────────┐ +│ 2. 
RaftStorageAdapter (raft crate)                                  │
+│    - OWNS: VersionedLogEntry struct definition                     │
+│    - CONVERTS: Entry → VersionedLogEntry                           │
+│    - SERIALIZES: entry.encode_to_vec() (prost) → Vec<u8>           │
+└────────────────────────────────────────────────────────────────────┘
+                          │
+                          │ Calls storage API with bytes
+                          ▼
+┌────────────────────────────────────────────────────────────────────┐
+│ 3. Storage (storage crate)                                         │
+│    storage.append_log_entry(cf, index, &[u8])                      │
+│    - VALIDATES: Sequential index                                   │
+│    - WRITES: RocksDB put with key "log:{index}"                    │
+│    - NO KNOWLEDGE: of VersionedLogEntry structure                  │
+└────────────────────────────────────────────────────────────────────┘
+                          │
+                          │ RocksDB API
+                          ▼
+┌────────────────────────────────────────────────────────────────────┐
+│ 4. RocksDB                                                         │
+│    db.put_cf(cf_handle, b"log:142", serialized_bytes)              │
+└────────────────────────────────────────────────────────────────────┘
+```
+
+#### 4. Responsibility Matrix
+
+| Responsibility | Storage Crate | Raft Crate | Common Crate |
+|----------------|---------------|------------|--------------|
+| **Data Types** | ColumnFamily, WriteBatch, StorageIterator, StorageOptions | VersionedLogEntry, RaftHardState, SnapshotMetadata | Type definitions shared across crates |
+| **Serialization** | ❌ No | ✅ Yes (prost/protobuf) | N/A |
+| **Deserialization** | ❌ No | ✅ Yes (prost/protobuf) | N/A |
+| **Raft Semantics** | ❌ No | ✅ Yes | N/A |
+| **Index Validation** | ✅ Yes (sequential) | ❌ No | N/A |
+| **Fsync Logic** | ✅ Yes (per CF) | ❌ No | N/A |
+| **Batch Atomicity** | ✅ Yes | ❌ No | N/A |
+| **Column Family Selection** | ❌ No | ✅ Yes | N/A |
+| **Snapshot Metadata** | ❌ No | ✅ Yes | N/A |
+| **Error Handling** | RocksDB errors | Raft errors | Common error types |
+
+#### 5. 
Dependency Graph
+
+```
+seshat-storage (crates/storage)
+├── Dependencies:
+│   ├── rocksdb = "0.22"
+│   ├── thiserror = "1.0"
+│   └── tracing = "0.1"
+└── NO dependency on seshat-common
+
+seshat-raft (crates/raft)
+├── Dependencies:
+│   ├── seshat-storage (local)
+│   ├── seshat-common (local)
+│   ├── openraft = "0.10"
+│   ├── prost = "0.14"
+│   ├── serde = { version = "1.0", features = ["derive"] }
+│   └── tokio = { version = "1", features = ["full"] }
+
+seshat-common (crates/common)
+├── Dependencies:
+│   ├── prost = "0.14"
+│   ├── serde = { version = "1.0", features = ["derive"] }
+│   └── thiserror = "1.0"
+```
+
+#### 6. Testing Strategy for Boundaries
+
+**Unit Test - Storage Crate** (`crates/storage/tests/byte_interface_test.rs`):
+
+```rust
+#[test]
+fn test_storage_accepts_arbitrary_bytes() {
+    let storage = Storage::new(test_options()).unwrap();
+
+    // Storage should accept ANY bytes without caring about structure
+    let random_bytes: Vec<u8> = vec![0xFF, 0xAB, 0xCD, 0x12, 0x34];
+
+    storage.put(ColumnFamily::DataKv, b"test_key", &random_bytes).unwrap();
+
+    let retrieved = storage.get(ColumnFamily::DataKv, b"test_key").unwrap();
+    assert_eq!(retrieved, Some(random_bytes));
+
+    // Storage has NO IDEA if this is valid VersionedLogEntry or garbage
+}
+
+#[test]
+fn test_storage_does_not_validate_structure() {
+    let storage = Storage::new(test_options()).unwrap();
+
+    // This is NOT a valid VersionedLogEntry, but storage doesn't care
+    let invalid_bytes = vec![0x00];
+
+    // Should succeed - storage doesn't validate
+    storage.append_log_entry(ColumnFamily::DataRaftLog, 1, &invalid_bytes).unwrap();
+
+    let retrieved = storage.get_log_range(ColumnFamily::DataRaftLog, 1, 2).unwrap();
+    assert_eq!(retrieved, vec![invalid_bytes]);
+}
+```
+
+**Integration Test - Raft Crate** (`crates/raft/tests/serialization_boundary_test.rs`):
+
+```rust
+#[tokio::test]
+async fn test_raft_adapter_handles_serialization() {
+    let storage = Storage::new(test_options()).unwrap();
+    let adapter = 
RaftStorageAdapter::new(storage, 0); + + // Create properly typed entry + let entry = VersionedLogEntry { + version: 1, + term: 5, + index: 1, + entry_type: EntryType::Normal, + data: vec![1, 2, 3], + }; + + // Adapter serializes before passing to storage + adapter.append_entries(vec![entry.clone()]).await.unwrap(); + + // Adapter deserializes when retrieving + let retrieved = adapter.get_entries(1, 2).await.unwrap(); + + assert_eq!(retrieved.len(), 1); + assert_eq!(retrieved[0].index, 1); + assert_eq!(retrieved[0].term, 5); +} + +#[tokio::test] +async fn test_storage_deserialization_error_bubbles_up() { + let storage = Storage::new(test_options()).unwrap(); + + // Corrupt storage by writing invalid bytes directly + let invalid_bytes = vec![0xFF, 0xFF, 0xFF]; + storage.append_log_entry(ColumnFamily::SystemRaftLog, 1, &invalid_bytes).unwrap(); + + // Adapter should fail with deserialization error + let adapter = RaftStorageAdapter::new(storage, 0); + let result = adapter.get_entries(1, 2).await; + + assert!(matches!(result, Err(RaftError::Deserialization(_)))); +} +``` + +--- + +## Issue 3: Atomic Log Index Management + +### Problem Statement + +The `append_log_entry` method defines `InvalidLogIndex` error but doesn't specify: +1. Exact validation logic (what makes an index invalid?) +2. How to handle concurrent appends +3. How to track last_log_index efficiently +4. Edge cases: first entry, gaps, duplicates +5. Recovery strategies for validation failures + +### Root Cause Analysis + +Log index validation is critical for Raft safety: +- **Gaps** in log create inconsistencies (log[5] exists but log[4] missing) +- **Duplicates** indicate potential data corruption or race conditions +- **Non-sequential** writes violate Raft's sequential log guarantee + +Current design mentions validation but doesn't define the "how". + +### Architectural Solution + +#### 1. 
Index Validation Algorithm + +**File**: `crates/storage/src/lib.rs` (append_log_entry implementation) + +```rust +impl Storage { + /// Append log entry with sequential index validation. + /// + /// # Index Rules + /// 1. First entry: index MUST be 1 (not 0, Raft logs are 1-indexed) + /// 2. Subsequent entries: index MUST equal last_index + 1 + /// 3. Gaps or duplicates return InvalidLogIndex error + /// + /// # Concurrency + /// This method is NOT thread-safe for concurrent appends to same CF. + /// Caller must serialize append calls (single writer pattern). + /// + /// # Performance + /// - Caches last_log_index in memory (updated on each append) + /// - On first call or cache miss, queries RocksDB for last key + /// - Validation is O(1) using cached index + /// + /// # Error Recovery + /// - InvalidLogIndex: Caller should re-sync log from leader + /// - RocksDB errors: Retry with exponential backoff + pub fn append_log_entry( + &self, + cf: ColumnFamily, + index: u64, + entry_bytes: &[u8], + ) -> Result<()> { + // CRITICAL: Validate CF is a log column family + if !cf.is_log_cf() { + return Err(StorageError::InvalidColumnFamily { + cf: cf.as_str().to_string(), + reason: "append_log_entry only works on *_raft_log CFs".to_string(), + }); + } + + // Get or initialize last log index for this CF + let current_last_index = self.get_cached_last_log_index(cf)?; + + // Validate sequential index + let expected_index = match current_last_index { + None => { + // Empty log - first entry must be index 1 + if index != 1 { + return Err(StorageError::InvalidLogIndex { + cf: cf.as_str().to_string(), + expected: 1, + got: index, + reason: "First log entry must have index 1".to_string(), + }); + } + 1 + } + Some(last) => { + let expected = last + 1; + if index != expected { + // Gap or duplicate detected + let reason = if index <= last { + format!("Duplicate index (last was {})", last) + } else { + format!("Gap in log (expected {}, got {})", expected, index) + }; + + return 
Err(StorageError::InvalidLogIndex { + cf: cf.as_str().to_string(), + expected, + got: index, + reason, + }); + } + expected + } + }; + + // Build key: "log:{index}" + let key = format_log_key(index); + + // Write to RocksDB + let cf_handle = self.get_cf_handle(cf)?; + + // Apply fsync if this CF requires it (it doesn't for log CFs, only state CFs) + let write_opts = WriteOptions::default(); + + self.db.put_cf_opt(cf_handle, key.as_bytes(), entry_bytes, &write_opts) + .map_err(|e| StorageError::RocksDb(e))?; + + // Update cached last index AFTER successful write + self.update_cached_last_log_index(cf, index)?; + + tracing::debug!( + cf = cf.as_str(), + index = index, + size_bytes = entry_bytes.len(), + "Appended log entry" + ); + + Ok(()) + } + + /// Get cached last log index, initializing from RocksDB if needed. + /// + /// # Returns + /// - `Ok(Some(index))` - Last index found (from cache or DB) + /// - `Ok(None)` - Empty log + /// - `Err(_)` - RocksDB error + fn get_cached_last_log_index(&self, cf: ColumnFamily) -> Result> { + // Check in-memory cache first + let cache_guard = self.last_log_index_cache.read().unwrap(); + + if let Some(&cached_index) = cache_guard.get(&cf) { + return Ok(Some(cached_index)); + } + + drop(cache_guard); // Release read lock before expensive DB operation + + // Cache miss - query RocksDB for last key + let last_index = self.get_last_log_index_from_db(cf)?; + + // Update cache + if let Some(index) = last_index { + let mut cache_guard = self.last_log_index_cache.write().unwrap(); + cache_guard.insert(cf, index); + } + + Ok(last_index) + } + + /// Query RocksDB for last log index (expensive operation). 
+ fn get_last_log_index_from_db(&self, cf: ColumnFamily) -> Result> { + let cf_handle = self.get_cf_handle(cf)?; + + // Create reverse iterator (starts at end of CF) + let mut iter = self.db.iterator_cf(cf_handle, IteratorMode::End); + + // Get last key-value pair + match iter.next() { + Some(Ok((key_bytes, _))) => { + // Parse key: "log:12345" -> 12345 + let key_str = String::from_utf8_lossy(&key_bytes); + + if let Some(index_str) = key_str.strip_prefix("log:") { + let index = index_str.parse::() + .map_err(|e| StorageError::CorruptedData { + cf: cf.as_str().to_string(), + key: key_bytes.to_vec(), + reason: format!("Invalid log key format: {}", e), + })?; + + Ok(Some(index)) + } else { + // Key doesn't match expected format + Err(StorageError::CorruptedData { + cf: cf.as_str().to_string(), + key: key_bytes.to_vec(), + reason: "Log key missing 'log:' prefix".to_string(), + }) + } + } + Some(Err(e)) => Err(StorageError::RocksDb(e)), + None => Ok(None), // Empty CF + } + } + + /// Update cached last log index after successful append. + fn update_cached_last_log_index(&self, cf: ColumnFamily, new_index: u64) -> Result<()> { + let mut cache_guard = self.last_log_index_cache.write().unwrap(); + cache_guard.insert(cf, new_index); + Ok(()) + } +} + +/// Format log key from index with zero-padding for correct lexicographic ordering. +/// +/// # Format +/// Returns a 24-byte key: "log:" (4 bytes) + zero-padded index (20 digits) +/// Example: index 1 -> "log:00000000000000000001" +/// +/// # Rationale +/// RocksDB uses lexicographic byte ordering. Without zero-padding: +/// - Wrong order: "log:1", "log:10", "log:100", "log:2" (incorrect!) +/// - Correct order: "log:00000000000000000001", "log:00000000000000000002", ... "log:00000000000000000010" +/// +/// 20 digits supports indices up to 10^20 - 1 (far beyond practical limits). +fn format_log_key(index: u64) -> String { + format!("log:{:020}", index) +} + +/// Parse log index from key bytes. 
+///
+/// # Format
+/// Expects "log:{20-digit-zero-padded-index}"
+/// Example: "log:00000000000000000042" -> 42
+fn parse_log_index(key: &[u8]) -> Result<u64> {
+    let key_str = String::from_utf8_lossy(key);
+
+    key_str
+        .strip_prefix("log:")
+        .and_then(|s| s.parse::<u64>().ok())
+        .ok_or_else(|| StorageError::CorruptedData {
+            cf: "unknown".to_string(),
+            key: key.to_vec(),
+            reason: "Invalid log key format (expected 'log:{20-digit-index}')".to_string(),
+        })
+}
+```
+
+#### 2. Storage Struct with Index Cache
+
+**File**: `crates/storage/src/lib.rs`
+
+```rust
+use std::collections::HashMap;
+use std::sync::{Arc, RwLock};
+
+/// Storage instance with cached last log indices.
+pub struct Storage {
+    db: Arc<DB>,
+    cf_handles: HashMap<ColumnFamily, Arc<BoundColumnFamily<'static>>>,
+    config: StorageOptions,
+
+    /// Cache of last log index per CF.
+    /// Key: ColumnFamily (only log CFs)
+    /// Value: Last appended log index
+    ///
+    /// Synchronized with RwLock:
+    /// - Read lock for cached lookups (hot path)
+    /// - Write lock for cache updates (after successful append)
+    ///
+    /// Cache is invalidated on:
+    /// - truncate_log_before (resets to new last index)
+    /// - restore_snapshot (clears cache, forces re-query)
+    last_log_index_cache: Arc<RwLock<HashMap<ColumnFamily, u64>>>,
+}
+
+impl Storage {
+    pub fn new(options: StorageOptions) -> Result<Self> {
+        // ... existing initialization code ...
+
+        let storage = Self {
+            db: Arc::new(db),
+            cf_handles,
+            config: options,
+            last_log_index_cache: Arc::new(RwLock::new(HashMap::new())),
+        };
+
+        // Warm up cache by querying last index for each log CF
+        storage.warm_up_index_cache()?;
+
+        Ok(storage)
+    }
+
+    /// Pre-populate index cache on startup.
+    fn warm_up_index_cache(&self) -> Result<()> {
+        let log_cfs = vec![
+            ColumnFamily::SystemRaftLog,
+            ColumnFamily::DataRaftLog,
+        ];
+
+        for cf in log_cfs {
+            if let Some(last_index) = self.get_last_log_index_from_db(cf)?
{ + let mut cache = self.last_log_index_cache.write().unwrap(); + cache.insert(cf, last_index); + + tracing::info!( + cf = cf.as_str(), + last_index = last_index, + "Warmed up log index cache" + ); + } + } + + Ok(()) + } + + /// Clear index cache (called after snapshot restoration). + pub(crate) fn invalidate_index_cache(&self) { + let mut cache = self.last_log_index_cache.write().unwrap(); + cache.clear(); + tracing::debug!("Invalidated log index cache"); + } +} +``` + +#### 3. Edge Case Handling + +**Scenario 1: First Entry in Empty Log** + +```rust +// Initial state: empty log +assert_eq!(storage.get_last_log_index(ColumnFamily::DataRaftLog)?, None); + +// First append MUST use index 1 +storage.append_log_entry(ColumnFamily::DataRaftLog, 1, b"entry_1")?; // ✅ OK + +// This would fail (index 0 not allowed) +storage.append_log_entry(ColumnFamily::DataRaftLog, 0, b"entry_0")?; // ❌ Error +``` + +**Scenario 2: Detecting Gap** + +```rust +// State: last_index = 5 +storage.append_log_entry(ColumnFamily::DataRaftLog, 5, b"entry_5")?; + +// Next append MUST be index 6 +storage.append_log_entry(ColumnFamily::DataRaftLog, 7, b"entry_7")?; // ❌ Error + +// Error returned: +// InvalidLogIndex { expected: 6, got: 7, reason: "Gap in log (expected 6, got 7)" } +``` + +**Scenario 3: Detecting Duplicate** + +```rust +// State: last_index = 10 +storage.append_log_entry(ColumnFamily::DataRaftLog, 10, b"entry_10")?; + +// Duplicate append +storage.append_log_entry(ColumnFamily::DataRaftLog, 10, b"entry_10_dup")?; // ❌ Error + +// Error returned: +// InvalidLogIndex { expected: 11, got: 10, reason: "Duplicate index (last was 10)" } +``` + +**Scenario 4: Log Truncation (Compaction)** + +```rust +// State: log indices [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] +// Last index = 10 + +// Compact: delete entries before index 8 (keep 8, 9, 10) +storage.truncate_log_before(ColumnFamily::DataRaftLog, 8)?; + +// After truncation: log indices [8, 9, 10] +// Last index still = 10 + +// Next append 
continues from 10 +storage.append_log_entry(ColumnFamily::DataRaftLog, 11, b"entry_11")?; // ✅ OK +``` + +**Scenario 5: Concurrent Append Attempts (Race Condition)** + +```rust +// Thread 1 and Thread 2 both try to append index 6 +// State: last_index = 5 + +// Thread 1: Gets cached last_index = 5, expects index 6 +// Thread 2: Gets cached last_index = 5, expects index 6 (same!) + +// Thread 1 writes first: +storage.append_log_entry(ColumnFamily::DataRaftLog, 6, b"thread1_data")?; // ✅ OK +// Cache updated: last_index = 6 + +// Thread 2 writes next: +storage.append_log_entry(ColumnFamily::DataRaftLog, 6, b"thread2_data")?; // ❌ Error +// Error: InvalidLogIndex { expected: 7, got: 6, reason: "Duplicate index (last was 6)" } + +// SOLUTION: Caller (raft crate) MUST serialize append calls +``` + +#### 4. Truncate Implementation with Cache Update + +**File**: `crates/storage/src/lib.rs` + +```rust +impl Storage { + /// Delete all log entries with index < truncate_index. + /// + /// # Atomicity + /// Uses WriteBatch for atomic deletion of all matching keys. + /// + /// # Cache Behavior + /// Does NOT modify cache unless entire log is deleted. + /// Cache remains valid because last_index doesn't change + /// (we only delete prefix of log). 
+ pub fn truncate_log_before(&self, cf: ColumnFamily, truncate_index: u64) -> Result<()> { + if !cf.is_log_cf() { + return Err(StorageError::InvalidColumnFamily { + cf: cf.as_str().to_string(), + reason: "truncate_log_before only works on *_raft_log CFs".to_string(), + }); + } + + // Get current last index + let last_index = self.get_cached_last_log_index(cf)?; + + // Edge case: truncate entire log + if let Some(last) = last_index { + if truncate_index > last { + // Truncating past end of log - delete everything + tracing::warn!( + cf = cf.as_str(), + truncate_index = truncate_index, + last_index = last, + "Truncating entire log (truncate_index > last_index)" + ); + + // Build batch to delete all entries + let mut batch = WriteBatch::default(); + let cf_handle = self.get_cf_handle(cf)?; + + for index in 1..=last { + let key = format_log_key(index); + batch.delete_cf(cf_handle, key.as_bytes()); + } + + self.db.write(batch)?; + + // Invalidate cache - log is now empty + let mut cache = self.last_log_index_cache.write().unwrap(); + cache.remove(&cf); + + tracing::info!(cf = cf.as_str(), "Truncated entire log, cache invalidated"); + return Ok(()); + } + } + + // Normal case: truncate prefix [1, truncate_index) + let mut batch = WriteBatch::default(); + let cf_handle = self.get_cf_handle(cf)?; + + let mut deleted_count = 0; + for index in 1..truncate_index { + let key = format_log_key(index); + batch.delete_cf(cf_handle, key.as_bytes()); + deleted_count += 1; + } + + // Atomic commit + self.db.write(batch)?; + + tracing::info!( + cf = cf.as_str(), + truncate_index = truncate_index, + deleted_count = deleted_count, + "Truncated log prefix" + ); + + // Cache remains valid (last_index unchanged) + Ok(()) + } +} +``` + +#### 5. Concurrency Model + +**Single Writer Pattern (Enforced by Raft)**: + +```rust +// In raft crate - RaftStorageAdapter +impl RaftStorageAdapter { + /// Append entries with internal mutex to prevent concurrent appends. 
+    ///
+    /// openraft guarantees sequential calls, but we add defensive mutex.
+    pub async fn append_entries(&self, entries: Vec<VersionedLogEntry>) -> Result<()> {
+        // Acquire lock to ensure sequential appends
+        let _guard = self.append_lock.lock().await;
+
+        for entry in entries {
+            let entry_bytes = bincode::serialize(&entry)?;
+            self.storage.append_log_entry(self.log_cf(), entry.index, &entry_bytes)?;
+        }
+
+        Ok(())
+    }
+}
+```
+
+**Why Single Writer is Sufficient**:
+1. **openraft** guarantees: Only leader appends to its own log
+2. Followers receive entries via RPC and append sequentially
+3. No concurrent writers to same log from different threads
+4. Cache is thread-safe (RwLock) but appends are serialized by design
+
+**Performance Impact**: None - Raft's sequential log guarantee means no parallelism opportunity anyway.
+
+#### 6. Error Recovery Strategies
+
+**Error**: `InvalidLogIndex { expected: 6, got: 7 }`
+
+**Cause**: Gap in log (entry 6 missing)
+
+**Recovery**:
+```rust
+// In raft crate - when append fails
+match storage.append_log_entry(cf, index, bytes) {
+    Err(StorageError::InvalidLogIndex { expected, got, ..
}) => { + tracing::error!( + expected = expected, + got = got, + "Log index validation failed - initiating log sync" + ); + + // Request missing entries from leader + self.request_log_sync_from_leader(expected, got).await?; + + // Retry append after sync + storage.append_log_entry(cf, index, bytes)?; + } + Err(e) => return Err(e), + Ok(()) => {} +} +``` + +**Error**: `InvalidLogIndex { expected: 11, got: 10 }` (Duplicate) + +**Cause**: Entry 10 already exists (duplicate append) + +**Recovery**: +```rust +// This indicates a logic bug in raft layer +// Duplicates should not happen under normal operation + +tracing::error!( + expected = expected, + got = got, + "Duplicate log index detected - this indicates a bug" +); + +// Option 1: Overwrite (dangerous, data loss risk) +// storage.put(cf, format_log_key(got), bytes)?; // NOT RECOMMENDED + +// Option 2: Ignore (safe, assume idempotent retry) +tracing::warn!("Ignoring duplicate append (assuming idempotent retry)"); +return Ok(()); + +// Option 3: Panic (fail-fast for debugging) +panic!("Duplicate log index {} - raft layer bug", got); +``` + +#### 7. Performance Optimizations + +**Optimization 1: In-Memory Cache** +- Cached `last_log_index` per CF avoids expensive RocksDB query on every append +- Cache hit: O(1) HashMap lookup +- Cache miss: O(log n) RocksDB reverse iterator (only on startup or after cache invalidation) + +**Optimization 2: Batch Validation** + +```rust +/// Append multiple entries with single validation check. 
+pub fn append_log_entries_batch(
+    &self,
+    cf: ColumnFamily,
+    entries: Vec<(u64, Vec<u8>)>, // (index, bytes) pairs
+) -> Result<()> {
+    if entries.is_empty() {
+        return Ok(());
+    }
+
+    // Validate all indices are sequential
+    let current_last = self.get_cached_last_log_index(cf)?;
+    let expected_start = current_last.map(|i| i + 1).unwrap_or(1);
+
+    for (i, (index, _)) in entries.iter().enumerate() {
+        let expected = expected_start + i as u64;
+        if *index != expected {
+            return Err(StorageError::InvalidLogIndex {
+                cf: cf.as_str().to_string(),
+                expected,
+                got: *index,
+                reason: format!("Batch validation failed at position {}", i),
+            });
+        }
+    }
+
+    // All indices valid - batch write
+    let mut batch = WriteBatch::default();
+    let cf_handle = self.get_cf_handle(cf)?;
+
+    for (index, bytes) in &entries {
+        let key = format_log_key(*index);
+        batch.put_cf(cf_handle, key.as_bytes(), bytes);
+    }
+
+    self.db.write(batch)?;
+
+    // Update cache to last entry
+    let last_index = entries.last().unwrap().0;
+    self.update_cached_last_log_index(cf, last_index)?;
+
+    tracing::debug!(
+        cf = cf.as_str(),
+        count = entries.len(),
+        start_index = entries[0].0,
+        end_index = last_index,
+        "Appended batch of log entries"
+    );
+
+    Ok(())
+}
+```
+
+**Optimization 3: Avoid Fsync on Log Appends**
+
+Log entries use async WAL (no fsync), only hard state requires fsync:
+
+```rust
+impl ColumnFamily {
+    /// Returns true if writes to this CF require immediate fsync.
+    pub fn requires_fsync(&self) -> bool {
+        matches!(self,
+            ColumnFamily::SystemRaftState | ColumnFamily::DataRaftState
+        )
+    }
+}
+
+// In put() implementation:
+let write_opts = if cf.requires_fsync() {
+    let mut opts = WriteOptions::default();
+    opts.set_sync(true); // Blocks until fsync
+    opts
+} else {
+    WriteOptions::default() // Async WAL
+};
+```
+
+#### 8. 
Testing Strategy + +**Unit Test - Sequential Index Validation**: + +```rust +#[test] +fn test_append_requires_sequential_indices() { + let storage = Storage::new(test_options()).unwrap(); + let cf = ColumnFamily::DataRaftLog; + + // First entry must be index 1 + storage.append_log_entry(cf, 1, b"entry1").unwrap(); + + // Second entry must be index 2 + storage.append_log_entry(cf, 2, b"entry2").unwrap(); + + // Gap: trying to append index 4 (missing index 3) + let result = storage.append_log_entry(cf, 4, b"entry4"); + assert!(matches!(result, Err(StorageError::InvalidLogIndex { expected: 3, got: 4, .. }))); + + // Fix gap by appending index 3 + storage.append_log_entry(cf, 3, b"entry3").unwrap(); + + // Now index 4 works + storage.append_log_entry(cf, 4, b"entry4").unwrap(); +} + +#[test] +fn test_append_rejects_duplicates() { + let storage = Storage::new(test_options()).unwrap(); + let cf = ColumnFamily::DataRaftLog; + + storage.append_log_entry(cf, 1, b"entry1").unwrap(); + + // Duplicate index 1 + let result = storage.append_log_entry(cf, 1, b"entry1_dup"); + assert!(matches!(result, Err(StorageError::InvalidLogIndex { expected: 2, got: 1, .. }))); +} + +#[test] +fn test_first_entry_must_be_index_one() { + let storage = Storage::new(test_options()).unwrap(); + let cf = ColumnFamily::DataRaftLog; + + // Index 0 not allowed + let result = storage.append_log_entry(cf, 0, b"entry0"); + assert!(matches!(result, Err(StorageError::InvalidLogIndex { expected: 1, got: 0, .. }))); + + // Index 2 not allowed as first entry + let result = storage.append_log_entry(cf, 2, b"entry2"); + assert!(matches!(result, Err(StorageError::InvalidLogIndex { expected: 1, got: 2, .. 
}))); + + // Index 1 is correct + storage.append_log_entry(cf, 1, b"entry1").unwrap(); +} +``` + +**Unit Test - Cache Behavior**: + +```rust +#[test] +fn test_cache_updated_after_append() { + let storage = Storage::new(test_options()).unwrap(); + let cf = ColumnFamily::DataRaftLog; + + // Cache should be None initially (empty log) + assert_eq!(storage.get_cached_last_log_index(cf).unwrap(), None); + + // Append updates cache + storage.append_log_entry(cf, 1, b"entry1").unwrap(); + assert_eq!(storage.get_cached_last_log_index(cf).unwrap(), Some(1)); + + storage.append_log_entry(cf, 2, b"entry2").unwrap(); + assert_eq!(storage.get_cached_last_log_index(cf).unwrap(), Some(2)); +} + +#[test] +fn test_cache_survives_truncation() { + let storage = Storage::new(test_options()).unwrap(); + let cf = ColumnFamily::DataRaftLog; + + // Append 10 entries + for i in 1..=10 { + storage.append_log_entry(cf, i, format!("entry{}", i).as_bytes()).unwrap(); + } + + assert_eq!(storage.get_cached_last_log_index(cf).unwrap(), Some(10)); + + // Truncate first 5 entries (log now has 6-10) + storage.truncate_log_before(cf, 6).unwrap(); + + // Cache should still show last_index = 10 + assert_eq!(storage.get_cached_last_log_index(cf).unwrap(), Some(10)); + + // Next append continues from 10 + storage.append_log_entry(cf, 11, b"entry11").unwrap(); +} + +#[test] +fn test_cache_invalidated_when_entire_log_truncated() { + let storage = Storage::new(test_options()).unwrap(); + let cf = ColumnFamily::DataRaftLog; + + // Append 5 entries + for i in 1..=5 { + storage.append_log_entry(cf, i, format!("entry{}", i).as_bytes()).unwrap(); + } + + // Truncate past end (deletes entire log) + storage.truncate_log_before(cf, 100).unwrap(); + + // Cache should be invalidated (None) + assert_eq!(storage.get_cached_last_log_index(cf).unwrap(), None); + + // Can start fresh from index 1 + storage.append_log_entry(cf, 1, b"new_entry1").unwrap(); +} +``` + +**Property Test - Append Sequence Invariant**: + +```rust 
+use proptest::prelude::*; + +proptest! { + #[test] + fn test_sequential_appends_always_succeed(count in 1usize..100) { + let storage = Storage::new(test_options()).unwrap(); + let cf = ColumnFamily::DataRaftLog; + + // Sequential appends starting from 1 should always succeed + for i in 1..=count { + let data = format!("entry{}", i); + storage.append_log_entry(cf, i as u64, data.as_bytes()).unwrap(); + } + + // Verify last index + let last = storage.get_last_log_index(cf).unwrap(); + assert_eq!(last, Some(count as u64)); + } + + #[test] + fn test_random_indices_fail_validation(indices in prop::collection::vec(1u64..1000, 10..50)) { + let storage = Storage::new(test_options()).unwrap(); + let cf = ColumnFamily::DataRaftLog; + + // Random indices should mostly fail (unless accidentally sequential) + let mut last_success = 0u64; + + for index in indices { + let expected = last_success + 1; + let result = storage.append_log_entry(cf, index, b"data"); + + if index == expected { + // Correct index - should succeed + assert!(result.is_ok()); + last_success = index; + } else { + // Wrong index - should fail + assert!(result.is_err()); + } + } + } +} +``` + +**Integration Test - Concurrent Safety**: + +```rust +#[tokio::test] +async fn test_concurrent_appends_with_mutex() { + let storage = Arc::new(Storage::new(test_options()).unwrap()); + let cf = ColumnFamily::DataRaftLog; + + // Mutex to serialize appends (mimics raft layer behavior) + let append_lock = Arc::new(tokio::sync::Mutex::new(())); + + // Spawn 10 tasks trying to append concurrently + let handles: Vec<_> = (1..=10) + .map(|i| { + let storage = storage.clone(); + let lock = append_lock.clone(); + + tokio::spawn(async move { + let _guard = lock.lock().await; // Serialize + storage.append_log_entry(cf, i, format!("entry{}", i).as_bytes()).unwrap(); + }) + }) + .collect(); + + // Wait for all tasks + for handle in handles { + handle.await.unwrap(); + } + + // Verify all 10 entries appended successfully + 
assert_eq!(storage.get_last_log_index(cf).unwrap(), Some(10));
+}
+```
+
+---
+
+## Summary of Changes Required
+
+### File: `crates/storage/src/lib.rs`
+
+1. **Remove all domain type references** from doc comments and API signatures
+2. **Add index cache field** to `Storage` struct:
+   ```rust
+   last_log_index_cache: Arc<RwLock<HashMap<ColumnFamily, u64>>>
+   ```
+3. **Implement full validation logic** in `append_log_entry()`:
+   - Cache-first lookup for last_index
+   - Sequential index validation
+   - Detailed error messages
+4. **Add helper methods**:
+   - `get_cached_last_log_index()`
+   - `get_last_log_index_from_db()`
+   - `update_cached_last_log_index()`
+   - `warm_up_index_cache()`
+   - `invalidate_index_cache()`
+5. **Update truncate_log_before()** to handle cache invalidation
+6. **Add batch append optimization** (optional): `append_log_entries_batch()`
+
+### File: `crates/storage/src/column_family.rs`
+
+1. **Add helper method**:
+   ```rust
+   pub fn is_log_cf(&self) -> bool {
+       matches!(self, ColumnFamily::SystemRaftLog | ColumnFamily::DataRaftLog)
+   }
+   ```
+
+### File: `crates/storage/src/error.rs`
+
+1. **Enhance InvalidLogIndex error**:
+   ```rust
+   #[error("Invalid log index in CF {cf}: expected {expected}, got {got} ({reason})")]
+   InvalidLogIndex {
+       cf: String,
+       expected: u64,
+       got: u64,
+       reason: String,
+   }
+   ```
+2. **Add InvalidColumnFamily error**:
+   ```rust
+   #[error("Invalid column family for operation: {cf} ({reason})")]
+   InvalidColumnFamily {
+       cf: String,
+       reason: String,
+   }
+   ```
+
+### File: `crates/raft/src/storage_adapter.rs` (NEW)
+
+1. **Create adapter struct** that bridges openraft and storage crate
+2. **Implement serialization/deserialization** in adapter methods
+3. **Add append mutex** for defensive concurrency control
+4. **Implement error mapping** from `StorageError` to `RaftError`
+
+### File: `crates/storage/tests/index_validation_test.rs` (NEW)
+
+1. Add all unit tests from section 8
+
+### File: `crates/storage/tests/property_tests.rs`
+
+1. 
Add property tests for index validation
+
+### File: `docs/specs/rocksdb/design.md`
+
+**Lines to update**:
+
+1. **Lines 270-308** (Column Family Setup): Remove type references, replace with "serialized bytes"
+   ```markdown
+   - Value: Serialized log entry bytes (format determined by caller)
+   ```
+
+2. **Lines 440-453** (Raft Integration Flow): Update to show serialization happening in raft crate:
+   ```markdown
+   3. RaftStorage serializes: entries -> Vec<Vec<u8>> using bincode
+   4. RaftStorage calls: storage.append_log_entry(cf, index, &bytes)
+   5. Storage validates: Sequential index (no gaps, no duplicates)
+   6. Storage calls: db.put_cf(cf_handle, b"log:{index}", bytes)
+   ```
+
+3. **Lines 596-602** (Integration section): Add new subsection:
+   ```markdown
+   ### Serialization Boundary
+
+   **Raft Crate Responsibilities**:
+   - Define domain types (VersionedLogEntry, RaftHardState, etc.)
+   - Serialize before calling storage
+   - Deserialize after retrieving from storage
+   - Handle version mismatches
+
+   **Storage Crate Responsibilities**:
+   - Accept only `&[u8]` and `Vec<u8>`
+   - No knowledge of data structures
+   - Pure persistence operations
+   - Index validation (structural, not semantic)
+   ```
+
+4. **Lines 67-72** (append_log_entry signature): Update to show validation logic:
+   ```rust
+   /// Append log entry with automatic index validation.
+   ///
+   /// Validates that index is sequential (first = 1, subsequent = last + 1).
+   /// Returns InvalidLogIndex error if gap or duplicate detected.
+   /// Caller must serialize entry to bytes before calling.
+   pub fn append_log_entry(&self, cf: ColumnFamily, index: u64, entry_bytes: &[u8]) -> Result<()>;
+   ```
+
+### File: `docs/specs/rocksdb/spec.md`
+
+**Lines to update**:
+
+1. 
**Line 36** (Business Rules): Add validation rule:
+   ```markdown
+   - Log indices MUST be sequential starting at 1 (no gaps, no duplicates)
+   - Storage layer validates index sequence, rejects invalid appends
+   - Raft layer responsible for serialization/deserialization
+   ```
+
+2. **Lines 78-79** (Integration Points): Clarify:
+   ```markdown
+   - raft crate - serializes Raft types, validates Raft semantics, calls storage with bytes
+   - storage crate - validates sequential indices, provides byte-level persistence
+   ```
+
+---
+
+## Implementation Checklist
+
+For the coder-agent to implement:
+
+- [ ] Update `Storage` struct with index cache field
+- [ ] Implement cache initialization in `Storage::new()`
+- [ ] Implement full `append_log_entry()` with validation
+- [ ] Add cache helper methods (get, update, warm_up, invalidate)
+- [ ] Update `truncate_log_before()` for cache handling
+- [ ] Add `is_log_cf()` to `ColumnFamily`
+- [ ] Enhance `StorageError` variants
+- [ ] Remove all domain type references from storage API docs
+- [ ] Create `RaftStorageAdapter` in raft crate
+- [ ] Write unit tests for index validation
+- [ ] Write property tests for sequential invariant
+- [ ] Write integration tests for concurrent appends
+- [ ] Update design.md with boundary clarifications
+- [ ] Update spec.md with validation rules
+
+---
+
+## Validation Criteria
+
+These solutions are correct if:
+
+1. **Boundary Test**: `cargo build -p seshat-storage` succeeds WITHOUT depending on `seshat-common`
+2. **API Test**: All storage public methods accept only `&[u8]` or `Vec<u8>` (no domain types)
+3. **Validation Test**: Sequential appends succeed, gaps/duplicates fail with detailed errors
+4. **Cache Test**: `get_cached_last_log_index()` returns O(1) after warm-up
+5. **Truncate Test**: Cache correctly handles partial and full log truncation
+6. **Concurrency Test**: Mutex-protected appends pass with 100% success rate
+7. 
**Integration Test**: Raft adapter successfully serializes/deserializes all Raft types + +--- + +**End of Document** diff --git a/docs/specs/rocksdb/context.json b/docs/specs/rocksdb/context.json new file mode 100644 index 0000000..e09e20c --- /dev/null +++ b/docs/specs/rocksdb/context.json @@ -0,0 +1,56 @@ +{ + "product_vision": "Phase 1 MVP: Persistent storage layer for Seshat distributed key-value store. Must support >5,000 ops/sec with <10ms p99 latency and pass 11 chaos tests. Storage enables cluster recovery after restarts and maintains consistency across nodes.", + "existing_features": [ + "RESP protocol parser/encoder (100% complete - 487 tests)", + "Raft consensus foundation (in design phase)" + ], + "storage_responsibilities": "Abstract RocksDB storage with column families, atomic batch writes, snapshot creation, iterator support, storage metrics, enforce data size limits. No understanding of Raft semantics - just stores bytes.", + "data_types_to_persist": [ + "VersionedLogEntry - Raft log entries with schema versioning", + "RaftHardState - Current term, vote, commit index", + "ClusterMembership - Node registry with addresses and states", + "ShardMap - Shard assignments and replica placement", + "StoredValue - User key-value data with metadata and optional TTL", + "SnapshotMetadata - Snapshot tracking for log compaction" + ], + "column_family_design": { + "phase_1": [ + "system_raft_log: System group Raft log entries (~10MB compacted)", + "system_raft_state: System group hard state (<1KB, fsync required)", + "system_data: Cluster metadata (~100KB)", + "data_raft_log: Data shard Raft log entries (~100MB compacted)", + "data_raft_state: Data shard hard state (<1KB, fsync required)", + "data_kv: User key-value data (unbounded)" + ], + "compaction_strategy": "Snapshot every 10,000 entries or 100MB log size using RocksDB checkpoints", + "configuration": "Lz4 compression, 64MB write buffer, 64MB SST files, prefix bloom filters for data_kv CF" + }, + 
"testing_requirements": { + "tdd_workflow": "Test → Code → Refactor pattern required", + "test_types": [ + "Unit tests: Column family operations", + "Integration tests: Snapshot consistency", + "Performance tests: Throughput and latency benchmarks", + "Property tests: Invariant checking with proptest" + ], + "chaos_tests": "Must support 11 chaos scenarios including node failures, partitions, and storage saturation" + }, + "dependencies": { + "depends_on": ["common crate for shared types and error handling"], + "used_by": ["raft crate (implements Storage trait)", "kv crate (indirectly via raft)"] + }, + "architecture_layer": "Storage Layer - bottom of stack, provides persistent state to Raft Layer above it", + "performance_targets": { + "throughput": ">5,000 ops/sec per node", + "latency": { + "GET": "<5ms p99", + "SET": "<10ms p99 (includes Raft replication)" + }, + "key_operations": "<1ms p99 latency for storage-only ops" + }, + "resource_limits": { + "max_key_size_bytes": 256, + "max_value_size_bytes": 65536, + "max_memory_per_raft_log_mb": 512 + } +} diff --git a/docs/specs/rocksdb/design.json b/docs/specs/rocksdb/design.json new file mode 100644 index 0000000..35ed8bb --- /dev/null +++ b/docs/specs/rocksdb/design.json @@ -0,0 +1,754 @@ +{ + "requirements": { + "entities": [ + "Storage - Main struct managing RocksDB instance and column families", + "StorageOptions - Configuration for RocksDB initialization (paths, options per CF)", + "ColumnFamily - Type-safe enum for 6 column families", + "WriteBatch - Atomic multi-CF write operations", + "StorageIterator - Iterator for range queries within column families", + "StorageMetrics - Tracking db_size_bytes, num_keys, snapshot_duration", + "StorageError - Rich error context with thiserror" + ], + "data_persistence": { + "column_families": [ + "system_raft_log - System group Raft log entries (key: log:{index}, value: VersionedLogEntry)", + "system_raft_state - System group hard state (key: state, value: RaftHardState, 
fsync required)", + "system_data - Cluster metadata (keys: membership/shardmap, values: ClusterMembership/ShardMap)", + "data_raft_log - Data shard Raft log entries (key: log:{index}, value: VersionedLogEntry)", + "data_raft_state - Data shard hard state (key: state, value: RaftHardState, fsync required)", + "data_kv - User key-value data (key: raw bytes, value: StoredValue)" + ], + "serialization": "bincode - efficient binary serialization for all persisted structures", + "snapshot_strategy": "RocksDB checkpoint using hard links (atomic, space-efficient, no data copying)" + }, + "api_needed": { + "public_interface": "Storage struct with public methods (no trait needed for Phase 1)", + "operations": [ + "get(cf, key) -> Result>>", + "put(cf, key, value) -> Result<()>", + "delete(cf, key) -> Result<()>", + "exists(cf, key) -> Result", + "batch_write(batch) -> Result<()>", + "append_log_entry(cf, index, entry) -> Result<()>", + "get_log_range(cf, start_index, end_index) -> Result>", + "truncate_log_before(cf, index) -> Result<()>", + "create_snapshot(path) -> Result", + "restore_snapshot(path) -> Result<()>", + "iterator(cf, range) -> Result", + "metrics() -> StorageMetrics" + ] + }, + "components": { + "main_struct": "Storage - Owns RocksDB DB instance, manages column family handles, provides thread-safe access via Arc", + "helper_types": [ + "WriteBatch - Builder pattern for atomic multi-CF writes", + "StorageIterator - Wrapper around RocksDB iterator with key/value decoding", + "ColumnFamilyHandle - Internal wrapper for RocksDB CF handles", + "CheckpointManager - Helper for snapshot creation and restoration" + ] + }, + "business_rules": [ + "All write batches across column families MUST be atomic (use RocksDB WriteBatch)", + "Raft state CFs (system_raft_state, data_raft_state) MUST fsync before returning from put()", + "All persisted structures MUST include version field for schema evolution (checked on deserialization)", + "Sequential ordering MUST be preserved 
for Raft log entries (no gaps in indices)", + "Storage layer MUST NOT understand Raft semantics - only stores bytes as directed", + "Version mismatches MUST cause fail-fast errors (refuse to start with incompatible data)", + "Concurrent reads and writes MUST be thread-safe (RocksDB guarantees this)", + "Snapshot creation MUST be atomic (checkpoint or nothing)", + "Iterator results MUST be consistent within snapshot isolation" + ], + "domains": [ + "Storage layer only - lowest level in architecture", + "No Raft logic (handled by raft crate above)", + "No protocol parsing (handled by protocol-resp crate)", + "No business logic (handled by kv service layer)", + "Pure persistence abstraction over RocksDB" + ] + }, + "technical_needs": { + "core_types": { + "Storage": { + "description": "Main storage abstraction wrapping RocksDB with column family management", + "fields": [ + "db: Arc - Shared RocksDB instance for multi-threaded access", + "cf_handles: HashMap> - Column family handle cache", + "metrics: Arc> - Thread-safe metrics tracking", + "config: StorageOptions - Immutable configuration" + ], + "methods": [ + "new(options: StorageOptions) -> Result - Open or create RocksDB with all CFs", + "get(cf: ColumnFamily, key: &[u8]) -> Result>> - Point read from CF", + "put(cf: ColumnFamily, key: &[u8], value: &[u8]) -> Result<()> - Write to CF (fsync for state CFs)", + "delete(cf: ColumnFamily, key: &[u8]) -> Result<()> - Delete from CF", + "exists(cf: ColumnFamily, key: &[u8]) -> Result - Check key existence", + "batch_write(batch: WriteBatch) -> Result<()> - Atomic multi-CF write", + "append_log_entry(cf: ColumnFamily, index: u64, entry: &[u8]) -> Result<()> - Optimized log append", + "get_log_range(cf: ColumnFamily, start: u64, end: u64) -> Result>> - Scan log range", + "truncate_log_before(cf: ColumnFamily, index: u64) -> Result<()> - Delete log entries < index", + "create_snapshot(path: &Path) -> Result - Create RocksDB checkpoint", + "restore_snapshot(path: &Path) -> 
Result<()> - Restore from checkpoint", + "iterator(cf: ColumnFamily, mode: IteratorMode) -> Result - Create CF iterator", + "metrics(&self) -> StorageMetrics - Get current metrics snapshot", + "sync(&self) -> Result<()> - Force fsync of WAL", + "close(self) -> Result<()> - Graceful shutdown" + ], + "thread_safety": "Arc enables safe multi-threaded access; RocksDB handles internal synchronization", + "lifecycle": "Created on node startup, closed on shutdown; survives across Raft leader changes" + }, + "StorageOptions": { + "description": "Configuration for RocksDB initialization", + "fields": [ + "data_dir: PathBuf - Base directory for RocksDB files", + "create_if_missing: bool - Create DB if doesn't exist (true for bootstrap, false for join)", + "compression: CompressionType - Compression algorithm (Lz4 for Phase 1)", + "write_buffer_size_mb: usize - Per-CF write buffer size (64MB default)", + "max_write_buffer_number: usize - Number of memtables (3 default)", + "target_file_size_mb: usize - SST file size target (64MB default)", + "max_open_files: i32 - OS file handle limit (-1 for unlimited)", + "enable_statistics: bool - RocksDB internal stats (true for Phase 4)", + "cf_options: HashMap - Per-CF tuning" + ], + "methods": [ + "default() -> Self - Sensible defaults for Phase 1", + "with_data_dir(path: PathBuf) -> Self - Builder pattern", + "validate(&self) -> Result<()> - Pre-startup validation" + ] + }, + "CFOptions": { + "description": "Per-column-family RocksDB options", + "fields": [ + "compaction_style: DBCompactionStyle - Level vs Universal (Level for Raft logs)", + "disable_auto_compactions: bool - Manual compaction control (false default)", + "level0_file_num_compaction_trigger: i32 - Compaction threshold (4 default)", + "write_buffer_size: usize - Override global write buffer", + "prefix_extractor: Option - For prefix bloom filters (data_kv only)" + ] + }, + "ColumnFamily": { + "description": "Type-safe enum for all column families", + "variants": [ + 
"SystemRaftLog - System Raft group log entries", + "SystemRaftState - System Raft group hard state", + "SystemData - Cluster metadata", + "DataRaftLog - Data shard Raft log entries", + "DataRaftState - Data shard hard state", + "DataKv - User key-value data" + ], + "methods": [ + "as_str(&self) -> &'static str - RocksDB CF name", + "all() -> [ColumnFamily; 6] - All CFs for initialization", + "requires_fsync(&self) -> bool - True for *_raft_state CFs", + "default_options(&self) -> CFOptions - Optimized options per CF type" + ], + "design_rationale": "Enum prevents typos and enables compile-time CF validation" + }, + "WriteBatch": { + "description": "Builder for atomic multi-CF write operations", + "fields": [ + "inner: rocksdb::WriteBatch - Underlying RocksDB batch", + "cfs: Vec - Track CFs being written (for fsync decision)" + ], + "methods": [ + "new() -> Self - Create empty batch", + "put(cf: ColumnFamily, key: &[u8], value: &[u8]) -> &mut Self - Add put operation", + "delete(cf: ColumnFamily, key: &[u8]) -> &mut Self - Add delete operation", + "clear(&mut self) - Reset batch", + "is_empty(&self) -> bool - Check if batch has operations", + "requires_fsync(&self) -> bool - True if any CF requires fsync" + ], + "atomicity_guarantee": "All operations succeed or all fail; no partial writes visible" + }, + "StorageIterator": { + "description": "Wrapper around RocksDB iterator with consistent semantics", + "fields": [ + "inner: rocksdb::DBIterator - Underlying RocksDB iterator", + "cf: ColumnFamily - Column family being iterated" + ], + "methods": [ + "seek(&mut self, key: &[u8]) - Position iterator at key or next greater", + "seek_to_first(&mut self) - Position at start of CF", + "seek_to_last(&mut self) - Position at end of CF", + "next(&mut self) -> Option, Box<[u8]>)>> - Advance forward", + "prev(&mut self) -> Option, Box<[u8]>)>> - Advance backward", + "valid(&self) -> bool - Check if iterator positioned at valid entry" + ], + "snapshot_isolation": "Iterator sees 
consistent snapshot of data at creation time" + }, + "IteratorMode": { + "description": "Iterator positioning mode", + "variants": [ + "Start - From first key in CF", + "End - From last key in CF", + "From(Vec, Direction) - From specific key in direction" + ] + }, + "StorageMetrics": { + "description": "Runtime metrics for monitoring", + "fields": [ + "db_size_bytes: u64 - Total disk usage across all CFs", + "num_keys: HashMap - Key count per CF", + "last_snapshot_duration_ms: u64 - Most recent snapshot creation time", + "write_ops_total: u64 - Cumulative write operations", + "read_ops_total: u64 - Cumulative read operations", + "bytes_written: u64 - Total bytes written", + "bytes_read: u64 - Total bytes read" + ], + "methods": [ + "new() -> Self - Initialize with zeros", + "update_from_db(&mut self, db: &DB) - Refresh from RocksDB internal stats" + ], + "update_frequency": "On-demand via metrics() call; no background thread needed in Phase 1" + }, + "StorageError": { + "description": "Rich error context using thiserror", + "variants": [ + "RocksDb(rocksdb::Error) - Underlying RocksDB errors", + "Io(std::io::Error) - File system errors", + "Serialization(bincode::Error) - Serialization failures", + "ColumnFamilyNotFound(String) - Invalid CF name", + "InvalidLogIndex { expected: u64, got: u64 } - Log gap detected", + "SnapshotFailed { path: PathBuf, reason: String } - Checkpoint creation failed", + "CorruptedData { cf: String, key: Vec, reason: String } - Data validation failed", + "VersionMismatch { expected: u8, got: u8 } - Schema version incompatibility" + ], + "propagation_strategy": "Use Result throughout; convert from underlying errors with #[from]", + "context_preservation": "Each variant includes enough context to debug without source code access" + } + }, + "persistence_design": { + "rocksdb_configuration": { + "write_ahead_log": { + "enabled": true, + "sync_mode": "Per-write sync for *_raft_state CFs, async for others", + "purpose": "Durability guarantee 
for Raft hard state updates" + }, + "memtable_settings": { + "write_buffer_size": "64MB per CF", + "max_write_buffer_number": 3, + "rationale": "Balance memory usage vs write amplification; 64MB * 3 * 6 CFs = ~1.1GB max memory" + }, + "compaction_settings": { + "style": "Level compaction for all CFs (better space efficiency than universal)", + "level0_trigger": "4 files (triggers L0->L1 compaction)", + "target_file_size": "64MB SST files", + "max_bytes_for_level_base": "256MB (L1 size)", + "rationale": "Standard tiered compaction; optimize for read performance over write throughput" + }, + "compression": { + "type": "Lz4 (fast decompression, good ratio)", + "per_level": "None for L0/L1 (hot data), Lz4 for L2+", + "rationale": "Balance CPU usage vs disk space; Raft logs compress well" + }, + "bloom_filters": { + "enabled": "Only for data_kv CF", + "bits_per_key": 10, + "purpose": "Reduce disk reads for GET operations; not needed for sequential Raft log access" + }, + "block_cache": { + "shared": true, + "size": "256MB shared across all CFs", + "purpose": "Cache frequently accessed blocks; let OS page cache handle rest" + } + }, + "column_family_setup": { + "initialization": "Open DB with all 6 CFs; fail-fast if any CF missing (prevents data loss)", + "creation": "Create all CFs atomically on first startup (bootstrap mode)", + "options_per_cf": { + "system_raft_log": { + "optimize_for": "Sequential writes and range scans", + "prefix_extractor": "None (access is by exact log:{index} lookup or range scan)", + "compaction": "Level style, aggressive (keep log size small)" + }, + "system_raft_state": { + "optimize_for": "Single-key updates with fsync", + "size": "Always <1KB (single hard state entry)", + "compaction": "Disabled (too small to matter)" + }, + "system_data": { + "optimize_for": "Small number of keys (<10), infrequent updates", + "compaction": "Disabled (bounded size ~100KB)" + }, + "data_raft_log": { + "optimize_for": "High-throughput sequential writes and 
range scans", + "prefix_extractor": "None", + "compaction": "Level style, moderate (balance log size vs compaction cost)" + }, + "data_raft_state": { + "optimize_for": "Single-key updates with fsync", + "size": "Always <1KB", + "compaction": "Disabled" + }, + "data_kv": { + "optimize_for": "Random reads and writes, high key count", + "prefix_extractor": "4-byte hash prefix for bloom filters", + "bloom_filter": "Enabled (10 bits per key)", + "compaction": "Level style, standard settings" + } + } + }, + "serialization_strategy": { + "format": "bincode for all Rust structs", + "rationale": "Faster than JSON, more compact than protobuf for Rust-to-Rust, no schema required", + "version_handling": "All structs have `version: u8` as first field; checked on deserialization", + "migration_path": "If version < CURRENT_VERSION, run migration; if version > CURRENT_VERSION, refuse to start", + "key_formats": { + "system_raft_log": "String keys: 'log:{index}' (e.g., 'log:142')", + "system_raft_state": "String key: 'state' (single entry)", + "system_data": "String keys: 'membership', 'shardmap'", + "data_raft_log": "String keys: 'log:{index}'", + "data_raft_state": "String key: 'state'", + "data_kv": "Raw byte keys (user-provided, no encoding)" + }, + "value_formats": { + "raft_log": "bincode::serialize(&VersionedLogEntry)", + "raft_state": "bincode::serialize(&RaftHardState)", + "system_data": "bincode::serialize(&ClusterMembership | &ShardMap)", + "data_kv": "bincode::serialize(&StoredValue)" + } + }, + "durability_strategy": { + "fsync_policy": { + "raft_state_cfs": "Synchronous fsync before returning from put() - critical for Raft safety", + "other_cfs": "Async WAL writes - rely on WAL for durability, fsync happens in background", + "batch_writes": "If batch touches any raft_state CF, fsync entire batch" + }, + "wal_configuration": { + "enabled": true, + "sync_mode": "Per-write for state CFs, batch for others", + "wal_dir": "Same as data_dir (simplify Phase 1; can separate in 
Phase 4 for performance)", + "wal_size_limit": "64MB (triggers rotation)" + }, + "crash_recovery": "RocksDB handles replay from WAL automatically; our code just opens DB and validates schema versions", + "checkpoint_atomicity": "RocksDB checkpoint creates hard links atomically; either full checkpoint exists or none" + } + }, + "api_surface": { + "crud_operations": [ + { + "method": "get(cf: ColumnFamily, key: &[u8]) -> Result>>", + "purpose": "Point read from column family", + "behavior": "Returns None if key doesn't exist; Some(value) if found", + "error_cases": "RocksDb error, Io error", + "performance": "O(log n) with bloom filter optimization for data_kv" + }, + { + "method": "put(cf: ColumnFamily, key: &[u8], value: &[u8]) -> Result<()>", + "purpose": "Write key-value pair to column family", + "behavior": "Overwrites if key exists; creates if new; fsync if CF requires it", + "error_cases": "RocksDb error, Io error", + "performance": "O(1) amortized; may trigger compaction" + }, + { + "method": "delete(cf: ColumnFamily, key: &[u8]) -> Result<()>", + "purpose": "Remove key from column family", + "behavior": "No-op if key doesn't exist; fsync if CF requires it", + "error_cases": "RocksDb error, Io error", + "performance": "O(1) amortized; tombstone written to LSM tree" + }, + { + "method": "exists(cf: ColumnFamily, key: &[u8]) -> Result", + "purpose": "Check key existence without reading value", + "behavior": "Returns true if key exists, false otherwise", + "error_cases": "RocksDb error, Io error", + "performance": "O(log n); faster than get() since value not returned" + } + ], + "raft_log_operations": [ + { + "method": "append_log_entry(cf: ColumnFamily, index: u64, entry: &[u8]) -> Result<()>", + "purpose": "Optimized append to Raft log", + "behavior": "Write entry with key 'log:{index}'; validates no gaps in log indices", + "error_cases": "InvalidLogIndex if gap detected, RocksDb error", + "performance": "O(1) amortized; sequential write pattern" + }, + { + 
"method": "get_log_range(cf: ColumnFamily, start: u64, end: u64) -> Result>>", + "purpose": "Batch read of log entries for replication", + "behavior": "Returns entries [start, end) in order; empty vec if none found", + "error_cases": "RocksDb error, Serialization error", + "performance": "O(m) where m = (end - start); sequential scan with iterator" + }, + { + "method": "truncate_log_before(cf: ColumnFamily, index: u64) -> Result<()>", + "purpose": "Delete log entries before index (for compaction)", + "behavior": "Batch delete all entries with log_index < index", + "error_cases": "RocksDb error", + "performance": "O(k) where k = number of entries deleted; uses WriteBatch for atomicity" + }, + { + "method": "get_last_log_index(cf: ColumnFamily) -> Result>", + "purpose": "Find highest log index without scanning entire log", + "behavior": "Seek to end of CF, read last key, parse index", + "error_cases": "RocksDb error, parse error if log corrupted", + "performance": "O(1) with reverse iterator" + } + ], + "snapshot_operations": [ + { + "method": "create_snapshot(path: &Path) -> Result", + "purpose": "Create atomic checkpoint for log compaction", + "behavior": "RocksDB checkpoint to path; return metadata with timestamp, size, last_index", + "error_cases": "SnapshotFailed if checkpoint fails, Io error", + "performance": "O(1) - uses hard links, no data copying; ~10ms for Phase 1 dataset", + "atomicity": "All CFs snapshotted atomically at same point-in-time" + }, + { + "method": "restore_snapshot(path: &Path) -> Result<()>", + "purpose": "Restore DB from checkpoint (for new nodes joining cluster)", + "behavior": "Close current DB, copy checkpoint files to data_dir, reopen DB", + "error_cases": "Io error, RocksDb error, VersionMismatch if snapshot too new", + "performance": "O(n) where n = snapshot size; requires full data copy", + "validation": "Check all CF schema versions before accepting snapshot" + }, + { + "method": "validate_snapshot(path: &Path) -> Result", + 
"purpose": "Pre-validate snapshot before restore (avoid partial restore failures)", + "behavior": "Open snapshot read-only, check versions, extract metadata, close", + "error_cases": "SnapshotFailed, VersionMismatch, CorruptedData", + "performance": "O(1) - only reads metadata, not full data scan" + } + ], + "batch_operations": [ + { + "method": "batch_write(batch: WriteBatch) -> Result<()>", + "purpose": "Atomic multi-CF write operation", + "behavior": "All operations succeed or all fail; fsync if any CF requires it", + "error_cases": "RocksDb error, Io error", + "performance": "O(k) where k = number of operations in batch; amortizes fsync cost", + "atomicity": "ACID atomic - no partial writes visible to readers" + } + ], + "utility_operations": [ + { + "method": "iterator(cf: ColumnFamily, mode: IteratorMode) -> Result", + "purpose": "Create iterator for range queries", + "behavior": "Return iterator positioned per mode; sees snapshot-isolated view", + "error_cases": "RocksDb error", + "performance": "O(1) to create; O(k) to scan k entries", + "snapshot_guarantee": "Iterator sees consistent view even if writes happen during scan" + }, + { + "method": "metrics(&self) -> StorageMetrics", + "purpose": "Get current storage metrics for monitoring", + "behavior": "Return snapshot of metrics; refresh from RocksDB internal stats", + "error_cases": "None - returns default if stats unavailable", + "performance": "O(1) - reads cached values" + }, + { + "method": "sync(&self) -> Result<()>", + "purpose": "Force fsync of WAL (for testing and manual durability)", + "behavior": "Block until all pending writes are durable", + "error_cases": "Io error", + "performance": "O(1) but blocks until fsync completes (~1-5ms on SSD)" + }, + { + "method": "compact_range(cf: ColumnFamily, start: Option<&[u8]>, end: Option<&[u8]>) -> Result<()>", + "purpose": "Manual compaction trigger (for testing and maintenance)", + "behavior": "Force compaction of key range in CF; blocks until complete", 
+ "error_cases": "RocksDb error", + "performance": "O(n) where n = range size; expensive, use sparingly" + }, + { + "method": "close(self) -> Result<()>", + "purpose": "Graceful shutdown of storage", + "behavior": "Flush memtables, close DB, release file handles", + "error_cases": "Io error", + "performance": "O(1) - RocksDB handles internal cleanup" + } + ] + }, + "error_handling": { + "error_types": [ + "RocksDb - Wrapper for rocksdb::Error (disk full, corruption, etc.)", + "Io - File system errors during snapshot operations", + "Serialization - bincode errors when deserializing persisted structs", + "ColumnFamilyNotFound - Attempt to access non-existent CF (programming error)", + "InvalidLogIndex - Log gap detected (Raft safety violation)", + "SnapshotFailed - Checkpoint creation/restoration failed", + "CorruptedData - Schema version mismatch or invalid data format", + "VersionMismatch - Attempted to load newer schema version than supported" + ], + "propagation_strategy": "Use Result for all fallible operations; no panics except for programming errors (e.g., invalid CF enum)", + "context_preservation": "Each error variant includes context (CF name, key, expected vs actual values) for debugging; use thiserror #[error] for formatting", + "recovery_strategy": { + "RocksDb": "Propagate to caller (raft crate decides whether to retry, fail, or panic)", + "Io": "Propagate to caller (likely requires node restart)", + "Serialization": "Fail-fast - indicates data corruption or version mismatch", + "ColumnFamilyNotFound": "Panic - programming error, not runtime error", + "InvalidLogIndex": "Propagate to raft crate - indicates need for snapshot from leader", + "SnapshotFailed": "Propagate to caller - may retry or request from different node", + "CorruptedData": "Fail-fast - cannot continue with corrupted data", + "VersionMismatch": "Fail-fast on startup - requires upgrade" + }, + "logging_strategy": "Use tracing::{error, warn, debug} for internal events; caller logs errors 
returned from API" + }, + "performance_design": { + "latency_optimization": [ + "Target: <1ms p99 for get/put operations", + "Bloom filters on data_kv CF reduce unnecessary disk reads", + "Shared block cache (256MB) reduces read latency for hot keys", + "Write buffer (64MB per CF) batches writes before flushing to disk", + "Fsync only for raft_state CFs - most writes are async", + "Direct access to RocksDB (no additional serialization layers)", + "Use &[u8] throughout API to avoid unnecessary copies" + ], + "batch_strategy": { + "motivation": "Amortize fsync cost across multiple operations", + "implementation": "WriteBatch accumulates puts/deletes, single fsync at commit", + "use_cases": [ + "Raft log replication: batch multiple log entries", + "State machine application: batch user KV writes + Raft state update", + "Log compaction: batch truncation of old log entries" + ], + "size_limits": "No hard limit in Phase 1; consider 10MB batches to avoid memory spikes in Phase 2+", + "atomicity_guarantee": "RocksDB WriteBatch ensures all-or-nothing semantics" + }, + "iterator_implementation": { + "snapshot_isolation": "Iterator captures DB snapshot at creation; sees consistent view", + "direction_support": "Forward and backward iteration via next()/prev()", + "seek_operations": "seek(), seek_to_first(), seek_to_last() for positioning", + "memory_usage": "Iterator holds reference to snapshot; bounded by RocksDB internal buffers", + "cleanup": "Iterator automatically released on drop; snapshot freed", + "use_cases": [ + "Log range scan: iterate log:{start} to log:{end}", + "Full KV scan: iterate entire data_kv CF (for snapshots)", + "Prefix scan: seek to key prefix, iterate while matches" + ] + }, + "snapshot_efficiency": { + "checkpoint_mechanism": "RocksDB checkpoint uses hard links (no data copy)", + "creation_time": "O(1) - typically <10ms for datasets up to 10GB", + "disk_space": "Zero additional space initially; diverges as original DB changes", + "restoration_time": 
"O(n) - must copy all files to data_dir; ~1-2s per GB", + "validation": "Schema version check on restore to detect incompatible snapshots", + "concurrency": "Checkpoint creation doesn't block reads/writes", + "cleanup": "Old checkpoints must be manually deleted (defer to raft crate to manage lifecycle)" + }, + "memory_considerations": { + "write_buffers": "64MB * 3 buffers * 6 CFs = ~1.1GB max memtable memory", + "block_cache": "256MB shared across all CFs", + "iterator_snapshots": "Bounded by number of active iterators * block cache overhead", + "total_estimate": "~1.5GB memory for storage layer in steady state", + "backpressure": "RocksDB stalls writes if memtables full (prevents OOM)" + }, + "disk_io_patterns": { + "raft_log": "Sequential writes, occasional sequential reads (for replication)", + "raft_state": "Random writes with fsync, infrequent reads (on restart)", + "data_kv": "Random reads and writes, high IOPS workload", + "compaction": "Background sequential reads + writes, throttled to avoid impacting foreground", + "wal": "Sequential writes, flushed per-write for state CFs" + } + }, + "integration_points": { + "raft_crate_usage": [ + "RaftStorage struct (in raft crate) will own Storage instance", + "Implements raft::Storage trait by delegating to our Storage methods", + "Handles log entry serialization/deserialization (VersionedLogEntry -> bytes)", + "Maps Raft operations to appropriate column families", + "Enforces Raft invariants (no log gaps, hard state always persisted)" + ], + "common_crate_dependencies": [ + "Import error types from common::errors (if we define base error types there)", + "Use data structure definitions: VersionedLogEntry, RaftHardState, etc.", + "Share StorageMetrics type if used by multiple crates", + "Constants: CURRENT_VERSION, column family names" + ], + "phase_2_extensions": [ + "Add ColumnFamily::ShardRaftLog(shard_id) and ShardRaftState(shard_id) variants", + "Dynamic CF creation for new shards (may require DB restart in 
Phase 2)", + "Per-shard metrics tracking", + "Shard-specific compaction policies" + ] + } + }, + "design": { + "component_architecture": { + "pattern": "Storage Abstraction Layer (NOT Router → Service → Repository - this is persistence only)", + "module_structure": [ + "lib.rs - Public API, Storage struct, re-exports", + "column_family.rs - ColumnFamily enum with CF metadata", + "batch.rs - WriteBatch builder for atomic operations", + "iterator.rs - StorageIterator with snapshot isolation", + "error.rs - StorageError with thiserror", + "metrics.rs - StorageMetrics tracking", + "snapshot.rs - Checkpoint creation/restoration", + "options.rs - StorageOptions and CFOptions configuration" + ], + "component_interaction": { + "Storage": "Central hub - owns Arc, manages CF handles, provides all operations", + "WriteBatch": "Helper - builds atomic multi-CF writes, used by Storage.batch_write()", + "StorageIterator": "Helper - wraps RocksDB iterator with snapshot isolation", + "ColumnFamily": "Type-safe enum - prevents CF name typos at compile time", + "StorageMetrics": "Read-only view - exposed via Storage.metrics()", + "StorageOptions": "Initialization config - consumed by Storage::new()" + }, + "dependencies": { + "internal": "Storage owns all other components (WriteBatch, Iterator created on-demand)", + "external": "RocksDB (via Arc), bincode (serialization), common crate (data types)" + } + }, + "implementation_design": { + "storage_struct": { + "definition": "pub struct Storage { db: Arc, cf_handles: HashMap>, metrics: Arc>, config: StorageOptions }", + "key_methods": [ + "new(options: StorageOptions) -> Result - Open or create RocksDB with all 6 CFs", + "get(&self, cf: ColumnFamily, key: &[u8]) -> Result>>", + "put(&self, cf: ColumnFamily, key: &[u8], value: &[u8]) -> Result<()>", + "batch_write(&self, batch: WriteBatch) -> Result<()>", + "create_snapshot(&self, path: &Path) -> Result" + ], + "concurrency": "Arc enables safe sharing across threads; RocksDB handles 
internal locking", + "lifecycle": "Created once at node startup, shared across Raft groups, closed at shutdown" + }, + "write_batch_impl": { + "definition": "pub struct WriteBatch { inner: rocksdb::WriteBatch, cfs: Vec }", + "builder_pattern": "new() -> put() -> put() -> delete() -> requires_fsync() -> storage.batch_write(batch)", + "atomicity": "RocksDB WriteBatch guarantees all operations commit together or none", + "fsync_decision": "requires_fsync() returns true if any CF in cfs Vec requires fsync" + }, + "iterator_impl": { + "definition": "pub struct StorageIterator<'a> { inner: DBIterator<'a>, cf: ColumnFamily }", + "snapshot_isolation": "Iterator created with DB snapshot - sees consistent view", + "methods": "next(), prev(), seek(), seek_to_first(), seek_to_last(), valid()", + "cleanup": "Snapshot automatically released on Iterator drop" + }, + "error_handling_flow": { + "result_type": "pub type Result = std::result::Result", + "error_conversion": "Use #[from] to auto-convert rocksdb::Error, std::io::Error, bincode::Error", + "context_addition": "Wrap errors with .map_err() to add CF name, key, operation context", + "propagation": "All public methods return Result; caller decides retry/fail/panic" + } + }, + "data_flows": { + "get_operation": [ + "1. Client: storage.get(ColumnFamily::DataKv, b\"foo\")", + "2. Storage: Look up CF handle from cf_handles HashMap", + "3. Storage: db.get_cf(cf_handle, b\"foo\") -> Result>>", + "4. Storage: Update metrics.read_ops_total += 1", + "5. Storage: Return Ok(Some(value)) or Ok(None)" + ], + "put_operation": [ + "1. Client: storage.put(ColumnFamily::DataRaftState, b\"state\", bytes)", + "2. Storage: Check cf.requires_fsync() -> true for raft_state CFs", + "3. Storage: Create WriteOptions with sync=true", + "4. Storage: db.put_cf_opt(cf_handle, b\"state\", bytes, write_opts)", + "5. RocksDB: Append to WAL, fsync WAL to disk (blocks ~1-5ms)", + "6. RocksDB: Insert into memtable", + "7. 
Storage: Update metrics.write_ops_total += 1", + "8. Storage: Return Ok(())" + ], + "batch_write_operation": [ + "1. Client: let mut batch = WriteBatch::new()", + "2. Client: batch.put(ColumnFamily::DataKv, b\"key1\", b\"val1\")", + "3. Client: batch.put(ColumnFamily::DataRaftState, b\"state\", bytes)", + "4. Client: storage.batch_write(batch)", + "5. Storage: Check batch.requires_fsync() -> true (DataRaftState touched)", + "6. Storage: Create WriteOptions with sync=true", + "7. Storage: db.write_opt(batch.inner, write_opts)", + "8. RocksDB: Apply all operations atomically, fsync WAL", + "9. Storage: Update metrics", + "10. Storage: Return Ok(())" + ], + "snapshot_creation": [ + "1. Client: storage.create_snapshot(Path::new(\"/data/snapshots/snap-20250125\"))", + "2. Storage: Create Checkpoint object from db", + "3. Storage: checkpoint.create_checkpoint(path)?", + "4. RocksDB: Create hard links to all SST files atomically (~10ms)", + "5. Storage: Read db.latest_sequence_number() for metadata", + "6. Storage: Stat checkpoint directory for size_bytes", + "7. Storage: Create SnapshotMetadata { last_included_index, last_included_term, created_at, size_bytes }", + "8. Storage: Update metrics.last_snapshot_duration_ms", + "9. Storage: Return Ok(metadata)" + ], + "raft_integration_flow": [ + "1. Raft crate: RaftStorage owns Storage instance", + "2. raft-rs calls: storage_trait.append(entries)", + "3. RaftStorage: Serialize entries to VersionedLogEntry", + "4. RaftStorage: Call storage.append_log_entry(ColumnFamily::DataRaftLog, index, bytes)", + "5. Storage: Validate no gaps in log indices", + "6. Storage: db.put_cf(data_raft_log, b\"log:{index}\", bytes)", + "7. Storage: Return Ok(())", + "8. 
RaftStorage: Return success to raft-rs" + ] + }, + "module_organization": { + "source_files": { + "lib.rs": "Public API surface, Storage struct impl, re-exports (Storage, ColumnFamily, WriteBatch, etc.)", + "column_family.rs": "ColumnFamily enum, as_str(), requires_fsync(), default_options(), all() methods", + "batch.rs": "WriteBatch builder with put/delete/clear/is_empty/requires_fsync", + "iterator.rs": "StorageIterator wrapper with next/prev/seek methods, IteratorMode enum", + "error.rs": "StorageError enum with thiserror derives, Result type alias", + "metrics.rs": "StorageMetrics struct with new() and update_from_db() methods", + "snapshot.rs": "Helper functions for checkpoint creation/restoration/validation", + "options.rs": "StorageOptions and CFOptions configuration structs with builders" + }, + "public_api": [ + "Storage - Main struct with all public methods", + "ColumnFamily - Enum of 6 CFs", + "WriteBatch - Builder for atomic writes", + "StorageIterator - Iterator wrapper", + "IteratorMode - Iterator positioning enum", + "StorageOptions, CFOptions - Configuration", + "StorageMetrics - Metrics snapshot", + "StorageError - Error type", + "Result - Type alias for Result" + ], + "internal_helpers": [ + "CF handle caching in HashMap (private field)", + "RocksDB initialization logic in Storage::new()", + "WriteOptions creation based on fsync requirements", + "Metrics update helper methods", + "Key formatting helpers (e.g., format_log_key(index) -> String)" + ], + "test_structure": { + "unit_tests": "Each module has #[cfg(test)] mod tests { ... 
} for internal functions", + "integration_tests": "tests/integration_tests.rs - Full Storage workflows with temporary RocksDB", + "test_helpers": "tests/common/mod.rs - Shared test utilities (temp dir, sample data generation)", + "property_tests": "tests/property_tests.rs - proptest for serialization roundtrip, batch atomicity" + } + }, + "integration_design": { + "raft_crate_interface": { + "wrapper_struct": "RaftStorage { storage: Storage, shard_id: u64 }", + "trait_implementation": "impl raft::Storage for RaftStorage", + "method_mapping": { + "raft::Storage::append": "storage.append_log_entry() with serialization", + "raft::Storage::entries": "storage.get_log_range() with deserialization", + "raft::Storage::snapshot": "storage.create_snapshot() + metadata", + "raft::Storage::hard_state": "storage.get() on DataRaftState CF" + }, + "serialization_responsibility": "RaftStorage handles Entry <-> VersionedLogEntry conversion; Storage only sees bytes", + "cf_selection": "RaftStorage selects appropriate CF (SystemRaftLog vs DataRaftLog) based on context" + }, + "common_crate_dependencies": { + "data_structures": [ + "VersionedLogEntry - Defined in common, serialized by raft crate, stored as bytes by storage crate", + "RaftHardState - Defined in common, serialized by raft crate", + "StoredValue - Defined in common, serialized by kv crate", + "ClusterMembership, ShardMap - Defined in common, serialized by seshat binary" + ], + "error_types": "If common defines base Error enum, storage::StorageError could wrap it; otherwise independent", + "constants": "CURRENT_VERSION, CF names as constants if needed for validation", + "type_aliases": "NodeId, Term, LogIndex used for documentation but storage sees u64" + }, + "data_structure_placement": { + "storage_crate": [ + "Storage, WriteBatch, StorageIterator - Storage abstraction types", + "ColumnFamily, IteratorMode - Storage-specific enums", + "StorageOptions, CFOptions - Configuration types", + "StorageMetrics - Metrics type", 
+ "StorageError - Storage-specific errors" + ], + "common_crate": [ + "VersionedLogEntry, RaftHardState - Raft consensus data", + "StoredValue - User KV data wrapper", + "ClusterMembership, ShardMap - Cluster metadata", + "NodeInfo, ShardInfo - Supporting types", + "SnapshotMetadata - Shared between storage and raft" + ], + "rationale": "Storage crate is pure persistence abstraction; common crate holds domain data structures shared across crates" + } + } + } +} diff --git a/docs/specs/rocksdb/design.md b/docs/specs/rocksdb/design.md new file mode 100644 index 0000000..f37b0c8 --- /dev/null +++ b/docs/specs/rocksdb/design.md @@ -0,0 +1,915 @@ +# RocksDB Storage Layer Technical Design + +## Architecture Pattern + +### Storage Abstraction Layer + +This is a **pure persistence layer** - NOT the standard Router → Service → Repository pattern used for web APIs. The storage crate provides a low-level abstraction over RocksDB with no business logic. + +``` +┌─────────────────────────────────────┐ +│ Raft Crate (OpenRaftMemStorage) │ +│ - Implements openraft storage traits │ +│ - RaftLogReader, RaftSnapshotBuilder │ +│ - RaftStorage (openraft version) │ +└─────────────────────────────────────┘ + │ + │ Uses Storage methods + ▼ +┌─────────────────────────────────────┐ +│ Storage Crate (Storage struct) │ +│ - Column family management │ +│ - Atomic batch writes │ +│ - Snapshot creation/restoration │ +│ - Thread-safe RocksDB access │ +└─────────────────────────────────────┘ + │ + │ Wraps RocksDB API + ▼ +┌─────────────────────────────────────┐ +│ RocksDB (Arc) │ +│ - 6 column families │ +│ - WAL with fsync control │ +│ - LSM tree compaction │ +└─────────────────────────────────────┘ +``` + +**Key Principle**: Storage layer stores bytes as directed - it has NO understanding of Raft semantics, business logic, or protocol parsing. 
+ +## Component Architecture + +### Core Components + +#### Storage (lib.rs) +**Main struct managing RocksDB instance and column families** + +```rust +pub struct Storage { + db: Arc<DB>, + cf_handles: HashMap<ColumnFamily, Arc<BoundColumnFamily>>, + metrics: Arc<RwLock<StorageMetrics>>, + config: StorageOptions, +} + +impl Storage { + // Initialization + pub fn new(options: StorageOptions) -> Result<Self>; + + // CRUD operations + pub fn get(&self, cf: ColumnFamily, key: &[u8]) -> Result<Option<Vec<u8>>>; + pub fn put(&self, cf: ColumnFamily, key: &[u8], value: &[u8]) -> Result<()>; + pub fn delete(&self, cf: ColumnFamily, key: &[u8]) -> Result<()>; + pub fn exists(&self, cf: ColumnFamily, key: &[u8]) -> Result<bool>; + + // Batch operations + pub fn batch_write(&self, batch: WriteBatch) -> Result<()>; + + // Raft log operations + pub fn append_log_entry(&self, cf: ColumnFamily, index: u64, entry: &[u8]) -> Result<()>; + pub fn get_log_range(&self, cf: ColumnFamily, start: u64, end: u64) -> Result<Vec<Vec<u8>>>; + pub fn truncate_log_before(&self, cf: ColumnFamily, index: u64) -> Result<()>; + pub fn get_last_log_index(&self, cf: ColumnFamily) -> Result<Option<u64>>; + + // Snapshot operations + pub fn create_snapshot(&self, path: &Path) -> Result<SnapshotMetadata>; + pub fn restore_snapshot(&self, path: &Path) -> Result<()>; + pub fn validate_snapshot(&self, path: &Path) -> Result<SnapshotMetadata>; + + // Utilities + pub fn iterator(&self, cf: ColumnFamily, mode: IteratorMode) -> Result<StorageIterator>; + pub fn metrics(&self) -> StorageMetrics; + pub fn sync(&self) -> Result<()>; + pub fn compact_range(&self, cf: ColumnFamily, start: Option<&[u8]>, end: Option<&[u8]>) -> Result<()>; + pub fn close(self) -> Result<()>; +} +``` + +**Thread Safety**: `Arc<DB>` enables safe multi-threaded access. RocksDB handles internal synchronization, so Storage can be safely cloned and shared across threads. + +**Lifecycle**: Created once at node startup, shared across Raft groups (system and data shards), closed at graceful shutdown. 
+ +#### ColumnFamily (column_family.rs) +**Type-safe enum for 6 column families** + +```rust +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum ColumnFamily { + SystemRaftLog, // System group Raft log entries + SystemRaftState, // System group hard state (requires fsync) + SystemData, // Cluster metadata (membership, shardmap) + DataRaftLog, // Data shard Raft log entries + DataRaftState, // Data shard hard state (requires fsync) + DataKv, // User key-value data +} + +impl ColumnFamily { + pub fn as_str(&self) -> &'static str; + pub fn all() -> [ColumnFamily; 6]; + pub fn requires_fsync(&self) -> bool; // True for *_raft_state CFs + pub fn default_options(&self) -> CFOptions; +} +``` + +**Design Rationale**: Enum prevents CF name typos at compile time and enables CF-specific behavior (fsync requirements, optimization profiles). + +#### WriteBatch (batch.rs) +**Builder pattern for atomic multi-CF write operations** + +```rust +pub struct WriteBatch { + inner: rocksdb::WriteBatch, + cfs: Vec, +} + +impl WriteBatch { + pub fn new() -> Self; + pub fn put(&mut self, cf: ColumnFamily, key: &[u8], value: &[u8]) -> &mut Self; + pub fn delete(&mut self, cf: ColumnFamily, key: &[u8]) -> &mut Self; + pub fn clear(&mut self); + pub fn is_empty(&self) -> bool; + pub fn requires_fsync(&self) -> bool; // True if any CF requires fsync +} +``` + +**Atomicity Guarantee**: All operations succeed or all fail - no partial writes visible to readers. 
+ +**Usage Example**: +```rust +let mut batch = WriteBatch::new(); +batch + .put(ColumnFamily::DataKv, b"key1", b"value1") + .put(ColumnFamily::DataKv, b"key2", b"value2") + .put(ColumnFamily::DataRaftState, b"state", serialized_state); + +storage.batch_write(batch)?; // Atomic commit with fsync +``` + +#### StorageIterator (iterator.rs) +**Iterator with snapshot isolation for range queries** + +```rust +pub struct StorageIterator<'a> { + inner: DBIterator<'a>, + cf: ColumnFamily, +} + +pub enum IteratorMode { + Start, // From first key in CF + End, // From last key in CF + From(Vec<u8>, Direction), // From specific key +} + +impl<'a> StorageIterator<'a> { + pub fn seek(&mut self, key: &[u8]); + pub fn seek_to_first(&mut self); + pub fn seek_to_last(&mut self); + pub fn next(&mut self) -> Option<Result<(Box<[u8]>, Box<[u8]>)>>; + pub fn prev(&mut self) -> Option<Result<(Box<[u8]>, Box<[u8]>)>>; + pub fn valid(&self) -> bool; +} +``` + +**Snapshot Isolation**: Iterator captures DB snapshot at creation time - sees consistent view even if writes happen during iteration. 
+ +#### StorageOptions (options.rs) +**Configuration for RocksDB initialization** + +```rust +pub struct StorageOptions { + pub data_dir: PathBuf, + pub create_if_missing: bool, // True for bootstrap, false for join + pub compression: CompressionType, // Lz4 for Phase 1 + pub write_buffer_size_mb: usize, // 64MB default per CF + pub max_write_buffer_number: usize, // 3 memtables default + pub target_file_size_mb: usize, // 64MB SST files + pub max_open_files: i32, // -1 for unlimited + pub enable_statistics: bool, // True for Phase 4 observability + pub cf_options: HashMap<ColumnFamily, CFOptions>, +} + +pub struct CFOptions { + pub compaction_style: DBCompactionStyle, + pub disable_auto_compactions: bool, + pub level0_file_num_compaction_trigger: i32, + pub write_buffer_size: Option<usize>, + pub prefix_extractor: Option<SliceTransform>, +} + +impl Default for StorageOptions { + fn default() -> Self { + Self { + data_dir: PathBuf::from("./data/rocksdb"), + create_if_missing: true, + compression: CompressionType::Lz4, + write_buffer_size_mb: 64, + max_write_buffer_number: 3, + target_file_size_mb: 64, + max_open_files: -1, // Unlimited + enable_statistics: false, + cf_options: Self::default_cf_options(), + } + } +} + +impl StorageOptions { + /// Create options with custom data directory, using defaults for all other settings. + pub fn with_data_dir(path: PathBuf) -> Self { + Self { + data_dir: path, + ..Default::default() + } + } + + /// Validate configuration values. 
+ /// + /// # Errors + /// - write_buffer_size_mb: Must be 1-1024 MB + /// - max_write_buffer_number: Must be 2-10 + /// - target_file_size_mb: Must be 1-1024 MB + pub fn validate(&self) -> Result<()> { + if self.write_buffer_size_mb < 1 || self.write_buffer_size_mb > 1024 { + return Err(StorageError::InvalidConfig { + field: "write_buffer_size_mb".to_string(), + reason: format!("Must be 1-1024, got {}", self.write_buffer_size_mb), + }); + } + + if self.max_write_buffer_number < 2 || self.max_write_buffer_number > 10 { + return Err(StorageError::InvalidConfig { + field: "max_write_buffer_number".to_string(), + reason: format!("Must be 2-10, got {}", self.max_write_buffer_number), + }); + } + + if self.target_file_size_mb < 1 || self.target_file_size_mb > 1024 { + return Err(StorageError::InvalidConfig { + field: "target_file_size_mb".to_string(), + reason: format!("Must be 1-1024, got {}", self.target_file_size_mb), + }); + } + + Ok(()) + } + + /// Default column family options optimized per CF type. 
+ fn default_cf_options() -> HashMap<ColumnFamily, CFOptions> { + use ColumnFamily::*; + + let mut opts = HashMap::new(); + + // Raft log CFs: Sequential writes, aggressive compaction + let raft_log_opts = CFOptions { + compaction_style: DBCompactionStyle::Level, + disable_auto_compactions: false, + level0_file_num_compaction_trigger: 2, // Aggressive compaction + write_buffer_size: None, // Use global default + prefix_extractor: None, // Sequential access, no prefix needed + }; + opts.insert(SystemRaftLog, raft_log_opts.clone()); + opts.insert(DataRaftLog, raft_log_opts); + + // Raft state CFs: Tiny, rarely compacted + let raft_state_opts = CFOptions { + compaction_style: DBCompactionStyle::Level, + disable_auto_compactions: true, // Manual compaction only + level0_file_num_compaction_trigger: 10, // Never triggers + write_buffer_size: Some(4 * 1024 * 1024), // 4MB (tiny) + prefix_extractor: None, + }; + opts.insert(SystemRaftState, raft_state_opts.clone()); + opts.insert(DataRaftState, raft_state_opts); + + // System data CF: Small, infrequent updates + opts.insert(SystemData, CFOptions { + compaction_style: DBCompactionStyle::Level, + disable_auto_compactions: true, + level0_file_num_compaction_trigger: 10, + write_buffer_size: Some(8 * 1024 * 1024), // 8MB + prefix_extractor: None, + }); + + // Data KV CF: Random access, bloom filters, high throughput + opts.insert(DataKv, CFOptions { + compaction_style: DBCompactionStyle::Level, + disable_auto_compactions: false, + level0_file_num_compaction_trigger: 4, // Moderate compaction + write_buffer_size: None, // Use global default (64MB) + // 4-byte prefix extractor for bloom filter optimization + prefix_extractor: Some(SliceTransform::create_fixed_prefix(4)), + }); + + opts + } +} +``` + +#### StorageMetrics (metrics.rs) +**Runtime metrics for monitoring** + +```rust +#[derive(Debug, Clone)] +pub struct StorageMetrics { + pub db_size_bytes: u64, + pub num_keys: HashMap<ColumnFamily, u64>, + pub last_snapshot_duration_ms: u64, + pub write_ops_total: u64, + pub 
read_ops_total: u64, + pub bytes_written: u64, + pub bytes_read: u64, +} + +impl StorageMetrics { + pub fn new() -> Self; + pub fn update_from_db(&mut self, db: &DB); +} +``` + +**Update Frequency**: On-demand via `storage.metrics()` call. No background thread needed in Phase 1. + +#### StorageError (error.rs) +**Rich error context using thiserror** + +```rust +#[derive(Debug, thiserror::Error)] +pub enum StorageError { + #[error("RocksDB error: {0}")] + RocksDb(#[from] rocksdb::Error), + + #[error("IO error: {0}")] + Io(#[from] std::io::Error), + + #[error("Protobuf decode error: {0}")] + ProtobufDecode(#[from] prost::DecodeError), + + #[error("Column family not found: {0}")] + ColumnFamilyNotFound(String), + + #[error("Invalid log index: expected {expected}, got {got}")] + InvalidLogIndex { expected: u64, got: u64 }, + + #[error("Snapshot failed at {path:?}: {reason}")] + SnapshotFailed { path: PathBuf, reason: String }, + + #[error("Corrupted data in CF {cf}, key {key:?}: {reason}")] + CorruptedData { cf: String, key: Vec<u8>, reason: String }, + + #[error("Version mismatch: expected {expected}, got {got}")] + VersionMismatch { expected: u8, got: u8 }, + + #[error("Invalid configuration for {field}: {reason}")] + InvalidConfig { field: String, reason: String }, +} + +pub type Result<T> = std::result::Result<T, StorageError>; +``` + +**Propagation Strategy**: All public methods return `Result`. Caller (raft crate) decides retry/fail/panic strategy. + +## Implementation Design + +### RocksDB Configuration + +#### Column Family Setup + +**6 Column Families with optimized settings**: + +1. **system_raft_log** - System group Raft log + - Optimize for: Sequential writes, range scans + - Compaction: Level style, aggressive (keep log size small) + - Key format: `"log:{:020}"` (e.g., `"log:00000000000000000142"`) - zero-padded for correct lexicographic ordering + - Value: Protobuf serialized LogEntry (via prost) + +2. 
**system_raft_state** - System group hard state + - Optimize for: Single-key updates with fsync + - Compaction: Disabled (always <1KB) + - Key format: `"state"` (single entry) + - Value: Protobuf serialized RaftHardState (via prost) + +3. **system_data** - Cluster metadata + - Optimize for: Small number of keys (<10), infrequent updates + - Compaction: Disabled (bounded size ~100KB) + - Keys: `"membership"`, `"shardmap"` + - Values: Protobuf serialized ClusterMembership or ShardMap (via prost) + +4. **data_raft_log** - Data shard Raft log + - Optimize for: High-throughput sequential writes, range scans + - Compaction: Level style, moderate + - Key format: `"log:{:020}"` (zero-padded for correct lexicographic ordering) + - Value: Protobuf serialized LogEntry (via prost) + +5. **data_raft_state** - Data shard hard state + - Optimize for: Single-key updates with fsync + - Compaction: Disabled (always <1KB) + - Key format: `"state"` + - Value: Protobuf serialized RaftHardState (via prost) + +6. 
**data_kv** - User key-value data + - Optimize for: Random reads/writes, high key count + - Bloom filters: Enabled (10 bits per key) + - Prefix extractor: 4-byte hash prefix + - Key format: Raw user bytes (no encoding) + - Value: Protobuf serialized StoredValue (via prost) + +#### Write-Ahead Log (WAL) + +- **Enabled**: True for all writes +- **Sync Mode**: + - Synchronous fsync for `*_raft_state` CFs (durability guarantee) + - Async for other CFs (rely on WAL + background fsync) +- **WAL Directory**: Same as `data_dir` in Phase 1 (simplify config) +- **WAL Size Limit**: 64MB (triggers rotation) + +#### Compaction Strategy + +- **Style**: Level compaction (better space efficiency than universal) +- **L0 Trigger**: 4 SST files (triggers L0→L1 compaction) +- **Target File Size**: 64MB per SST file +- **Max Bytes L1**: 256MB (L1 size before L1→L2 compaction) +- **Compression**: + - None for L0/L1 (hot data, optimize CPU) + - Lz4 for L2+ (balance CPU vs disk space) + +#### Memory Configuration + +**Default Settings** (from `StorageOptions::default()`): +- **Write Buffer Size**: 64MB per CF +- **Max Write Buffers**: 3 memtables per CF +- **Target SST File Size**: 64MB +- **Max Open Files**: -1 (unlimited) +- **Compression**: Lz4 (L2+), None (L0/L1) + +**Memory Usage Breakdown**: +- **Write Buffer**: 64MB per CF × 3 memtables = 192MB per CF +- **Total Memtables**: 192MB × 6 CFs = ~1.1GB max +- **Block Cache**: 256MB shared across all CFs +- **Total Memory**: ~1.5GB in steady state + +**Per-CF Overrides**: +- **Raft State CFs**: 4MB write buffer (small, single-key updates) +- **System Data CF**: 8MB write buffer (bounded size ~100KB) +- **Log/Data CFs**: Use global 64MB default + +#### Bloom Filters + +- **Enabled**: Only for `data_kv` CF +- **Bits per Key**: 10 (1% false positive rate) +- **Purpose**: Reduce disk reads for GET operations +- **Not Needed**: Raft log access is sequential (range scans) + +### Durability Strategy + +**Fsync Policy**: +- `*_raft_state` CFs: 
**Synchronous fsync** before returning from `put()` - critical for Raft safety +- Other CFs: **Async WAL writes** - rely on WAL for durability, fsync in background +- Batch writes: If batch touches any `raft_state` CF, **fsync entire batch atomically** + +**Crash Recovery**: RocksDB automatically replays WAL on startup. Storage layer just validates schema versions after opening DB. + +**Checkpoint Atomicity**: RocksDB checkpoint uses hard links - either full checkpoint exists or none (no partial states). + +### Serialization Strategy + +**Format**: Protobuf (prost) for all storage serialization +- **Rationale**: Single format for storage + network, schema evolution, industry alignment +- **Schema Definition**: .proto files in `protocol/` directory +- **Code Generation**: prost-build generates Rust types at compile time +- **No bincode**: Protobuf used exclusively throughout the system (see `openraft/SERIALIZATION_DECISION.md`) + +**Version Handling**: Protobuf provides built-in schema evolution +```proto +message LogEntry { + uint64 version = 1; // CURRENT_VERSION = 1 in Phase 1 + uint64 index = 2; + uint64 term = 3; + bytes data = 4; +} +``` + +**Migration Path**: +- Protobuf handles forward/backward compatibility through optional fields and default values +- If incompatible schema change: Increment version field, implement migration logic +- If `version > CURRENT_VERSION`: **Refuse to start** (cannot read future formats) + +## Data Flows + +### GET Operation Flow + +``` +1. Client calls: storage.get(ColumnFamily::DataKv, b"foo") +2. Storage looks up CF handle from cf_handles HashMap +3. Storage calls: db.get_cf(cf_handle, b"foo") -> Result>> +4. RocksDB searches: Memtable → Block cache → Bloom filter → SST files +5. Storage updates: metrics.read_ops_total += 1 +6. Storage returns: Ok(Some(value)) or Ok(None) +``` + +**Performance**: O(log n) with bloom filter optimization. Target <1ms p99. + +### PUT Operation Flow + +``` +1. 
Client calls: storage.put(ColumnFamily::DataRaftState, b"state", bytes) +2. Storage checks: cf.requires_fsync() -> true for raft_state CFs +3. Storage creates WriteOptions with sync=true +4. Storage calls: db.put_cf_opt(cf_handle, b"state", bytes, write_opts) +5. RocksDB appends to WAL and fsyncs to disk (blocks ~1-5ms on SSD) +6. RocksDB inserts into memtable (in-memory) +7. Storage updates: metrics.write_ops_total += 1 +8. Storage returns: Ok(()) +``` + +**Performance**: O(1) amortized. May trigger background compaction (doesn't block). + +### Batch Write Operation Flow + +``` +1. Client creates: let mut batch = WriteBatch::new() +2. Client adds ops: batch.put(ColumnFamily::DataKv, b"key1", b"val1") +3. Client adds ops: batch.put(ColumnFamily::DataRaftState, b"state", bytes) +4. Client commits: storage.batch_write(batch) +5. Storage checks: batch.requires_fsync() -> true (DataRaftState touched) +6. Storage creates WriteOptions with sync=true +7. Storage calls: db.write_opt(batch.inner, write_opts) +8. RocksDB applies all operations atomically, fsyncs WAL +9. Storage updates metrics +10. Storage returns: Ok(()) +``` + +**Atomicity**: All operations succeed or all fail. No partial writes visible to readers. + +**Performance**: Amortizes fsync cost across multiple operations. Batch 10-100 operations for best throughput. + +### Snapshot Creation Flow + +``` +1. Client calls: storage.create_snapshot(Path::new("/data/snapshots/snap-20250125")) +2. Storage creates: Checkpoint::new(&db)? +3. Storage calls: checkpoint.create_checkpoint(path)? +4. RocksDB creates hard links to all SST files atomically (~10ms, no data copy) +5. Storage reads: db.latest_sequence_number() for metadata +6. Storage stats: checkpoint directory for size_bytes +7. Storage creates: SnapshotMetadata { last_included_index, last_included_term, created_at, size_bytes } +8. Storage updates: metrics.last_snapshot_duration_ms +9. 
Storage returns: Ok(metadata) +``` + +**Efficiency**: O(1) time - uses hard links (no data copying). Zero additional disk space initially. Space diverges as original DB changes. + +### OpenRaft Integration Flow + +**IMPORTANT**: After OpenRaft migration, this integration uses openraft storage traits, not raft-rs. + +``` +Read Path (via RaftLogReader trait): +1. openraft calls: RaftLogReader::try_get_log_entries(range) +2. OpenRaftMemStorage (in raft crate) selects CF based on shard_id +3. OpenRaftMemStorage calls: storage.get_log_range(cf, range.start, range.end) +4. Storage reads from RocksDB: db.get_cf(cf, b"log:{:020}") +5. Storage returns: Vec> (serialized log entries) +6. OpenRaftMemStorage deserializes: Vec -> LogEntry +7. OpenRaftMemStorage returns: Vec> to openraft + +Write Path (via RaftStorage trait): +1. openraft calls: RaftStorage::append(entries) +2. OpenRaftMemStorage serializes: LogEntry -> Vec (protobuf via prost) +3. OpenRaftMemStorage selects CF based on shard_id +4. OpenRaftMemStorage calls: storage.append_log_entry(cf, index, &bytes) +5. Storage validates: No gaps in log indices (fails with InvalidLogIndex if gap detected) +6. Storage calls: db.put_cf(data_raft_log, b"log:{:020}", bytes) +7. Storage returns: Ok(()) +8. 
OpenRaftMemStorage returns: Success to openraft +``` + +**Separation of Concerns**: +- **OpenRaftMemStorage (raft crate)**: Implements openraft storage traits, handles serialization, understands Raft semantics +- **Storage (storage crate)**: Pure persistence layer, stores bytes as directed, no Raft knowledge + +**Key Differences from raft-rs**: +- OpenRaft uses three separate traits (RaftLogReader, RaftSnapshotBuilder, RaftStorage) instead of single Storage trait +- All trait methods are async (requires tokio runtime) +- Different entry types: `LogEntry` instead of `eraftpb::Entry` +- Storage layer remains synchronous (RocksDB operations), async wrapper in raft crate + +## Module Organization + +### Source Files + +``` +storage/ +├── src/ +│ ├── lib.rs # Public API, Storage struct, re-exports +│ ├── column_family.rs # ColumnFamily enum with CF metadata +│ ├── batch.rs # WriteBatch builder for atomic operations +│ ├── iterator.rs # StorageIterator with snapshot isolation +│ ├── error.rs # StorageError with thiserror +│ ├── metrics.rs # StorageMetrics tracking +│ ├── snapshot.rs # Checkpoint creation/restoration helpers +│ └── options.rs # StorageOptions and CFOptions configuration +├── tests/ +│ ├── integration_tests.rs # Full Storage workflows with temp RocksDB +│ ├── property_tests.rs # proptest for serialization roundtrip, atomicity +│ └── common/ +│ └── mod.rs # Shared test utilities (temp dir, sample data) +└── Cargo.toml +``` + +### Public API Surface + +**Exported from `lib.rs`**: +```rust +pub use storage::Storage; +pub use column_family::ColumnFamily; +pub use batch::WriteBatch; +pub use iterator::{StorageIterator, IteratorMode}; +pub use options::{StorageOptions, CFOptions}; +pub use metrics::StorageMetrics; +pub use error::{StorageError, Result}; +``` + +### Internal Helpers (Private) + +- CF handle caching in `HashMap>` +- RocksDB initialization logic in `Storage::new()` +- `WriteOptions` creation based on fsync requirements +- Metrics update helper 
methods +- Key formatting: `format_log_key(index: u64) -> String` returns `"log:{:020}"` (zero-padded for correct lexicographic ordering) + +## Integration Design + +### Raft Crate Integration + +**IMPORTANT**: This section describes the integration AFTER OpenRaft migration is complete. + +**OpenRaft Storage Implementation**: +```rust +// In raft crate (crates/raft/src/storage.rs or crates/storage/src/openraft_storage.rs) +use openraft::storage::{RaftLogReader, RaftSnapshotBuilder, RaftStorage as OpenRaftStorageTrait}; +use seshat_storage::Storage; // RocksDB storage layer + +pub struct OpenRaftMemStorage { + storage: Arc, + shard_id: u64, // System (0) or data shard ID +} + +// Read operations +#[async_trait] +impl RaftLogReader for OpenRaftMemStorage { + async fn try_get_log_entries( + &mut self, + range: std::ops::Range + ) -> Result>> { + let cf = self.select_log_cf(); + let bytes_vec = self.storage.get_log_range(cf, range.start, range.end)?; + + // Deserialize each entry using protobuf + use prost::Message; + bytes_vec.into_iter() + .map(|bytes| LogEntry::decode(&bytes[..])) + .collect() + } + + async fn read_vote(&mut self) -> Result>> { + let cf = self.select_state_cf(); + match self.storage.get(cf, b"vote")? { + Some(bytes) => { + use prost::Message; + Ok(Some(Vote::decode(&bytes[..])?)) + }, + None => Ok(None), + } + } + + // ... 
other RaftLogReader methods +} + +// Write operations +#[async_trait] +impl OpenRaftStorageTrait for OpenRaftMemStorage { + async fn append( + &mut self, + entries: I + ) -> Result<()> + where + I: IntoIterator> + Send, + { + let cf = self.select_log_cf(); + + for entry in entries { + use prost::Message; + let bytes = entry.encode_to_vec(); + self.storage.append_log_entry(cf, entry.log_id.index, &bytes)?; + } + Ok(()) + } + + async fn save_vote(&mut self, vote: &Vote) -> Result<()> { + let cf = self.select_state_cf(); + use prost::Message; + let bytes = vote.encode_to_vec(); + self.storage.put(cf, b"vote", &bytes)?; + Ok(()) + } + + // ... other RaftStorage methods +} + +impl OpenRaftMemStorage { + fn select_log_cf(&self) -> ColumnFamily { + if self.shard_id == 0 { + ColumnFamily::SystemRaftLog + } else { + ColumnFamily::DataRaftLog + } + } + + fn select_state_cf(&self) -> ColumnFamily { + if self.shard_id == 0 { + ColumnFamily::SystemRaftState + } else { + ColumnFamily::DataRaftState + } + } +} +``` + +**Responsibilities**: +- **OpenRaftMemStorage**: Implements openraft storage traits, handles protobuf serialization/deserialization, CF selection +- **Storage**: Pure persistence (stores raw bytes), thread-safety, atomicity, RocksDB operations + +**Serialization Strategy**: +- **Format**: Protobuf (prost) for all storage serialization +- **Rationale**: Single format for storage + network, schema evolution, industry alignment (see `openraft/SERIALIZATION_DECISION.md`) +- **No bincode**: Protobuf used exclusively throughout the system + +### Common Crate Dependencies + +**Data Structures Defined in Protobuf Schema**: +```proto +// protocol/raft.proto +message LogEntry { + uint64 version = 1; + uint64 index = 2; + uint64 term = 3; + bytes data = 4; +} + +message RaftHardState { + uint64 version = 1; + uint64 term = 2; + uint64 vote = 3; + uint64 commit = 4; +} + +// protocol/storage.proto +message StoredValue { + uint64 version = 1; + bytes data = 2; + uint64 
created_at = 3; + optional uint64 ttl = 4; +} + +// protocol/cluster.proto +message ClusterMembership { + uint64 version = 1; + repeated NodeInfo nodes = 2; +} + +message ShardMap { + uint64 version = 1; + repeated ShardInfo shards = 2; +} + +// protocol/snapshot.proto +message SnapshotMetadata { + uint64 last_included_index = 1; + uint64 last_included_term = 2; + uint64 created_at = 3; + uint64 size_bytes = 4; +} +``` + +**Dependency Flow**: +- `storage` crate: Stores bytes, no knowledge of data structures +- `protocol` directory: Defines protobuf schemas shared across crates +- `raft` crate: Serializes protobuf messages using prost before passing to storage +- `common` crate: Re-exports generated protobuf types for convenience + +## Performance Considerations + +### Latency Optimization (<1ms p99 target) + +1. **Bloom Filters**: Reduce disk reads for `data_kv` GET operations +2. **Block Cache**: 256MB shared cache for frequently accessed blocks +3. **Write Buffers**: 64MB per CF batches writes before flushing to disk +4. **Async WAL**: Most writes don't block on fsync (only `raft_state` CFs) +5. **No Extra Serialization**: Direct `&[u8]` API avoids unnecessary copies +6. **Arc**: Zero-cost sharing across threads (no cloning DB instance) + +### Batch Write Strategy + +**Motivation**: Amortize fsync cost across multiple operations + +**Use Cases**: +- Raft log replication: Batch multiple log entries in one atomic write +- State machine application: Batch user KV writes + Raft state update +- Log compaction: Batch truncation of old log entries + +**Size Limits**: No hard limit in Phase 1. Consider 10MB batches in Phase 2+ to avoid memory spikes. 
+ +### Snapshot Efficiency + +- **Checkpoint Creation**: O(1) time using hard links - typically <10ms for 10GB dataset +- **Zero Space Initially**: Hard links share data until original DB diverges +- **Restoration**: O(n) time - must copy all files to `data_dir` (~1-2s per GB) +- **No Write Blocking**: Checkpoint creation doesn't block reads/writes +- **Manual Cleanup**: Raft crate responsible for deleting old checkpoints + +### Memory Considerations + +**Total Memory Budget**: ~1.5GB +- Write buffers: 1.1GB (64MB × 3 × 6 CFs) +- Block cache: 256MB +- Iterator snapshots: Minimal overhead (bounded by number of active iterators) + +**Backpressure**: RocksDB automatically stalls writes if memtables full (prevents OOM). + +### Disk I/O Patterns + +- **Raft Log**: Sequential writes, occasional sequential reads (for replication) +- **Raft State**: Random writes with fsync, infrequent reads (on restart) +- **Data KV**: Random reads and writes, high IOPS workload +- **Compaction**: Background sequential reads + writes, throttled to avoid impacting foreground +- **WAL**: Sequential writes, flushed per-write for state CFs + +## Phase 1 Scope + +### Included in Phase 1 + +- All 6 column families with optimized settings +- CRUD operations: `get`, `put`, `delete`, `exists` +- Atomic batch writes across CFs +- Raft log operations: `append_log_entry`, `get_log_range`, `truncate_log_before` +- Snapshot creation/restoration using RocksDB checkpoints +- Iterator support for range queries +- Basic metrics tracking +- Comprehensive error handling with `thiserror` +- Full test coverage (unit + integration + property tests) + +### Deferred to Phase 2+ + +- **Dynamic CF Creation**: Phase 2 adds per-shard CFs (`ShardRaftLog(shard_id)`) + - May require DB restart to add new CFs in Phase 2 + - Consider using single `data_raft_log` CF with prefix keys in Phase 2 to avoid restarts + +- **Advanced Compaction Control**: Manual compaction policies, TTL-based compaction + - Phase 1 uses RocksDB 
defaults (sufficient for single-shard workload) + +- **Per-Shard Metrics**: Phase 2 tracks metrics per shard, not just per CF + - Requires key prefix parsing to attribute metrics to shards + +- **Background Metrics Collection**: Phase 1 updates metrics on-demand via `metrics()` call + - Phase 4 adds background thread to push metrics to Prometheus + +- **Separate WAL Directory**: Phase 1 uses same dir for WAL and SST files + - Phase 4 may separate WAL to different disk for performance + +- **Online Schema Migration**: Phase 1 validates versions, refuses to start if mismatch + - Phase 3 adds online migration to handle version upgrades gracefully + +### Testing Strategy + +**Unit Tests** (in each module): +```rust +#[cfg(test)] +mod tests { + // Test ColumnFamily enum methods + // Test WriteBatch builder pattern + // Test error conversions + // Test metrics updates +} +``` + +**Integration Tests** (`tests/integration_tests.rs`): +```rust +// Test full CRUD workflows with temporary RocksDB +// Test batch atomicity (partial failure scenarios) +// Test snapshot creation/restoration roundtrip +// Test iterator snapshot isolation +// Test concurrent reads/writes from multiple threads +// Test crash recovery (kill process, reopen DB) +``` + +**Property Tests** (`tests/property_tests.rs`): +```rust +// proptest: Serialization roundtrip for all versioned structs +// proptest: Batch operations preserve atomicity +// proptest: Iterator sees consistent view during concurrent writes +``` + +--- + +**Cross-references**: +- Architecture: `/Users/martinrichards/code/seshat/docs/architecture/crates.md` - How storage fits into 8-crate structure +- Data Structures: `/Users/martinrichards/code/seshat/docs/architecture/data-structures.md` - `VersionedLogEntry`, `RaftHardState` definitions +- Tech Stack: `/Users/martinrichards/code/seshat/docs/standards/tech.md` - RocksDB choice rationale +- TDD Workflow: `/Users/martinrichards/code/seshat/docs/standards/practices.md` - Test → Code → 
Refactor pattern diff --git a/docs/specs/rocksdb/requirements.json b/docs/specs/rocksdb/requirements.json new file mode 100644 index 0000000..5995b27 --- /dev/null +++ b/docs/specs/rocksdb/requirements.json @@ -0,0 +1,39 @@ +{ + "raw_user_story": "As a Seshat node operator, I want persistent storage using RocksDB so that the cluster can recover state after restarts and maintain consistency across nodes.", + "raw_criteria": [ + "Store implements 6 column families (keys, raft_log, raft_state, snapshots, metadata, locks)", + "Key operations (get, set, delete, exists) complete with <1ms p99 latency", + "Raft log operations (append, range query, truncate) preserve ordering", + "Snapshot creation and restoration work correctly", + "All data survives node restarts (persistence verification)", + "Column family isolation prevents cross-contamination" + ], + "raw_rules": [ + "Keys CF: TTL support (Phase 2+), no size limit enforced yet", + "Raft log CF: Sequential log indices, no gaps allowed", + "Raft state CF: Single-key storage (current term, voted for)", + "Snapshots CF: Store snapshot metadata + data blob", + "Metadata CF: Cluster config, node ID", + "Locks CF: Distributed locking (Phase 2+)", + "Write batches must be atomic across column families", + "Read-heavy workload optimization (caching, bloom filters)" + ], + "raw_scope": { + "included": [ + "RocksDB initialization with 6 column families", + "CRUD operations per column family", + "Atomic batch writes", + "Raft log operations (append, get range, truncate)", + "Snapshot storage and retrieval", + "Configuration management", + "Error handling and retries" + ], + "excluded": [ + "TTL expiration (Phase 2)", + "Distributed locking (Phase 2)", + "Metrics/observability (Phase 4)", + "Multi-shard support (Phase 2)", + "Online schema migration (Phase 3)" + ] + } +} diff --git a/docs/specs/rocksdb/spec-lite.md b/docs/specs/rocksdb/spec-lite.md new file mode 100644 index 0000000..20ce894 --- /dev/null +++ 
b/docs/specs/rocksdb/spec-lite.md @@ -0,0 +1,43 @@ +# RocksDB Storage Layer - Lite Specification + +## Feature Overview + +RocksDB-based storage layer providing persistent, durable, and performant storage for Seshat's distributed key-value store. Supports Raft consensus requirements with 6 strategically designed column families. + +## Key Acceptance Criteria + +1. Initialize 6 column families with optimized RocksDB configuration +2. Achieve <1ms p99 latency for local storage operations +3. Ensure atomic, gap-free Raft log entries across column families +4. Create RocksDB checkpoints with metadata preservation +5. Enable node restart with full data recovery and consistency + +## Critical Technical Details + +### Column Families +- `system_raft_log`: System Raft log entries (compacted, ~10MB) +- `system_raft_state`: Persistent Raft hard state +- `system_data`: Cluster metadata +- `data_raft_log`: Data shard log entries +- `data_raft_state`: Data shard hard state +- `data_kv`: User key-value data + +### Performance Targets +- Single key ops: <1ms p99 latency +- Throughput: >5,000 ops/sec +- Snapshot creation: <10s (100MB) + +## Dependencies +- Dependencies: rocksdb, bincode, serde, thiserror +- Integrated via: raft crate, kv crate +- Storage trait implementation for raft-rs + +## Implementation Notes +- No Raft semantic understanding +- Version fields for future schema evolution +- Atomic batch writes +- Hard link-based snapshots +- Thread-safe concurrent operations + +## Alignment +Phase 1 MVP storage foundation - enables cluster recovery, Raft consensus durability, and key-value data persistence. 
\ No newline at end of file diff --git a/docs/specs/rocksdb/spec.json b/docs/specs/rocksdb/spec.json new file mode 100644 index 0000000..882c772 --- /dev/null +++ b/docs/specs/rocksdb/spec.json @@ -0,0 +1,216 @@ +{ + "feature": "rocksdb-storage", + "user_story": "As a Seshat node operator, I want persistent storage using RocksDB so that the cluster can recover state after restarts and maintain consistency across nodes", + "acceptance_criteria": [ + "GIVEN a fresh node startup WHEN RocksDB initializes THEN all 6 column families (system_raft_log, system_raft_state, system_data, data_raft_log, data_raft_state, data_kv) are created with correct configuration", + "GIVEN a key-value operation WHEN storage.get/put/delete is called THEN operation completes with <1ms p99 latency for local storage access", + "GIVEN Raft log entries WHEN append/get_range/truncate operations execute THEN sequential ordering is preserved and no gaps exist in log indices", + "GIVEN an atomic batch write across multiple column families WHEN batch.commit is called THEN either all writes succeed or all fail (no partial commits)", + "GIVEN a snapshot trigger condition (10,000 entries OR 100MB) WHEN snapshot is created THEN RocksDB checkpoint succeeds and metadata records last_included_index", + "GIVEN a node restart WHEN RocksDB reopens existing database THEN all persisted data (keys, raft state, metadata) is accessible and version checks pass", + "GIVEN operations on different column families WHEN concurrent reads/writes occur THEN data isolation is maintained (no cross-contamination between CFs)" + ], + "business_rules": [ + "System Raft log CF: Store system group Raft entries, compact after snapshot (~10MB typical size)", + "System Raft state CF: Single-key storage for hard state (term, vote, commit), MUST fsync before responding to RPCs", + "System data CF: Store cluster metadata (ClusterMembership, ShardMap), bounded ~100KB size", + "Data Raft log CF: Store data shard log entries, snapshot 
every 10,000 entries or 100MB, ~100MB typical compacted size", + "Data Raft state CF: Single-key storage for data shard hard state, MUST fsync before responding to RPCs", + "Data KV CF: Store user key-value data wrapped in StoredValue, unbounded size, optimize for point lookups", + "All write batches across column families MUST be atomic", + "All persisted structures MUST include version field for schema evolution", + "Key size limit: 256 bytes maximum (enforced by validation layer above storage)", + "Value size limit: 65,536 bytes maximum (enforced by validation layer above storage)", + "Raft log memory limit: 512MB per Raft group before forced compaction", + "Storage layer MUST NOT understand Raft semantics - only stores bytes as directed" + ], + "scope": { + "included": [ + "RocksDB initialization with 6 column families and optimized configuration (Lz4 compression, 64MB buffers, prefix bloom filters)", + "CRUD operations per column family (get, put, delete, exists)", + "Atomic batch write operations across multiple column families", + "Raft log operations: append entry, get range of entries, truncate before index", + "Snapshot creation using RocksDB checkpoint (hard links, atomic)", + "Snapshot restoration from checkpoint directory", + "Configuration management: load/store NodeConfig, ClusterConfig, RaftConfig", + "Error handling with rich context propagation (thiserror)", + "Iterator support for range queries within column families", + "Storage metrics tracking (db_size_bytes, num_keys, snapshot_duration)" + ], + "excluded": [ + "TTL expiration logic (Phase 2 - handled by higher layer)", + "Distributed locking implementation (Phase 2 - separate feature)", + "Metrics/observability export (Phase 4 - OpenTelemetry integration)", + "Multi-shard column family management (Phase 2 - dynamic shard creation)", + "Online schema migration tools (Phase 3 - separate migration system)", + "RocksDB tuning dashboard (Phase 4 - operational tooling)", + "Automatic compaction 
scheduling (use RocksDB defaults for Phase 1)" + ] + }, + "technical_details": { + "column_families": [ + { + "name": "system_raft_log", + "purpose": "System group Raft log entries", + "key_format": "log:{index}", + "value_type": "VersionedLogEntry (bincode)", + "size_estimate": "~10MB compacted", + "compaction": "Truncate after snapshot" + }, + { + "name": "system_raft_state", + "purpose": "System group hard state", + "key_format": "state (single key)", + "value_type": "RaftHardState (bincode)", + "size_estimate": "<1KB", + "durability": "fsync required before RPC responses" + }, + { + "name": "system_data", + "purpose": "Cluster metadata", + "key_format": "membership, shardmap", + "value_type": "ClusterMembership, ShardMap (bincode)", + "size_estimate": "~100KB bounded", + "compaction": "Automatic by RocksDB" + }, + { + "name": "data_raft_log", + "purpose": "Data shard Raft log entries", + "key_format": "log:{index}", + "value_type": "VersionedLogEntry (bincode)", + "size_estimate": "~100MB compacted", + "compaction": "Snapshot every 10,000 entries or 100MB" + }, + { + "name": "data_raft_state", + "purpose": "Data shard hard state", + "key_format": "state (single key)", + "value_type": "RaftHardState (bincode)", + "size_estimate": "<1KB", + "durability": "fsync required before RPC responses" + }, + { + "name": "data_kv", + "purpose": "User key-value data", + "key_format": "raw user key (arbitrary bytes)", + "value_type": "StoredValue (bincode)", + "size_estimate": "unbounded (user data)", + "optimization": "Prefix bloom filters for point lookups" + } + ], + "data_structures": [ + "VersionedLogEntry - Raft log entry with schema version, term, index, entry_type, data", + "RaftHardState - Persistent Raft state: version, term, vote, commit", + "ClusterMembership - Node registry with addresses and states", + "ShardMap - Shard assignments and replica placement", + "StoredValue - User data wrapper: version, data, created_at, expires_at (optional)", + "SnapshotMetadata - 
Snapshot tracking: last_included_index, last_included_term, membership, created_at, size_bytes" + ], + "rocksdb_configuration": { + "compression": "Lz4 (fast compression for all CFs)", + "write_buffer_size": "64MB per CF", + "max_write_buffer_number": 3, + "target_file_size_base": "64MB SST files", + "raft_log_compaction_style": "Level (sequential writes)", + "data_kv_optimization": "Fixed prefix (4 bytes) for hash-based routing, memtable prefix bloom 0.2 ratio" + }, + "snapshot_strategy": { + "trigger": "Every 10,000 log entries OR 100MB log size", + "method": "RocksDB checkpoint using hard links (atomic, space-efficient)", + "process": [ + "Create checkpoint at snapshots/snapshot-{timestamp}", + "Record SnapshotMetadata with last_included_index and last_included_term", + "Truncate Raft log entries before last_included_index", + "Update Raft state with new snapshot reference" + ], + "restoration": [ + "Copy checkpoint directory to node data_dir", + "Open RocksDB instance", + "Read last_included_index from snapshot metadata", + "Replay log entries after snapshot if any exist" + ] + }, + "performance_requirements": { + "local_storage_ops_p99": "<1ms (get, put, delete on single key)", + "batch_commit_p99": "<5ms (atomic writes across CFs)", + "snapshot_creation": "<10s for 100MB data", + "throughput_target": ">5,000 ops/sec per node (includes Raft overhead above storage)", + "concurrent_operations": "Thread-safe for concurrent reads and writes" + }, + "durability_requirements": "Raft state CFs (system_raft_state, data_raft_state) MUST fsync before responding to RequestVote or AppendEntries RPCs. Other CFs use RocksDB default durability (WAL with periodic sync).", + "error_handling": "Use thiserror for library errors. Propagate RocksDB errors with context. Fail fast on version mismatches. Return Result for all operations." 
+ }, + "dependencies": { + "depends_on": [ + "common crate - shared types (Error, Result, configuration structs)", + "rocksdb crate - underlying storage engine (v0.22+)", + "prost crate - protobuf serialization for all storage operations", + "serde crate - serialization trait implementations", + "thiserror crate - error type definitions" + ], + "used_by": [ + "raft crate - implements raft-rs Storage trait using this storage layer", + "kv crate - indirectly via raft crate for persisting key-value operations", + "seshat binary - orchestrates initialization and lifecycle" + ], + "integration_points": [ + "raft-rs Storage trait - storage layer must provide: append entries, get entries, snapshot, apply snapshot, hard state persistence", + "common::types - all data structures defined in data-structures.md", + "config loading - NodeConfig specifies data_dir path for RocksDB" + ] + }, + "aligns_with": "Phase 1 MVP - Persistent storage foundation for single-shard cluster. Enables cluster recovery after restarts, provides durability for Raft consensus, and stores user key-value data. 
Critical blocker for 3-node cluster stability testing.", + "testing_strategy": { + "unit_tests": [ + "Column family initialization and configuration verification", + "CRUD operations per column family with various data sizes", + "Atomic batch write success and rollback scenarios", + "Iterator range queries within column families", + "Version checking and deserialization error handling", + "Error propagation and context preservation", + "Concurrent read/write operations (thread safety)", + "Storage metrics accuracy (db_size, key count)" + ], + "integration_tests": [ + "Snapshot creation and restoration full workflow", + "Node restart with existing database (persistence verification)", + "Raft log truncation after snapshot (gap detection)", + "Cross-column-family atomic batch writes", + "Configuration load/store roundtrip", + "Data isolation between column families (no contamination)", + "Large dataset operations (simulate 100MB log, 10K keys)" + ], + "property_tests": [ + "Serialization roundtrip for all data structures (proptest)", + "Batch write atomicity under random operation sequences", + "Iterator correctness for arbitrary key ranges", + "Snapshot consistency under concurrent writes" + ], + "performance_tests": [ + "Single key operation latency (p50, p99, p999) must be <1ms p99", + "Batch write throughput with varying batch sizes", + "Snapshot creation time for 100MB dataset (<10s)", + "Concurrent operation throughput (simulate 5,000 ops/sec load)", + "Memory usage under sustained write load (must stay under 512MB per Raft log)" + ], + "chaos_tests_support": [ + "Test 5: Storage media failure - verify graceful error handling and restart recovery", + "Test 11: Storage saturation - enforce resource limits and reject operations cleanly" + ], + "tdd_workflow": "RED → GREEN → REFACTOR pattern. Write failing test first, implement minimal code to pass, refactor for clarity. All acceptance criteria must have corresponding tests before implementation." 
+ }, + "validation_checks": { + "conflicts_with_existing": "None - RESP protocol is complete and independent. Raft implementation expects this storage layer via Storage trait.", + "missing_requirements": "None identified - all architectural requirements from data-structures.md and tech.md are covered.", + "testability": "All acceptance criteria are testable with concrete pass/fail conditions (latency thresholds, data persistence verification, atomicity checks).", + "phase_1_alignment": "Fully aligned - RocksDB storage is listed as P2-HIGH priority in roadmap with 12-15h estimated effort. Blocks cluster stability testing and chaos tests." + }, + "notes": [ + "This storage layer is deliberately simple - no understanding of Raft semantics, just byte storage with column family organization", + "Phase 1 uses fixed 6 column families. Phase 2 will add per-shard CFs dynamically", + "TTL handling is deferred to Phase 2 but StoredValue structure includes expires_at field for future use", + "Observability metrics are tracked but not exported until Phase 4 OpenTelemetry integration", + "All data structures include version fields from day 1 to enable future schema evolution without breaking changes", + "RocksDB checkpoint snapshots use hard links - fast and space-efficient, no data copying required" + ] +} diff --git a/docs/specs/rocksdb/spec.md b/docs/specs/rocksdb/spec.md new file mode 100644 index 0000000..5ce9381 --- /dev/null +++ b/docs/specs/rocksdb/spec.md @@ -0,0 +1,147 @@ +# RocksDB Storage Layer Specification + +## User Story + +As a Seshat node operator, I want persistent storage using RocksDB so that the cluster can recover state after restarts and maintain consistency across nodes + +## Acceptance Criteria + +1. GIVEN a fresh node startup WHEN RocksDB initializes THEN all 6 column families (system_raft_log, system_raft_state, system_data, data_raft_log, data_raft_state, data_kv) are created with correct configuration + +2. 
GIVEN a key-value operation WHEN storage.get/put/delete is called THEN operation completes with <1ms p99 latency for local storage access + +3. GIVEN Raft log entries WHEN append/get_range/truncate operations execute THEN sequential ordering is preserved and no gaps exist in log indices + +4. GIVEN an atomic batch write across multiple column families WHEN batch.commit is called THEN either all writes succeed or all fail (no partial commits) + +5. GIVEN a snapshot trigger condition (10,000 entries OR 100MB) WHEN snapshot is created THEN RocksDB checkpoint succeeds and metadata records last_included_index + +6. GIVEN a node restart WHEN RocksDB reopens existing database THEN all persisted data (keys, raft state, metadata) is accessible and version checks pass + +7. GIVEN operations on different column families WHEN concurrent reads/writes occur THEN data isolation is maintained (no cross-contamination between CFs) + +## Business Rules + +- System Raft log CF: Store system group Raft entries, compact after snapshot (~10MB typical size) +- System Raft state CF: Single-key storage for hard state (term, vote, commit), MUST fsync before responding to RPCs +- System data CF: Store cluster metadata (ClusterMembership, ShardMap), bounded ~100KB size +- Data Raft log CF: Store data shard log entries, snapshot every 10,000 entries or 100MB, ~100MB typical compacted size +- Data Raft state CF: Single-key storage for data shard hard state, MUST fsync before responding to RPCs +- Data KV CF: Store user key-value data wrapped in StoredValue, unbounded size, optimize for point lookups +- All write batches across column families MUST be atomic +- All persisted structures MUST include version field for schema evolution +- Key size limit: 256 bytes maximum (enforced by validation layer above storage) +- Value size limit: 65,536 bytes maximum (enforced by validation layer above storage) +- Raft log memory limit: 512MB per Raft group before forced compaction +- Storage layer MUST NOT 
understand Raft semantics - only stores bytes as directed + +## Scope + +### Included + +- RocksDB initialization with 6 column families and optimized configuration (Lz4 compression, 64MB buffers, prefix bloom filters) +- CRUD operations per column family (get, put, delete, exists) +- Atomic batch write operations across multiple column families +- Raft log operations: append entry, get range of entries, truncate before index +- Snapshot creation using RocksDB checkpoint (hard links, atomic) +- Snapshot restoration from checkpoint directory +- Configuration management: load/store NodeConfig, ClusterConfig, RaftConfig +- Error handling with rich context propagation (thiserror) +- Iterator support for range queries within column families +- Storage metrics tracking (db_size_bytes, num_keys, snapshot_duration) + +### Excluded + +- TTL expiration logic (Phase 2 - handled by higher layer) +- Distributed locking implementation (Phase 2 - separate feature) +- Metrics/observability export (Phase 4 - OpenTelemetry integration) +- Multi-shard column family management (Phase 2 - dynamic shard creation) +- Online schema migration tools (Phase 3 - separate migration system) +- RocksDB tuning dashboard (Phase 4 - operational tooling) +- Automatic compaction scheduling (use RocksDB defaults for Phase 1) + +## Dependencies + +### Depends On +- common crate - shared types (Error, Result, configuration structs) +- rocksdb crate - underlying storage engine (v0.22+) +- prost crate - protobuf serialization for all storage operations +- serde crate - serialization trait implementations +- thiserror crate - error type definitions +- openraft migration (BLOCKING) - must complete OpenRaft Phase 1-2 before RocksDB storage implementation +- async-trait crate - for async trait implementations + +### Used By +- raft crate - provides OpenRaftMemStorage wrapper that implements openraft storage traits +- kv crate - indirectly via raft crate for persisting key-value operations +- seshat binary - 
orchestrates initialization and lifecycle + +### Integration Points +- openraft storage traits - storage layer must provide: RaftLogReader, RaftSnapshotBuilder, RaftStorage (openraft version) +- common::types - all data structures defined in data-structures.md +- config loading - NodeConfig specifies data_dir path for RocksDB + +## Technical Details + +### Column Families + +1. system_raft_log + - Purpose: System group Raft log entries + - Key Format: log:{index} + - Value Type: LogEntry (protobuf via prost) + - Size: ~10MB compacted + - Compaction: Truncate after snapshot + +2. system_raft_state + - Purpose: System group hard state + - Key Format: state (single key) + - Value Type: RaftHardState (protobuf via prost) + - Size: <1KB + - Durability: fsync required before RPC responses + +3. system_data + - Purpose: Cluster metadata + - Key Format: membership, shardmap + - Value Type: ClusterMembership, ShardMap (protobuf via prost) + - Size: ~100KB bounded + - Compaction: Automatic by RocksDB + +4. data_raft_log + - Purpose: Data shard Raft log entries + - Key Format: log:{index} + - Value Type: LogEntry (protobuf via prost) + - Size: ~100MB compacted + - Compaction: Snapshot every 10,000 entries or 100MB + +5. data_raft_state + - Purpose: Data shard hard state + - Key Format: state (single key) + - Value Type: RaftHardState (protobuf via prost) + - Size: <1KB + - Durability: fsync required before RPC responses + +6. 
data_kv + - Purpose: User key-value data + - Key Format: raw user key (arbitrary bytes) + - Value Type: StoredValue (protobuf via prost) + - Size: Unbounded (user data) + - Optimization: Prefix bloom filters for point lookups + +### Performance Requirements + +- Local Storage Ops (p99): <1ms (get, put, delete on single key) +- Batch Commit (p99): <5ms (atomic writes across CFs) +- Snapshot Creation: <10s for 100MB data +- Throughput Target: >5,000 ops/sec per node +- Concurrent Operations: Thread-safe for concurrent reads and writes + +## Alignment + +This feature aligns with Phase 1 MVP - Persistent storage foundation for single-shard cluster. Enables cluster recovery after restarts, provides durability for Raft consensus, and stores user key-value data. Critical blocker for 3-node cluster stability testing. + +## Estimated Effort + +**14-17 hours** total implementation time: +- +2 hours for openraft trait integration complexity +- Async trait implementation overhead +- Testing with openraft types \ No newline at end of file diff --git a/docs/standards/tech.md b/docs/standards/tech.md index 13b7525..736f95a 100644 --- a/docs/standards/tech.md +++ b/docs/standards/tech.md @@ -8,7 +8,7 @@ - **Consensus**: raft-rs 0.7+ - **Storage**: RocksDB 0.22+ (via rocksdb crate) - **RPC Framework**: gRPC (tonic 0.11+ / prost 0.12+) -- **Serialization**: Protobuf (for internal RPC), bincode (for storage) +- **Serialization**: Protobuf (prost 0.12+) for all serialization (internal RPC and storage) - **Error Handling**: thiserror (libraries), anyhow (binary) - **Logging**: tracing + tracing-subscriber - **Testing**: proptest (property tests), tokio-test @@ -188,15 +188,16 @@ Each node runs a **single RocksDB instance** with multiple **column families** f **Example**: ```rust -// Write log entry +// Write log entry (in raft crate, not storage crate) let entry = VersionedLogEntry { version: 1, term: 5, index: 142, entry_type: EntryType::Normal, - data: 
bincode::serialize(&membership_change)?, + data: membership_change.encode_to_vec(), }; -storage.put("system_raft_log", b"log:142", bincode::serialize(&entry)?)?; +let serialized = entry.encode_to_vec(); +storage.put("system_raft_log", b"log:142", &serialized)?; ``` #### 2. `system_raft_state` @@ -214,14 +215,15 @@ storage.put("system_raft_log", b"log:142", bincode::serialize(&entry)?)?; **Example**: ```rust -// Update hard state +// Update hard state (in raft crate, not storage crate) let hard_state = RaftHardState { version: 1, term: 5, vote: Some(1), // Voted for node 1 commit: 142, }; -storage.put("system_raft_state", b"state", bincode::serialize(&hard_state)?)?; +let serialized = hard_state.encode_to_vec(); +storage.put("system_raft_state", b"state", &serialized)?; storage.sync()?; // CRITICAL: fsync before responding ``` @@ -230,7 +232,7 @@ storage.sync()?; // CRITICAL: fsync before responding **Key Format**: `membership` and `shardmap` -**Value Format**: Bincode-serialized `ClusterMembership` / `ShardMap` +**Value Format**: Protobuf-serialized `ClusterMembership` / `ShardMap` **Update Frequency**: On membership changes, shard rebalancing @@ -238,7 +240,7 @@ storage.sync()?; // CRITICAL: fsync before responding **Example**: ```rust -// Store cluster membership +// Store cluster membership (in raft crate, not storage crate) let membership = ClusterMembership { version: 1, members: HashMap::from([ @@ -248,7 +250,8 @@ let membership = ClusterMembership { ]), membership_version: 1, }; -storage.put("system_data", b"membership", bincode::serialize(&membership)?)?; +let serialized = membership.encode_to_vec(); +storage.put("system_data", b"membership", &serialized)?; ``` #### 4. 
`data_raft_log` @@ -256,7 +259,7 @@ storage.put("system_data", b"membership", bincode::serialize(&membership)?)?; **Key Format**: `log:{index}` -**Value Format**: Bincode-serialized `VersionedLogEntry` +**Value Format**: Protobuf-serialized `VersionedLogEntry` **Compaction**: Snapshot every 10,000 entries or 100MB @@ -267,7 +270,7 @@ storage.put("system_data", b"membership", bincode::serialize(&membership)?)?; **Key Format**: `state` -**Value Format**: Bincode-serialized `RaftHardState` +**Value Format**: Protobuf-serialized `RaftHardState` **Durability**: fsync required @@ -276,7 +279,7 @@ storage.put("system_data", b"membership", bincode::serialize(&membership)?)?; **Key Format**: Raw user key (arbitrary bytes) -**Value Format**: Bincode-serialized `StoredValue` +**Value Format**: Protobuf-serialized `StoredValue` **Size**: Unbounded (user data) @@ -284,18 +287,19 @@ storage.put("system_data", b"membership", bincode::serialize(&membership)?)?; **Example**: ```rust -// SET foo bar +// SET foo bar (in state machine, not storage crate) let value = StoredValue { version: 1, data: b"bar".to_vec(), created_at: current_timestamp_ms(), expires_at: None, }; -storage.put("data_kv", b"foo", bincode::serialize(&value)?)?; +let serialized = value.encode_to_vec(); +storage.put("data_kv", b"foo", &serialized)?; -// GET foo +// GET foo (in state machine, not storage crate) let bytes = storage.get("data_kv", b"foo")?; -let stored: StoredValue = bincode::deserialize(&bytes)?; +let stored: StoredValue = StoredValue::decode(&bytes[..])?; // Returns: b"bar" ```