diff --git a/betree/pmem-hashmap/src/allocator.rs b/betree/pmem-hashmap/src/allocator.rs index e6d8f56c..6c50977f 100644 --- a/betree/pmem-hashmap/src/allocator.rs +++ b/betree/pmem-hashmap/src/allocator.rs @@ -217,9 +217,17 @@ impl Pal { unsafe { pmemobj_close(self.pool.as_ptr()) }; } + pub fn allocate_variable(&self, v: T) -> Result, PalError> { + let mut ptr = self.allocate(std::mem::size_of_val(&v))?; + assert!(ptr.size < 8192); + ptr.init(&v, std::mem::size_of_val(&v)); + Ok(ptr) + } + /// Allocate an area of size in the persistent memory. Allocations are /// always guaranteed to be cache line aligned for Optane PMem (64 bytes). pub fn allocate(&self, size: usize) -> Result, PalError> { + assert!(size < 8192); let mut oid = std::mem::MaybeUninit::::uninit(); if unsafe { haura_alloc( diff --git a/betree/src/replication/mod.rs b/betree/src/replication/mod.rs index 2eaba821..3e84dfbc 100644 --- a/betree/src/replication/mod.rs +++ b/betree/src/replication/mod.rs @@ -42,6 +42,7 @@ use zstd_safe::WriteBuf; mod lru; mod lru_worker; +mod shift_array; mod tree; use lru::Plru; use serde::{Deserialize, Serialize}; diff --git a/betree/src/replication/shift_array.rs b/betree/src/replication/shift_array.rs new file mode 100644 index 00000000..31dc9cf2 --- /dev/null +++ b/betree/src/replication/shift_array.rs @@ -0,0 +1,105 @@ +#[derive(Debug)] +pub struct ShiftArray { + count: usize, + arr: [Option; A], +} + +impl ShiftArray { + /// In an ordered array find the index of the next largest element. + pub fn find(&self, v: &T) -> Option { + for idx in 0..self.count { + if self.arr[idx].as_ref().unwrap() >= v { + return Some(idx); + } + } + Some(self.count) + } +} + +impl ShiftArray { + pub fn new() -> Self { + Self { + arr: [0u8; A].map(|_| None), + count: 0, + } + } + + pub fn iter(&self) -> impl Iterator { + self.arr.iter().filter_map(|e| e.as_ref()) + } + + pub fn split_after(&mut self, idx: usize) -> ShiftArray { + assert!(idx < self.count); + let mut other = Self::new(); + for cur in (idx + 1)..self.count { + other.push_back(self.arr[cur].take().unwrap()); + } + self.count = idx + 1; + other + } + + pub fn push_back(&mut self, val: T) { + // Full + assert!(self.count < A); + self.arr[self.count] = Some(val); + self.count += 1; + } + + pub fn push_front(&mut self, val: T) { + self.insert(0, val) + } + + pub fn get(&self, idx: usize) -> Option<&T> { + self.arr[idx].as_ref() + } + + pub fn get_mut(&mut self, idx: usize) -> Option<&mut T> { + self.arr[idx].as_mut() + } + + pub fn pop_back(&mut self) -> Option { + self.remove(self.count - 1) + } + + pub fn pop_front(&mut self) -> Option { + self.remove(0) + } + + pub fn last(&self) -> Option<&T> { + self.arr.get(self.count.saturating_sub(1)).unwrap().as_ref() + } + + pub fn first(&self) -> Option<&T> { + self.arr[0].as_ref() + } + + pub fn last_mut(&mut self) -> Option<&mut T> { + self.arr + .get_mut(self.count.saturating_sub(1)) + .unwrap() + .as_mut() + } + + pub fn insert(&mut self, idx: usize, val: T) { + assert!(self.count < A); + let mut tmp = Some(val); + for cur in idx..A { + std::mem::swap(&mut tmp, &mut self.arr[cur]) + } + self.count += 1; + } + + pub fn remove(&mut self, idx: usize) -> Option { + let val = self.arr[idx].take(); + // Skip last entry + for cur in idx..A - 1 { + self.arr[cur] = self.arr[cur + 1].take() + } + self.count -= 1; + val + } + + pub fn size(&self) -> usize { + self.count + } +} diff --git a/betree/src/replication/tree.rs b/betree/src/replication/tree.rs index 6300fc4a..2051667a 100644 --- a/betree/src/replication/tree.rs +++ b/betree/src/replication/tree.rs @@ -1,76 +1,252 @@ -use owning_ref::OwningRef; use parking_lot::{RwLock, RwLockReadGuard}; use pmem_hashmap::allocator::{Pal, PalError, PalPtr}; -/// A basic BTree implementation using PalPtr. -/// -/// +use super::shift_array::ShiftArray; // Order of a BTree -const M: usize = 5; +const B: usize = 16; +const NUM_KEYS: usize = B - 1; +const MIN: usize = B / 2 + B % 2; -struct Node { - values: [Option<(K, V)>; M], - // Fine granular locking, could be a way to do some more efficient inserts *while* reading from the tree. - child: [Child>; M + 1], +pub struct Node { + pivots: ShiftArray, + children: ShiftArray, B>, } -enum Child { - Leaf, - Node(PalPtr), +impl std::fmt::Debug for Node { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Node") + .field("pivots", &self.pivots) + .field("count", &self.count()) + .field( + "children", + &self + .children + .iter() + .filter_map(|e| match e { + Link::Entry(val) => None, + Link::Child(n) => Some(n.load()), + }) + .collect::>>(), + ) + .finish() + } +} + +#[derive(Debug)] +pub enum Link { + Entry(V), + Child(PalPtr>), +} + +impl Link { + fn assert_child(&mut self) -> &mut PalPtr> { + match self { + Link::Entry(_) => panic!("Link was not a child."), + Link::Child(c) => c, + } + } } +#[derive(Debug)] pub struct PBTree { root: PalPtr>, } -impl PBTree { - pub fn new(pal: Pal) -> Result { +impl PBTree { + pub fn new(pal: &Pal) -> Result { let mut root = pal.allocate(std::mem::size_of::>())?; root.init(&Node::new(), std::mem::size_of::>()); Ok(Self { root }) } - pub fn get(&self, key: &K) -> Option<&(K, V)> { + pub fn get(&self, key: &K) -> Option<&V> { let mut node = &self.root; loop { + dbg!(node); match node.load().walk(key) { NodeWalk::Miss => return None, - NodeWalk::Found(idx) => return node.load().get(idx).as_ref(), - NodeWalk::Child(idx) => match node.load().child.get(idx).unwrap() { - Child::Node(ref n) => node = n, - Child::Leaf => unreachable!(), - }, + NodeWalk::Found(idx) => return node.load().get(idx), + NodeWalk::Child(idx) => { + match node.load().children.get(idx).unwrap() { + Link::Entry(_) => unreachable!(), + Link::Child(ref n) => node = n, + // Child::Node(ref n) => node = n, + // Child::Leaf => unreachable!(), + } + } } } } - pub fn insert(&mut self, key: K, val: V) { + pub fn remove(&mut self, key: &K) { let mut node = &mut self.root; let mut path = vec![]; loop { path.push(node.clone()); match node.load().walk(&key) { - NodeWalk::Miss => { - if let Some((left, median, right)) = node.load_mut().insert(key, val) { - // Deal with adjacent nodes - todo!(); - for node in path.into_iter().rev() {} + NodeWalk::Miss => break, + NodeWalk::Found(_) => { + // Found leaf containing node, escalate removal upwards until no more changes required + // + // Each deletion can have multiple cases: + // + // - Leafs are fine (OK) + // - Leafs are underfilled: + // - Move elements from neighboring leafs (left or right) and change pivot elements accordingly + // - All other leafs are of size MIN, merge children. + // - Parents contain key as index: Delete and replace with highest element from left child + + if node.load_mut().remove(key) { + // Treat small leaf + // 1. Check if left or right child has enough elements + if path.is_empty() { + // emptied root node + return; + } + let mut parent = path.last_mut().unwrap().load_mut(); + let idx = match parent.walk(key) { + NodeWalk::Child(idx) => idx, + _ => unreachable!(), + }; + + if idx > 0 + && parent + .children + .get_mut(idx - 1) + .unwrap() + .assert_child() + .load() + .size() + > MIN + { + // Pick from left child + let left = parent + .children + .get_mut(idx - 1) + .unwrap() + .assert_child() + .load_mut(); + + let new_child = left.children.pop_back().unwrap(); + let new_pivot = left.pivots.pop_back().unwrap(); + node.load_mut().children.push_front(new_child); + node.load_mut().pivots.push_front(new_pivot); + *parent.pivots.get_mut(idx).unwrap() = left.pivot_high(); + } + + if idx + 1 < B + && parent + .children + .get_mut(idx + 1) + .unwrap() + .assert_child() + .load() + .size() + > MIN + { + // Pick from right child + let right = parent + .children + .get_mut(idx + 1) + .unwrap() + .assert_child() + .load_mut(); + + let new_child = right.children.pop_front().unwrap(); + let new_pivot = right.pivots.pop_front().unwrap(); + node.load_mut().children.push_back(new_child); + node.load_mut().pivots.push_back(new_pivot); + *parent.pivots.get_mut(idx).unwrap() = node.load().pivot_high(); + } + + todo!("Merge children") + } else { + // Remove from parents if they contain the key + for mut n in path.into_iter() { + assert!(!n.load_mut().remove(key)) + } } - return; + break; + } + NodeWalk::Child(idx) => match node.load_mut().children.get_mut(idx).unwrap() { + Link::Entry(_) => unreachable!(), + Link::Child(ref mut n) => node = n, + }, + } + } + } + + pub fn insert(&mut self, key: K, val: V, pal: &Pal) { + if let Some((k, v, n)) = self.insert_from(key, val, pal, self.root.clone()) { + assert!(self.insert_from(k, v, pal, n).is_none()); + } + } + + fn insert_from( + &mut self, + key: K, + val: V, + pal: &Pal, + mut from: PalPtr>, + ) -> Option<(K, V, PalPtr>)> { + let mut node = &mut from; + let mut path = vec![]; + loop { + path.push(node.clone()); + match node.load().walk(&key) { + NodeWalk::Miss => { + return if let Some((median, new_node, value)) = + node.load_mut().insert(key.clone(), val) + { + // Insert facilitated a split, insert new node into parent + let mut pair = Some((median, new_node)).map(|(key, new_node)| { + // Allocate the new node + (key, pal.allocate_variable(new_node).unwrap()) + }); + for mut cur_node in path.iter_mut().rev().skip(1) { + if let Some((key, new_node)) = pair { + pair = cur_node.load_mut().escalate(key, new_node).map( + |(key, new_node)| { + // Allocate the new node + (key, pal.allocate_variable(new_node).unwrap()) + }, + ); + node = cur_node; + } else { + break; + } + } + + // Create a new root node + if let Some((key, new_node)) = pair { + println!("Creating new root"); + let mut new_root = Node::new(); + new_root.pivots.push_front(key); + // new_root.pivots.push_back(new_node.load().pivot_high()); + new_root.children.push_front(Link::Child(self.root)); + new_root.children.push_back(Link::Child(new_node)); + self.root = pal.allocate_variable(new_root).unwrap(); + node = &mut self.root; + } + Some((key, value, node.clone())) + } else { + None + }; } NodeWalk::Found(idx) => { node.load_mut() - .values + .children .get_mut(idx) - .unwrap() - .as_mut() - .map(|entry| entry.1 = val); - return; + .map(|entry| match entry { + Link::Entry(ref mut v) => *v = val, + Link::Child(_) => unreachable!(), + }); + return None; } - NodeWalk::Child(idx) => match node.load_mut().child.get_mut(idx).unwrap() { - Child::Node(ref mut n) => node = n, - Child::Leaf => unreachable!(), + NodeWalk::Child(idx) => match node.load_mut().children.get_mut(idx).unwrap() { + Link::Entry(_) => unreachable!(), + Link::Child(ref mut n) => node = n, }, } } @@ -83,102 +259,265 @@ enum NodeWalk { Child(usize), } -impl Node { +impl Node { pub fn new() -> Self { Node { - values: [0; M].map(|_| None), - child: [0; M + 1].map(|_| Child::Leaf), + pivots: ShiftArray::new(), + children: ShiftArray::new(), } } - pub fn walk(&self, key: &K) -> NodeWalk { - for pos in 0..M { - if let Some(ref pair) = self.values[pos] { - if pair.0 == *key { - return NodeWalk::Found(pos); - } - if pair.0 < *key { - return match self.child[pos] { - Child::Leaf => NodeWalk::Miss, - Child::Node(_) => NodeWalk::Child(pos), - }; - } - } else { - break; + fn walk(&self, key: &K) -> NodeWalk { + let mut idx = 0; + let pos = loop { + if idx >= B - 1 { + break B - 1; + } + if self.pivots.get(idx).is_none() { + break idx; + } + if self.pivots.get(idx).unwrap() == key { + // Inspect Child + return match self.children.get(idx).as_ref().unwrap() { + Link::Entry(_) => NodeWalk::Found(idx), + Link::Child(_) => NodeWalk::Child(idx), + }; + } + if self.pivots.get(idx).unwrap() > key { + break idx; } + idx += 1; + }; + + match self.children.get(pos) { + Some(ref ptr) => match ptr { + Link::Entry(_) => NodeWalk::Miss, + Link::Child(ref child) => NodeWalk::Child(idx), + }, + None => NodeWalk::Miss, } - match self.child[M] { - Child::Leaf => NodeWalk::Miss, - Child::Node(_) => NodeWalk::Child(M), + } + + pub fn insert(&mut self, key: K, value: V) -> Option<(K, Node, V)> { + if self.pivots.size() < NUM_KEYS { + self.splice(key, value); + None + } else { + // Split the node and escalate + let (new_key, mut right) = self.split(); + // assert!(right.insert(key, value).is_none()); + Some((new_key, right, value)) } } - pub fn insert(&mut self, key: K, value: V) -> Option<(Node, (K, V), Node)> { - if self.values.last().is_some() { - // TODO: Split the node and insert value - let mut res = self.split_at(M / 2); - if key <= res.1 .0 { - assert!(res.0.insert(key, value).is_none()); - } else { - assert!(res.2.insert(key, value).is_none()); + pub fn split(&mut self) -> (K, Node) { + assert!(self.pivots.size() == NUM_KEYS); + assert!(self.children.size() >= NUM_KEYS); + const idx: usize = NUM_KEYS / 2 + NUM_KEYS % 2; + + let right_pivots = self.pivots.split_after(idx); + let right_children = self.children.split_after(idx); + + let right = Self { + pivots: right_pivots, + children: right_children, + }; + assert!(self.pivot_high() < right.pivot_low()); + + (self.pivot_high(), right) + } + + pub fn escalate(&mut self, key: K, right: PalPtr>) -> Option<(K, Node)> { + dbg!(&self.pivots.size()); + dbg!(&self.children.size()); + if self.pivots.size() <= NUM_KEYS && self.children.size() < B { + println!("can buffer node"); + // Shift pivot and child + let mut idx = self.pivots.find(&key).unwrap(); + if self.pivots.size() == NUM_KEYS { + let _ = self.pivots.pop_back(); } - Some(res) + // Children space is available, shift + self.pivots.insert(idx, key); + self.children.insert(idx + 1, Link::Child(right)); + None } else { - // Insert entry into remaining space - for entry in self.values.iter_mut() { - if entry.is_none() { - *entry = Some((key, value)); - break; - } + let (upper, mut new_right) = self.split(); + assert!(new_right.escalate(key, right).is_none()); + Some((upper, new_right)) + } + } + + pub fn get(&self, idx: usize) -> Option<&V> { + match self.children.get(idx).as_ref().unwrap() { + Link::Entry(ref v) => Some(v), + Link::Child(_) => None, + } + } + + pub fn pivot_high(&self) -> K { + self.pivots.last().unwrap().clone() + } + + pub fn pivot_low(&self) -> K { + self.pivots.first().unwrap().clone() + } + + /// Returns the number of valid pivot entries. If this number is larger than + /// [MIN], entries may be revoked without tree restructure. + pub fn size(&self) -> usize { + self.pivots.size() + } + + /// Returns true if merge is needed. + pub fn remove(&mut self, key: &K) -> bool { + let idx = self.pivots.find(key).unwrap(); + let remove_pivot; + match self.children.get_mut(idx).unwrap() { + Link::Entry(_) => { + self.pivots.remove(idx); + remove_pivot = true; + } + Link::Child(c) => { + *self.pivots.get_mut(idx).unwrap() = c.load().pivot_high(); + remove_pivot = false; } - None } + if remove_pivot { + self.pivots.remove(idx); + } + self.pivots.size() < MIN } - pub fn get(&self, idx: usize) -> &Option<(K, V)> { - self.values.get(idx).unwrap() + pub fn splice(&mut self, mut key: K, mut val: V) { + assert!(self.pivots.size() < NUM_KEYS); + let idx = self.pivots.find(&key).unwrap_or(0); + self.pivots.insert(idx, key); + // This may not work + self.children.insert(idx, Link::Entry(val)); } - pub fn remove(&mut self, key: K) -> Option<(K, V)> { - todo!() + + pub fn count(&self) -> usize { + self.children + .iter() + .map(|e| match e { + Link::Entry(e) => 1, + Link::Child(c) => c.load().count(), + }) + .sum() } +} - pub fn splice_at(&mut self, kv: (K, V), idx: usize) { - assert!(idx > 0); - assert!(idx < M + 1); - assert!(self.values[M - 1].is_none()); - for cur in (idx..M).rev() { - self.values[cur] = self.values[cur - 1].take(); +#[cfg(test)] +mod tests { + use super::*; + use pmem_hashmap::allocator::Pal; + use std::{collections::HashSet, path::PathBuf, process::Command}; + use tempfile::Builder; + + struct TestFile(PathBuf); + + impl TestFile { + pub fn new() -> Self { + TestFile( + Builder::new() + .tempfile() + .expect("Could not get tmpfile") + .path() + .to_path_buf(), + ) } + + pub fn path(&self) -> &PathBuf { + &self.0 + } + } + impl Drop for TestFile { + fn drop(&mut self) { + if !Command::new("rm") + .arg(self.0.to_str().expect("Could not pass tmpfile")) + .output() + .expect("Could not delete") + .status + .success() + { + eprintln!("Could not delete tmpfile"); + } + } + } + + #[test] + fn new() { + let file = TestFile::new(); + let mut pal = Pal::create(file.path(), 32 * 1024 * 1024, 0o666).unwrap(); + let tree: PBTree = PBTree::new(&pal).unwrap(); + } + + #[test] + fn basic_insert() { + let file = TestFile::new(); + let mut pal = Pal::create(file.path(), 32 * 1024 * 1024, 0o666).unwrap(); + let mut tree: PBTree = PBTree::new(&pal).unwrap(); + tree.insert(1, 1, &pal); } - // Move left and right section of keys down to the - pub fn split_at(&mut self, idx: usize) -> (Node, (K, V), Node) { - let mut left = Self::new(); - let mut right = Self::new(); - let mut cur = 0; + #[test] + fn basic_get() { + let file = TestFile::new(); + let mut pal = Pal::create(file.path(), 32 * 1024 * 1024, 0o666).unwrap(); + let mut tree: PBTree = PBTree::new(&pal).unwrap(); + assert!(tree.get(&1).is_none()); + tree.insert(1, 1, &pal); + assert_eq!(tree.get(&1), Some(&1)); + } + + #[test] + fn seq_insert() { + let file = TestFile::new(); + let mut pal = Pal::create(file.path(), 128 * 1024 * 1024, 0o666).unwrap(); + let mut tree: PBTree = PBTree::new(&pal).unwrap(); - for (pos, c) in left.values.iter_mut().zip(left.child.iter_mut()) { - if cur > idx { - break; + for id in 0..=255 { + println!("{id}"); + tree.insert(id, id, &pal); + for n in 0..=id { + println!("id: {n}"); + assert_eq!(tree.get(&n), Some(&n)); } - *pos = self.values[cur].take(); - *c = std::mem::replace(&mut self.child[cur], Child::Leaf); - cur += 1; } - let median = self.values[cur].take().unwrap(); - cur += 1; + for id in 0..=255 { + assert_eq!(tree.get(&id), Some(&id)); + } + dbg!(tree.root.load()); + } + + #[test] + fn rnd_insert() { + let file = TestFile::new(); + let mut pal = Pal::create(file.path(), 128 * 1024 * 1024, 0o666).unwrap(); + let mut tree = PBTree::new(&pal).unwrap(); + + use rand::Rng; + let mut rng = rand::thread_rng(); + let vals = [0u8; 256].map(|_| rng.gen::()); + let set = HashSet::from(vals); - for (pos, c) in right.values.iter_mut().zip(right.child.iter_mut()) { - if cur == M { - break; + let mut inserted = vec![]; + for id in set.iter() { + dbg!(tree.root.load().count()); + tree.insert(id, id, &pal); + dbg!(tree.root.load().count()); + inserted.push(id); + for x in inserted.iter() { + if tree.get(x) != Some(x) { + assert_eq!(x, &&0); + } } - *pos = self.values[cur].take(); - *c = std::mem::replace(&mut self.child[cur], Child::Leaf); - cur += 1; } - right.child[cur - idx + 1] = std::mem::replace(&mut self.child[cur], Child::Leaf); - (left, median, right) + for id in set.iter() { + assert_eq!(tree.get(&id), Some(&id)); + } } }