diff --git a/src/btree.rs b/src/btree.rs new file mode 100644 index 00000000..559e3566 --- /dev/null +++ b/src/btree.rs @@ -0,0 +1,241 @@ +use crate::btree::Node::{Internal, Leaf}; +use std::cell::Cell; +use std::cmp::Ordering; +use std::convert::TryInto; + +fn write_vec(value: &[u8], output: &mut [u8], mut index: usize) -> usize { + output[index..(index + 8)].copy_from_slice(&(value.len() as u64).to_be_bytes()); + index += 8; + output[index..(index + value.len())].copy_from_slice(value); + index += value.len(); + index +} + +// Returns the (offset, len) of the value for the queried key, if present +pub(in crate) fn lookup_in_raw( + tree: &[u8], + query: &[u8], + mut index: usize, +) -> Option<(usize, usize)> { + match tree[index] { + 1 => { + index += 1; + let key_len = u64::from_be_bytes(tree[index..(index + 8)].try_into().unwrap()) as usize; + index += 8; + match query.cmp(&tree[index..(index + key_len)]) { + Ordering::Less => None, + Ordering::Equal => { + index += key_len; + let value_len = + u64::from_be_bytes(tree[index..(index + 8)].try_into().unwrap()) as usize; + index += 8; + Some((index, value_len)) + } + Ordering::Greater => { + index += key_len; + let value_len = + u64::from_be_bytes(tree[index..(index + 8)].try_into().unwrap()) as usize; + index += 8 + value_len; + let second_key_len = + u64::from_be_bytes(tree[index..(index + 8)].try_into().unwrap()) as usize; + index += 8; + if query == &tree[index..(index + second_key_len)] { + index += second_key_len; + let value_len = + u64::from_be_bytes(tree[index..(index + 8)].try_into().unwrap()) + as usize; + index += 8; + Some((index, value_len)) + } else { + None + } + } + } + } + 2 => { + index += 1; + let key_len = u64::from_be_bytes(tree[index..(index + 8)].try_into().unwrap()) as usize; + index += 8; + index += key_len; + if query <= &tree[(index - key_len)..index] { + // Skip left-node length + index += 8; + } else { + // Skip left-node + let left_node_len = + u64::from_be_bytes(tree[index..(index + 8)].try_into().unwrap()) as usize; + index += 8 + left_node_len; + } + lookup_in_raw(tree, query, index) + } + _ => unreachable!(), + } +} + +#[derive(Eq, PartialEq, Debug)] +pub(in crate) enum Node { + Leaf((Vec, Vec), Option<(Vec, Vec)>), + Internal(Box, Vec, Box), +} + +impl Node { + pub(in crate) fn recursive_size(&self) -> usize { + match self { + Node::Leaf(left_val, right_val) => { + let mut size = 1; // 1 byte for node type + size += 8 + left_val.0.len() + 8 + left_val.1.len(); + if let Some(right) = right_val { + size += 8 + right.0.len() + 8 + right.1.len(); + } else { + // empty right val stored as a single 0 length key + size += 8; + } + size + } + Node::Internal(left, key, right) => { + let mut size = 1; // 1 byte for node type + size += 8 + key.len(); + // Reserve space to prefix the left node with its size, so that we can skip it + // efficiently + size += 8 + left.recursive_size(); + size += right.recursive_size(); + size + } + } + } + + // Returns the index following the last written + pub(in crate) fn to_bytes(&self, output: &mut [u8], mut index: usize) -> usize { + match self { + Node::Leaf(left_val, right_val) => { + output[index] = 1; + index += 1; + index = write_vec(&left_val.0, output, index); + index = write_vec(&left_val.1, output, index); + if let Some(right) = right_val { + index = write_vec(&right.0, output, index); + index = write_vec(&right.1, output, index); + } else { + // empty right val stored as a single 0 length key + index = write_vec(&[], output, index); + } + } + Node::Internal(left, key, right) => { + output[index] = 2; + index += 1; + index = write_vec(key, output, index); + let left_size = left.recursive_size(); + output[index..(index + 8)].copy_from_slice(&(left_size as u64).to_be_bytes()); + index += 8; + index = left.to_bytes(output, index); + index = right.to_bytes(output, index); + } + } + + index + } + + fn get_max_key(&self) -> Vec { + match self { + Node::Leaf((left_key, _), right_val) => { + if let Some((right_key, _)) = right_val { + right_key.to_vec() + } else { + left_key.to_vec() + } + } + Node::Internal(_left, _key, right) => right.get_max_key(), + } + } +} + +pub(in crate) struct BtreeBuilder { + pairs: Vec<(Vec, Vec)>, +} + +impl BtreeBuilder { + pub(in crate) fn new() -> BtreeBuilder { + BtreeBuilder { pairs: vec![] } + } + + pub(in crate) fn add(&mut self, key: &[u8], value: &[u8]) { + self.pairs.push((key.to_vec(), value.to_vec())); + } + + pub(in crate) fn build(mut self) -> Node { + assert!(!self.pairs.is_empty()); + self.pairs.sort(); + let mut leaves = vec![]; + for group in self.pairs.chunks(2) { + let leaf = if group.len() == 1 { + Leaf((group[0].0.to_vec(), group[0].1.to_vec()), None) + } else { + assert_eq!(group.len(), 2); + if group[0].0 == group[1].0 { + todo!("support overwriting existing keys"); + } + Leaf( + (group[0].0.to_vec(), group[0].1.to_vec()), + Some((group[1].0.to_vec(), group[1].1.to_vec())), + ) + }; + leaves.push(leaf); + } + + let mut bottom = leaves; + let maybe_previous_node: Cell> = Cell::new(None); + while bottom.len() > 1 { + let mut internals = vec![]; + for node in bottom.drain(..) { + if let Some(previous_node) = maybe_previous_node.take() { + let key = previous_node.get_max_key(); + let internal = Internal(Box::new(previous_node), key, Box::new(node)); + internals.push(internal) + } else { + maybe_previous_node.set(Some(node)); + } + } + if let Some(previous_node) = maybe_previous_node.take() { + internals.push(previous_node); + } + + bottom = internals + } + + bottom.pop().unwrap() + } +} + +#[cfg(test)] +mod test { + use crate::btree::Node::{Internal, Leaf}; + use crate::btree::{BtreeBuilder, Node}; + + fn gen_tree() -> Node { + let left = Leaf( + (b"hello".to_vec(), b"world".to_vec()), + Some((b"hello2".to_vec(), b"world2".to_vec())), + ); + let right = Leaf((b"hello3".to_vec(), b"world3".to_vec()), None); + Internal(Box::new(left), b"hello2".to_vec(), Box::new(right)) + } + + #[test] + fn serialize() { + let internal = gen_tree(); + let size = internal.recursive_size(); + let mut buffer = vec![0u8; size]; + internal.to_bytes(&mut buffer, 0); + } + + #[test] + fn builder() { + let expected = gen_tree(); + let mut builder = BtreeBuilder::new(); + builder.add(b"hello2", b"world2"); + builder.add(b"hello3", b"world3"); + builder.add(b"hello", b"world"); + + assert_eq!(expected, builder.build()); + } +} diff --git a/src/lib.rs b/src/lib.rs index bc9b2ea8..c6546f89 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +mod btree; mod db; mod error; #[cfg(feature = "python")] diff --git a/src/storage.rs b/src/storage.rs index fa481961..077301e7 100644 --- a/src/storage.rs +++ b/src/storage.rs @@ -1,3 +1,4 @@ +use crate::btree::{lookup_in_raw, BtreeBuilder}; use crate::Error; use memmap2::MmapMut; use std::cell::{Ref, RefCell}; @@ -5,8 +6,8 @@ use std::convert::TryInto; use std::ops::Deref; const MAGICNUMBER: [u8; 4] = [b'r', b'e', b'd', b'b']; -const ENTRIES_OFFSET: usize = MAGICNUMBER.len(); -const DATA_OFFSET: usize = ENTRIES_OFFSET + 8; +const DATA_LEN: usize = MAGICNUMBER.len(); +const DATA_OFFSET: usize = DATA_LEN + 8; pub(in crate) struct Storage { mmap: RefCell, @@ -24,7 +25,7 @@ impl Storage { if mmap[0..MAGICNUMBER.len()] == MAGICNUMBER { return Ok(()); } - mmap[ENTRIES_OFFSET..(ENTRIES_OFFSET + 8)].copy_from_slice(&0u64.to_be_bytes()); + mmap[DATA_LEN..(DATA_LEN + 8)].copy_from_slice(&0u64.to_be_bytes()); mmap.flush()?; // Write the magic number only after the data structure is initialized and written to disk // to ensure that it's crash safe @@ -36,28 +37,12 @@ impl Storage { pub(in crate) fn append(&self, key: &[u8], value: &[u8]) -> Result<(), Error> { let mut mmap = self.mmap.borrow_mut(); - let mut entries = u64::from_be_bytes( - mmap[ENTRIES_OFFSET..(ENTRIES_OFFSET + 8)] - .try_into() - .unwrap(), - ) as usize; + let mut data_len = + u64::from_be_bytes(mmap[DATA_LEN..(DATA_LEN + 8)].try_into().unwrap()) as usize; - let mut index = DATA_OFFSET; - let mut entry = 0; - while entry < entries { - let key_len = u64::from_be_bytes(mmap[index..(index + 8)].try_into().unwrap()) as usize; - index += 8; - if key == &mmap[index..(index + key_len)] { - todo!("support overwriting existing keys"); - } - index += key_len; - let value_len = - u64::from_be_bytes(mmap[index..(index + 8)].try_into().unwrap()) as usize; - index += 8 + value_len; - entry += 1; - } + let mut index = DATA_OFFSET + data_len; - // Does not exist, append the new key & value + // Append the new key & value mmap[index..(index + 8)].copy_from_slice(&(key.len() as u64).to_be_bytes()); index += 8; mmap[index..(index + key.len())].copy_from_slice(key); @@ -65,50 +50,60 @@ impl Storage { mmap[index..(index + 8)].copy_from_slice(&(value.len() as u64).to_be_bytes()); index += 8; mmap[index..(index + value.len())].copy_from_slice(value); + index += value.len(); - entries += 1; - mmap[ENTRIES_OFFSET..(ENTRIES_OFFSET + 8)].copy_from_slice(&entries.to_be_bytes()); - Ok(()) - } + data_len = index - DATA_OFFSET; - pub(in crate) fn fsync(&self) -> Result<(), Error> { - self.mmap.borrow().flush()?; + mmap[DATA_LEN..(DATA_LEN + 8)].copy_from_slice(&data_len.to_be_bytes()); Ok(()) } - pub(in crate) fn get(&self, key: &[u8]) -> Result, Error> { - let mmap = self.mmap.borrow(); + pub(in crate) fn fsync(&self) -> Result<(), Error> { + let mut builder = BtreeBuilder::new(); + let mut mmap = self.mmap.borrow_mut(); - let entries = u64::from_be_bytes( - mmap[ENTRIES_OFFSET..(ENTRIES_OFFSET + 8)] - .try_into() - .unwrap(), - ) as usize; + let data_len = + u64::from_be_bytes(mmap[DATA_LEN..(DATA_LEN + 8)].try_into().unwrap()) as usize; let mut index = DATA_OFFSET; - let mut entry = 0; - while entry < entries { + while index < (DATA_OFFSET + data_len) { let key_len = u64::from_be_bytes(mmap[index..(index + 8)].try_into().unwrap()) as usize; index += 8; - let value_len = u64::from_be_bytes( - mmap[(index + key_len)..(index + key_len + 8)] - .try_into() - .unwrap(), - ) as usize; - if key == &mmap[index..(index + key_len)] { - index += key_len + 8; - let guard = AccessGuard { - mmap_ref: mmap, - offset: index, - len: value_len, - }; - return Ok(Some(guard)); - } - index += key_len + 8 + value_len; - entry += 1; + let key = &mmap[index..(index + key_len)]; + index += key_len; + let value_len = + u64::from_be_bytes(mmap[index..(index + 8)].try_into().unwrap()) as usize; + index += 8; + let value = &mmap[index..(index + value_len)]; + index += value_len; + builder.add(key, value); } - Ok(None) + let node = builder.build(); + assert!(DATA_OFFSET + data_len + node.recursive_size() < mmap.len()); + node.to_bytes(&mut mmap[(DATA_OFFSET + data_len)..], 0); + + mmap.flush()?; + Ok(()) + } + + pub(in crate) fn get(&self, key: &[u8]) -> Result, Error> { + let mmap = self.mmap.borrow(); + + let data_len = + u64::from_be_bytes(mmap[DATA_LEN..(DATA_LEN + 8)].try_into().unwrap()) as usize; + + let index = DATA_OFFSET + data_len; + if let Some((offset, len)) = lookup_in_raw(&mmap, key, index) { + let guard = AccessGuard { + mmap_ref: mmap, + offset, + len, + }; + Ok(Some(guard)) + } else { + Ok(None) + } } }