From fe052537ed35f964c599e0fc309c5977109d9795 Mon Sep 17 00:00:00 2001 From: Tuomas Pirhonen Date: Thu, 1 Feb 2024 22:18:42 +0100 Subject: [PATCH] add batched write --- src/lib.rs | 88 +++++++++---------- src/nvme.rs | 234 ++++++++++++++++++++++++++++++++------------------ src/queues.rs | 32 +++++-- 3 files changed, 220 insertions(+), 134 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index fb3cd34..3b7272c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,25 +15,31 @@ use self::pci::*; use std::error::Error; use std::time::Instant; -use std::io::Read; -use std::fs::File; +// use std::io::Read; +// use std::fs::File; #[cfg(target_arch = "aarch64")] #[inline(always)] -pub(crate) unsafe fn pause() { - std::arch::aarch64::__yield(); +pub(crate) fn pause() { + unsafe { + std::arch::aarch64::__yield(); + } } #[cfg(target_arch = "x86")] #[inline(always)] -pub(crate) unsafe fn pause() { - std::arch::x86::_mm_pause(); +pub(crate) fn pause() { + unsafe { + std::arch::x86::_mm_pause(); + } } #[cfg(target_arch = "x86_64")] #[inline(always)] -pub(crate) unsafe fn pause() { - std::arch::x86_64::_mm_pause(); +pub(crate) fn pause() { + unsafe { + std::arch::x86_64::_mm_pause(); + } } pub fn init(pci_addr: &str) -> Result<(), Box> { @@ -66,45 +72,41 @@ pub fn init(pci_addr: &str) -> Result<(), Box> { // Testing stuff let n = 1000; - let blocks = 24; - let mut lba = 0; + let n2 = 100; + let blocks = 128; let mut read = std::time::Duration::new(0, 0); let mut write = std::time::Duration::new(0, 0); + let mut write_batched = std::time::Duration::new(0, 0); + let mut read_buf = vec![0; blocks * 512]; + + for _ in 0..n2 { + let mut lba = 0; + for _ in 0..n { + // write + let rand_block = &(0.. (512 * blocks)).map(|_| { rand::random::() }).collect::>()[..]; + let before = Instant::now(); + nvme.write_raw(rand_block, lba)?; + write += before.elapsed(); + + let before = Instant::now(); + nvme.batched_write(1, rand_block, lba, 16)?; + write_batched += before.elapsed(); + + // read + let before = Instant::now(); + nvme.read(1, &mut read_buf[..], lba)?; + read += before.elapsed(); + // assert_eq!(read_buf, rand_block); + + lba += blocks as u64; + // nvme.read(1, 4); + } + } + let n = n * n2; - let mut shakespeare = File::open("pg100.txt")?; - let mut buffer = Vec::new(); - shakespeare.read_to_end(&mut buffer)?; - let before = Instant::now(); - nvme.write_raw(&buffer[..], lba)?; - write += before.elapsed(); - - // for _ in 0..n { - // // read - // let before = Instant::now(); - - // // this guy doesn't work when writing more than 2 pages?? - // nvme.read(1, blocks, lba); - // read += before.elapsed(); - // // println!("{blocks} block read: {:?}", before.elapsed()); - // let rand_block = &(0.. (512 * blocks)).map(|_| { rand::random::() }).collect::>()[..]; - - // assert_eq!(rand_block.len(), 512 * blocks as usize); - - // // write - // let before = Instant::now(); - // nvme.write_raw(rand_block, lba)?; - // write += before.elapsed(); - // // println!("{blocks} block write: {:?}", before.elapsed()); - // let data = nvme.read_tmp(1, blocks, lba); - // assert_eq!(data, rand_block); - - // lba += blocks as u64; - // // nvme.read(1, 4); - // } - - // println!("{blocks} block read: {:?}", read / n); println!("{blocks} block write: {:?}", write / n); - + println!("{blocks} block batched write: {:?}", write_batched / n); + println!("{blocks} block read: {:?}", read / n); Ok(()) } diff --git a/src/nvme.rs b/src/nvme.rs index 317b0e9..2f4a4d4 100644 --- a/src/nvme.rs +++ b/src/nvme.rs @@ -1,6 +1,7 @@ use crate::cmd::NvmeCommand; use crate::memory::Dma; use crate::memory::HUGE_PAGE_SIZE; +// use crate::memory::HUGE_PAGE_SIZE; use crate::pci::pci_map_resource; use crate::queues::*; use crate::NvmeNamespace; @@ -105,7 +106,7 @@ pub struct NvmeDevice { sub_queues: Vec, comp_queues: Vec, buffer: Dma<[u8; 2048 * 1024]>, // 2MiB of buffer - prp_list: Dma<[u64; 512]>, // Address of PRP's, devices doesn't necessarily support 2MiB page sizes + prp_list: Dma<[u64; 512]>, // Address of PRP's, devices doesn't necessarily support 2MiB page sizes; 8 Bytes * 512 = 4096 namespaces: HashMap, } @@ -130,11 +131,10 @@ impl NvmeDevice { sub_queues: vec![], comp_queues: vec![], buffer: Dma::allocate(crate::memory::HUGE_PAGE_SIZE, true)?, - prp_list: Dma::allocate(64 * 512, true)?, + prp_list: Dma::allocate(8 * 512, true)?, namespaces: HashMap::new(), }; - // technically not correct for i in 1..512 { unsafe { (*dev.prp_list.virt)[i - 1] = (dev.buffer.phys + i * 4096) as u64; @@ -259,7 +259,12 @@ impl NvmeDevice { }); let status = comp.status >> 1; if status != 0 { - eprintln!("Status: 0x{:x}, Status Code 0x{:x}, Status Code Type: 0x{:x}", status, status & 0xFF, (status >> 8) & 0x7); + eprintln!( + "Status: 0x{:x}, Status Code 0x{:x}, Status Code Type: 0x{:x}", + status, + status & 0xFF, + (status >> 8) & 0x7 + ); println!("something went awry: 0x{:x}", status) } @@ -279,7 +284,12 @@ impl NvmeDevice { }); let status = comp.status >> 1; if status != 0 { - eprintln!("Status: 0x{:x}, Status Code 0x{:x}, Status Code Type: 0x{:x}", status, status & 0xFF, (status >> 8) & 0x7); + eprintln!( + "Status: 0x{:x}, Status Code 0x{:x}, Status Code Type: 0x{:x}", + status, + status & 0xFF, + (status >> 8) & 0x7 + ); println!("something went awry: 0x{:x}", status) } @@ -336,20 +346,44 @@ impl NvmeDevice { namespace } - //TODO: fix PRP's, broken when reading > 8192 bytes; same for write - pub fn namespace_read( + pub fn read( + &mut self, + ns_id: u32, + dest: &mut [u8], + mut lba: u64, + ) -> Result<(), Box> { + let ns = *self.namespaces.get(&ns_id).unwrap(); + + // for chunk in dest.chunks_mut(HUGE_PAGE_SIZE) { + for chunk in dest.chunks_mut(128 * 4096) { + let blocks = (chunk.len() + ns.block_size as usize - 1) / ns.block_size as usize; + self.namespace_io(&ns, blocks as u64, lba, false)?; + chunk.copy_from_slice(&unsafe { (*self.buffer.virt) }[..chunk.len()]); + lba += blocks as u64; + } + // self.read_lba(ns_id, blocks, lba) + Ok(()) + } + + pub fn write_string(&mut self, data: String, lba: u64) -> Result<(), Box> { + self.write_raw(data.as_bytes(), lba) + } + + pub fn submit_io( &mut self, ns: &NvmeNamespace, - blocks: u32, + addr: u64, + blocks: u64, lba: u64, - ) -> Result> { + write: bool, + ) -> Option { assert!(blocks > 0); assert!(blocks <= 0x1_0000); - let q_id = self.comp_queues.len(); - let io_q = self.sub_queues.get_mut(q_id - 1).unwrap(); + let q_id = self.sub_queues.len(); + let mut io_q = &mut self.sub_queues[q_id - 1]; - let bytes = blocks as u64 * ns.block_size; + let bytes = blocks * ns.block_size; let ptr1 = if bytes <= 4096 { 0 } else if bytes <= 8192 { @@ -358,80 +392,87 @@ impl NvmeDevice { self.prp_list.phys as u64 }; - let entry = NvmeCommand::io_read( - io_q.tail as u16, - ns.id, - lba, - blocks as u16 - 1, - self.buffer.phys as u64, - ptr1 - ); - - let tail = io_q.submit(entry); - self.write_reg_idx(NvmeArrayRegs::SQyTDBL, q_id as u16, tail as u32); - - let c_q = self.comp_queues.get_mut(q_id - 1).unwrap(); - let (tail, entry, _) = c_q.complete_spin(); - self.write_reg_idx(NvmeArrayRegs::CQyHDBL, q_id as u16, tail as u32); - - let status = entry.status >> 1; - if status != 0 { - eprintln!("Status: 0x{:x}, Status Code 0x{:x}, Status Code Type: 0x{:x}", status, status & 0xFF, (status >> 8) & 0x7); - return Err("Read fail".into()); - } - - Ok(blocks as usize * ns.block_size as usize) - } - - pub fn read(&mut self, ns_id: u32, blocks: u32, lba: u64) { - self.read_lba(ns_id, blocks, lba) - } - - pub fn read_tmp(&mut self, ns_id: u32, blocks: u32, lba: u64) -> Vec { - let namespace = *self.namespaces.get(&ns_id).unwrap(); - let bytes = self.namespace_read(&namespace, blocks, lba).unwrap(); - - let buff = unsafe { *self.buffer.virt }; - let mut vec = vec![0; bytes]; - vec.clone_from_slice(&buff[0..bytes]); - vec + let entry = if write { + NvmeCommand::io_write(io_q.tail as u16, ns.id, lba, blocks as u16 - 1, addr, ptr1) + } else { + NvmeCommand::io_read(io_q.tail as u16, ns.id, lba, blocks as u16 - 1, addr, ptr1) + }; + io_q.submit_checked(entry) } - pub fn read_lba(&mut self, ns_id: u32, blocks: u32, lba: u64) { - let namespace = *self.namespaces.get(&ns_id).unwrap(); - let bytes = self.namespace_read(&namespace, blocks, lba).unwrap(); - - // let mut data = String::new(); - // let buff = unsafe { *self.buffer.virt }; - // for &b in &buff[0..bytes] { - // data.push(b as char); - // } - // println!("{data}"); - } + pub fn batched_write( + &mut self, + ns_id: u32, + data: &[u8], + mut lba: u64, + batch_len: u64, + ) -> Result<(), Box> { + let ns = *self.namespaces.get(&ns_id).unwrap(); + let block_size = 512; + let q_id = 1; + + for chunk in data.chunks(HUGE_PAGE_SIZE) { + unsafe { (*self.buffer.virt)[..chunk.len()].copy_from_slice(chunk) } + let mut tail = self.sub_queues[q_id - 1].tail; + + let batch_len = std::cmp::min(batch_len, chunk.len() as u64 / block_size); + let batch_size = chunk.len() as u64 / batch_len; + let blocks = batch_size / ns.block_size; + for i in 0..batch_len { + if let Some(new_tail) = self.submit_io( + &ns, + self.buffer.phys as u64 + i * batch_size, + blocks, + lba, + true, + ) { + tail = new_tail; + } else { + eprintln!("tail: {tail}, batch_len: {batch_len}, batch_size: {batch_size}, blocks: {blocks}"); + tail = tail; + } + lba += blocks; + } + self.write_reg_idx(NvmeArrayRegs::SQyTDBL, q_id as u16, tail as u32); + let c_q = &mut self.comp_queues[q_id - 1]; + + let (tail, c_entry, _) = c_q.complete_n(batch_len as usize); + self.write_reg_idx(NvmeArrayRegs::CQyHDBL, q_id as u16, tail as u32); + + let status = c_entry.status >> 1; + if status != 0 { + eprintln!( + "Status: 0x{:x}, Status Code 0x{:x}, Status Code Type: 0x{:x}", + status, + status & 0xFF, + (status >> 8) & 0x7 + ); + eprintln!("{:?}", c_entry); + return Err("Write fail".into()); + } + let head = c_entry.sq_head; + self.sub_queues[0].head = c_entry.sq_head as usize; + } - pub fn write_string(&mut self, data: String, lba: u64) -> Result<(), Box> { - self.write_raw(data.as_bytes(), lba) + Ok(()) } - //TODO: ugly - pub fn namespace_write( + pub fn namespace_io( &mut self, ns: &NvmeNamespace, blocks: u64, lba: u64, + write: bool, ) -> Result<(), Box> { assert!(blocks > 0); assert!(blocks <= 0x1_0000); - // let mut io_q = self.sub_queues.pop().unwrap(); - // let mut c_q = elf.comp_queues.pop().unwrap(); - let q_id = self.sub_queues.len(); - let io_q = self.sub_queues.get_mut(q_id - 1).unwrap(); - - - // TODO: fix - let bytes = blocks as u64 * ns.block_size; + let mut io_q = self.sub_queues.pop().unwrap(); + let mut c_q = self.comp_queues.pop().unwrap(); + // let io_q = self.sub_queues.get_mut(q_id - 1).unwrap(); + // + let bytes = blocks * ns.block_size; let ptr1 = if bytes <= 4096 { 0 } else if bytes <= 8192 { @@ -441,26 +482,49 @@ impl NvmeDevice { }; // println!("blocks to write {}", blocks); - let entry = NvmeCommand::io_write( - io_q.tail as u16, - ns.id, - lba, - blocks as u16 - 1, - self.buffer.phys as u64, - ptr1, - ); + let entry = if write { + NvmeCommand::io_write( + io_q.tail as u16, + ns.id, + lba, + blocks as u16 - 1, + self.buffer.phys as u64, + ptr1, + ) + } else { + NvmeCommand::io_read( + io_q.tail as u16, + ns.id, + lba, + blocks as u16 - 1, + self.buffer.phys as u64, + ptr1, + ) + }; + let tail = io_q.submit(entry); self.write_reg_idx(NvmeArrayRegs::SQyTDBL, q_id as u16, tail as u32); - let c_q = self.comp_queues.get_mut(q_id - 1).unwrap(); - let (tail, entry, _) = c_q.complete_spin(); + // let c_q = self.comp_queues.get_mut(q_id - 1).unwrap(); + let (tail, c_entry, _) = c_q.complete_spin(); self.write_reg_idx(NvmeArrayRegs::CQyHDBL, q_id as u16, tail as u32); - let status = entry.status >> 1; + let status = c_entry.status >> 1; if status != 0 { - eprintln!("Status: 0x{:x}, Status Code 0x{:x}, Status Code Type: 0x{:x}", status, status & 0xFF, (status >> 8) & 0x7); + eprintln!( + "Status: 0x{:x}, Status Code 0x{:x}, Status Code Type: 0x{:x}", + status, + status & 0xFF, + (status >> 8) & 0x7 + ); + eprintln!("{:?}", entry); + eprintln!("{:?}", c_entry); return Err("Write fail".into()); } + io_q.head = c_entry.sq_head as usize; + + self.sub_queues.push(io_q); + self.comp_queues.push(c_q); Ok(()) } @@ -476,7 +540,7 @@ impl NvmeDevice { } let blocks = (chunk.len() + ns.block_size as usize - 1) / ns.block_size as usize; - self.namespace_write(&ns, blocks as u64, lba)?; + self.namespace_io(&ns, blocks as u64, lba, true)?; lba += blocks as u64; } diff --git a/src/queues.rs b/src/queues.rs index d2f7834..41ce217 100644 --- a/src/queues.rs +++ b/src/queues.rs @@ -30,7 +30,7 @@ pub const QUEUE_LENGTH: usize = 1024; pub struct NvmeSubQueue { // TODO: switch to mempool for larger queue pub commands: Dma<[NvmeCommand; QUEUE_LENGTH]>, - head: usize, + pub head: usize, pub tail: usize, } @@ -49,14 +49,23 @@ impl NvmeSubQueue { pub fn is_empty(&self) -> bool { self.head == self.tail } + pub fn is_full(&self) -> bool { - self.head == self.tail + 1 + self.head == (self.tail + 1) % QUEUE_LENGTH } + pub fn submit_checked(&mut self, entry: NvmeCommand) -> Option { + if self.is_full() { + None + } else { + Some(self.submit(entry)) + } + } + + #[inline(always)] pub fn submit(&mut self, entry: NvmeCommand) -> usize { // println!("SUBMISSION ENTRY: {:?}", entry); unsafe { - // seems legit (*self.commands.virt)[self.tail] = entry; } self.tail = (self.tail + 1) % QUEUE_LENGTH; @@ -102,14 +111,25 @@ impl NvmeCompQueue { } } + /// + pub fn complete_n(&mut self, commands: usize) -> (usize, NvmeCompletion, usize) { + let prev = self.head; + self.head += commands - 1; + if self.head >= QUEUE_LENGTH { + self.phase = !self.phase; + } + self.head %= QUEUE_LENGTH; + + let (head, entry, _) = self.complete_spin(); + (head, entry, prev) + } + pub fn complete_spin(&mut self) -> (usize, NvmeCompletion, usize) { loop { if let Some(val) = self.complete() { return val; } else { - unsafe { - super::pause(); - }; + super::pause(); } } }