//! Focused microbenchmarks for the shared global freelist. //! //! `BufferPool` keeps free pooled buffers in a global freelist that is shared //! across threads. Threads hit this structure when they refill or spill their //! thread-local caches. //! //! This module benchmarks that global freelist directly and compares three //! implementations behind the same batch-oriented slot-id interface: //! //! - [`Freelist`]: a striped atomic bitmap freelist //! - `Mutex>`: a simple locked batched baseline //! - `ArrayQueue<_>`: a bounded lock-free queue baseline //! //! Each worker repeatedly removes `batch` entries, then returns the same //! entries, keeping occupancy stable throughout the run. This matches the //! steady-state shape of multi-threaded freelist reuse. //! //! The benchmarked entries are synthetic slot ids paired with a small //! [`PooledBuffer`]. That keeps the shape close to the real pooled freelist //! while avoiding unrelated `BufferPool` logic. use super::utils::{measure, Threading}; use commonware_runtime::iobuf::bench::{Freelist, PooledBuffer}; use commonware_utils::sync::Mutex; use criterion::Criterion; use crossbeam_queue::ArrayQueue; use std::{ alloc::Layout, hint::black_box, num::{NonZeroU32, NonZeroUsize}, sync::Arc, }; const SLOTS: &[usize] = &[16, 64, 512]; const BATCH_SIZES: &[usize] = &[1, 2, 4, 8, 16, 32]; const BENCH_BUFFER_CAPACITY: usize = 256; const BENCH_BUFFER_ALIGNMENT: usize = 64; const BENCH_LAYOUT: Layout = match Layout::from_size_align(BENCH_BUFFER_CAPACITY, BENCH_BUFFER_ALIGNMENT) { Ok(layout) => layout, Err(_) => panic!("valid bench layout"), }; #[derive(Debug)] struct Entry { slot: u32, buffer: PooledBuffer, } impl Entry { fn new(slot: usize) -> Self { Self { slot: slot as u32, buffer: PooledBuffer::new(BENCH_LAYOUT), } } } trait FreelistImplementation: Send + Sync { fn as_str() -> &'static str; fn with_capacity(capacity: usize, parallelism: usize) -> Self; fn take_batch(&self, out: &mut Vec, max: usize); fn put_batch(&self, entries: &mut Vec); fn fill_batch(&self, out: &mut Vec, target: usize) { out.clear(); while out.len() < target { self.take_batch(out, target - out.len()); } } } struct WorkerState { shared: Arc, held: Vec, batch: usize, } impl WorkerState { fn new(shared: Arc, batch: usize) -> Self { let mut held = Vec::with_capacity(batch); shared.fill_batch(&mut held, batch); Self { shared, held, batch, } } #[inline] fn step(&mut self) { self.shared.put_batch(black_box(&mut self.held)); self.shared.fill_batch(&mut self.held, self.batch); } } impl Drop for WorkerState { fn drop(&mut self) { self.shared.put_batch(&mut self.held); } } pub fn bench(c: &mut Criterion) { let threadings = Threading::standard(); for &slots in SLOTS { for &threading in &threadings { for &batch in BATCH_SIZES { if batch > slots / threading.threads() { continue; } bench_case::(c, slots, threading, batch); bench_case::(c, slots, threading, batch); bench_case::(c, slots, threading, batch); } } } } fn bench_case( c: &mut Criterion, slots: usize, threading: Threading, batch: usize, ) { let name = bench_name::(slots, threading, batch); c.bench_function(&name, |b| { b.iter_custom(|iters| { let shared = Arc::new(S::with_capacity(slots, threading.threads())); measure( iters, threading, move || WorkerState::new(Arc::clone(&shared), batch), |state| state.step(), ) }) }); } fn bench_name( slots: usize, threading: Threading, batch: usize, ) -> String { let threads = threading.threads(); let mut name = format!( "{}/impl={} slots={slots} threads={threads} batch={batch}", module_path!(), S::as_str(), ); if let Threading::Multi { pattern, .. } = threading { name.push_str(&format!(" pattern={}", pattern.as_str())); } name } struct MutexVec(Mutex>); impl FreelistImplementation for MutexVec { fn as_str() -> &'static str { "mutex_vec" } fn with_capacity(capacity: usize, _parallelism: usize) -> Self { let slots = (0..capacity).map(Entry::new).collect(); Self(Mutex::new(slots)) } #[inline] fn take_batch(&self, out: &mut Vec, max: usize) { let mut slots = self.0.lock(); let count = max.min(slots.len()); let split = slots.len() - count; out.extend(slots.drain(split..)); } #[inline] fn put_batch(&self, slots: &mut Vec) { let mut inner = self.0.lock(); inner.extend(slots.drain(..)); } } impl Drop for MutexVec { fn drop(&mut self) { for entry in self.0.get_mut().drain(..) { // SAFETY: benchmark entries are allocated with `BENCH_LAYOUT`. unsafe { entry.buffer.deallocate(BENCH_LAYOUT) }; } } } struct ArrayQueueFreelist(ArrayQueue); impl FreelistImplementation for ArrayQueueFreelist { fn as_str() -> &'static str { "array_queue" } fn with_capacity(capacity: usize, _parallelism: usize) -> Self { let queue = ArrayQueue::new(capacity); for slot in 0..capacity { queue .push(Entry::new(slot)) .expect("array queue prefill must fit"); } Self(queue) } #[inline] fn take_batch(&self, out: &mut Vec, mut max: usize) { while max > 0 { let Some(entry) = self.0.pop() else { break; }; out.push(entry); max -= 1; } } #[inline] fn put_batch(&self, slots: &mut Vec) { for entry in slots.drain(..) { self.0 .push(entry) .expect("array queue push must fit in steady state"); } } } impl Drop for ArrayQueueFreelist { fn drop(&mut self) { while let Some(entry) = self.0.pop() { // SAFETY: benchmark entries are allocated with `BENCH_LAYOUT`. unsafe { entry.buffer.deallocate(BENCH_LAYOUT) }; } } } impl FreelistImplementation for Freelist { fn as_str() -> &'static str { "freelist" } fn with_capacity(capacity: usize, parallelism: usize) -> Self { Self::new( NonZeroU32::new(u32::try_from(capacity).expect("bench capacity must fit in u32")) .expect("bench capacity must be non-zero"), NonZeroUsize::new(parallelism).expect("bench parallelism must be non-zero"), BENCH_LAYOUT, true, ) } #[inline] fn take_batch(&self, out: &mut Vec, max: usize) { if max == 1 { if let Some((slot, buffer)) = self.take() { out.push(Entry { slot, buffer }); } return; } self.take_batch(max, |slot, buffer| { out.push(Entry { slot, buffer }); }); } #[inline] fn put_batch(&self, entries: &mut Vec) { if entries.len() == 1 { let entry = entries.pop().unwrap(); self.put(entry.slot, entry.buffer); return; } self.put_batch(entries.drain(..).map(|entry| (entry.slot, entry.buffer))); } }