//! End-to-end `BufferPool` allocation benchmarks.
//!
//! This module compares pooled allocation against direct aligned allocation for
//! the steady-state hot path we care about here: allocate, touch the requested
//! bytes at page granularity, and drop.
//!
//! # Metrics
//!
//! - **raw**: end-to-end time for allocate + page-touch + drop.
//! - **adjusted**: raw time minus the cost of repeatedly touching pages on an
//!   already-materialized buffer. This isolates allocator overhead. The
//!   baseline is always measured single-threaded because each thread writes to
//!   private memory, so the touch cost is the same per iteration regardless of
//!   thread count, and single-threaded measurement avoids scheduling noise that
//!   would swamp the subtraction signal.
//!
//! # Thread Configurations
//!
//! For each buffer size, the benchmark runs:
//!
//! - one single-threaded case
//! - one multi-threaded lockstep case
//! - one multi-threaded staggered case
//!
//! The shared [`Threading`] presets and timing harness come from [`super::utils`].
//!
//! # Why Touch Pages?
//!
//! Large allocations may be backed by lazily materialized virtual memory once
//! the allocator starts using `mmap`, so timing allocation alone can undercount
//! the real cost of actually using the buffer. Touching each page forces
//! materialization and makes the comparison between direct aligned allocation
//! and pooled reuse fairer.
//!
//! For large sizes this means much of the raw benchmark measures page writes
//! rather than allocator bookkeeping. That is acceptable because both
//! implementations pay the same page-touch cost, so the relative comparison
//! still isolates the allocation strategy.

use super::utils::{measure, Threading};
use commonware_runtime::{
    page_size, tokio, BufferPool, BufferPoolConfig, BufferPooler, IoBufMut, Runner as _,
};
use commonware_utils::{NZUsize, NZU32};
use criterion::Criterion;
use std::{hint::black_box, num::NonZeroUsize};
const SIZES: &[usize] = &[256, 1024, 4096, 65536, 1024 * 1024, 8 * 1024 * 1024];

#[derive(Clone, Copy)]
enum Metric {
    Raw,
    Adjusted,
}

impl Metric {
    const fn as_str(self) -> &'static str {
        match self {
            Self::Raw => "raw",
            Self::Adjusted => "adjusted",
        }
    }
}

#[derive(Clone, Copy)]
enum Mode {
    Direct,
    Pool,
}

impl Mode {
    const fn as_str(self) -> &'static str {
        match self {
            Self::Direct => "direct",
            Self::Pool => "pool",
        }
    }
}

pub fn bench(c: &mut Criterion) {
    let page_size = page_size();
    let threadings = Threading::standard();
    let threads = threadings
        .iter()
        .map(|threading| threading.threads())
        .max()
        .unwrap_or(1);

    for &size in SIZES {
        let pool = build_pool(size, threads);
        let alignment = pool.config().alignment.get();

        for threading in threadings {
            for metric in [Metric::Raw, Metric::Adjusted] {
                bench_case(
                    c,
                    Mode::Direct,
                    size,
                    threading,
                    metric,
                    || {
                        let mut buf =
                            IoBufMut::with_alignment(size, NonZeroUsize::new(alignment).unwrap());
                        touch_pages(buf.as_mut_ptr(), size, page_size);
                        buf
                    },
                    page_size,
                );
                bench_case(
                    c,
                    Mode::Pool,
                    size,
                    threading,
                    metric,
                    {
                        let pool = pool.clone();
                        move || {
                            let mut buf = pool
                                .try_alloc(size)
                                .expect("buffer pool exhausted during benchmark");
                            touch_pages(buf.as_mut_ptr(), size, page_size);
                            buf
                        }
                    },
                    page_size,
                );
            }
        }
    }
}

fn bench_case(
    c: &mut Criterion,
    mode: Mode,
    size: usize,
    threading: Threading,
    metric: Metric,
    work: impl Fn() -> IoBufMut + Sync,
    page_size: usize,
) {
    let name = bench_name(mode, metric, size, threading);
    c.bench_function(&name, |b| {
        b.iter_custom(|iters| {
            let full = measure(
                iters,
                threading,
                || {},
                |_| {
                    let buffer = black_box(work());
                    drop(buffer);
                },
            );

            if matches!(metric, Metric::Raw) {
                return full;
            }

            // Measure the cost of touching pages on a pre-allocated buffer.
            // Always single-threaded: each thread writes to private memory so
            // the per-iteration touch cost is the same regardless of thread
            // count, and single-threaded avoids wall-clock noise from thread
            // scheduling that would swamp the subtraction signal.
            let baseline = measure(iters, Threading::Single, &work, |buffer| {
                touch_pages(buffer.as_mut_ptr(), size, page_size)
            });

            full.saturating_sub(baseline)
        });
    });
}

#[inline]
fn touch_pages(ptr: *mut u8, size: usize, page_size: usize) {
    if size == 0 {
        return;
    }

    // Force the allocation to back each page before timing the drop path.
    // Otherwise large aligned allocations can look artificially cheap when the
    // allocator hands out lazily materialized virtual memory (i.e. `mmap`).
    //
    // We vary the write offset within each page so that consecutive pages hit
    // different L1 cache sets. A naive page-strided write (offset 0 on every
    // page) maps all stores to the same set because the page size is an exact
    // multiple of the L1 set count times the cache line size, causing
    // pathological eviction.
    const CACHE_LINE: usize = 128;
    let lines_per_page = page_size / CACHE_LINE;

    // SAFETY: `ptr` is valid for writes to `size` bytes.
    unsafe {
        for (i, offset) in (0..size).step_by(page_size).enumerate() {
            let within_page = (i % lines_per_page) * CACHE_LINE;
            let pos = offset + within_page;
            ptr.add(pos.min(size - 1)).write_volatile(0);
        }
        ptr.add(size - 1).write_volatile(0);
    }
}

fn bench_name(mode: Mode, metric: Metric, size: usize, threading: Threading) -> String {
    let threads = threading.threads();
    let mut name = format!(
        "{}/mode={} size={size} threads={threads} metric={}",
        module_path!(),
        mode.as_str(),
        metric.as_str(),
    );
    if let Threading::Multi { pattern, .. } = threading {
        name.push_str(&format!(" pattern={}", pattern.as_str()));
    }
    name
}

fn build_pool(size: usize, threads: usize) -> BufferPool {
    let max_per_class =
        u32::try_from(threads * 4).expect("bench capacity must fit in u32 slot ids");
    let cfg = BufferPoolConfig::for_network()
        .with_pool_min_size(1024)
        .with_min_size(NZUsize!(size.max(1024)))
        .with_max_size(NZUsize!(size.max(1024)))
        .with_max_per_class(NZU32!(max_per_class))
        .with_parallelism(NZUsize!(threads))
        .with_prefill(true);

    let runner_cfg = tokio::Config::default()
        .with_worker_threads(1)
        .with_network_buffer_pool_config(cfg);

    tokio::Runner::new(runner_cfg).start(|ctx| async move { ctx.network_buffer_pool().clone() })
}