diff --git a/barretenberg/cpp/CMakePresets.json b/barretenberg/cpp/CMakePresets.json index add51588b544..3d5b17a58d32 100644 --- a/barretenberg/cpp/CMakePresets.json +++ b/barretenberg/cpp/CMakePresets.json @@ -405,6 +405,7 @@ "generator": "Ninja", "toolchainFile": "cmake/toolchains/wasm32-wasi.cmake", "environment": { + "WASI_SDK_PREFIX": "/opt/wasi-sdk", "CC": "$env{WASI_SDK_PREFIX}/bin/clang", "CXX": "$env{WASI_SDK_PREFIX}/bin/clang++", "CXXFLAGS": "-DBB_VERBOSE -fvisibility=hidden", diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp index e5a0cee9c3d9..0837d349bc16 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/commitment_key.hpp @@ -11,13 +11,17 @@ */ #include "barretenberg/common/bb_bench.hpp" +#include "barretenberg/common/log.hpp" #include "barretenberg/common/ref_span.hpp" #include "barretenberg/ecc/scalar_multiplication/scalar_multiplication.hpp" #include "barretenberg/polynomials/polynomial.hpp" #include "barretenberg/srs/factories/crs_factory.hpp" #include "barretenberg/srs/global_crs.hpp" +#include +#include #include +#include #include #include #include diff --git a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp index 3292e36c020f..2545e951d586 100644 --- a/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp +++ b/barretenberg/cpp/src/barretenberg/commitment_schemes/ipa/ipa.hpp @@ -10,6 +10,7 @@ #include "barretenberg/common/assert.hpp" #include "barretenberg/common/bb_bench.hpp" #include "barretenberg/common/container.hpp" +#include "barretenberg/common/log.hpp" #include "barretenberg/common/thread.hpp" #include "barretenberg/common/throw_or_abort.hpp" #include "barretenberg/constants.hpp" @@ -19,6 +20,7 @@ #include 
"barretenberg/stdlib/primitives/circuit_builders/circuit_builders_fwd.hpp" #include "barretenberg/transcript/transcript.hpp" #include +#include #include #include #include diff --git a/barretenberg/cpp/src/barretenberg/common/thread.cpp b/barretenberg/cpp/src/barretenberg/common/thread.cpp index b3c01532321e..b2b98b0e1d04 100644 --- a/barretenberg/cpp/src/barretenberg/common/thread.cpp +++ b/barretenberg/cpp/src/barretenberg/common/thread.cpp @@ -81,9 +81,11 @@ namespace bb::detail { * for it to reach num_workers_ before returning, guaranteeing no worker is * still inside do_iterations() when the next generation is published. * - * Idle wait is yield-spin then 100 us sleep_for fallback. Neither path - * lowers to i32.atomic.wait, so the V8 wasi-threads lost-wakeup race that - * affects condition_variable-based pools does not apply here. + * Idle wait is yield-spin then 100 us sleep_for fallback on native. Browser WASM + * keeps yielding because std::this_thread::sleep_for lowers to WASI poll_oneoff, + * which is intentionally stubbed out in this build. Neither path lowers to + * i32.atomic.wait, so the V8 wasi-threads lost-wakeup race that affects + * condition_variable-based pools does not apply here. 
* * This is the same design as the round-parallel MSM's local pool — the MSM * dispatches parallel_for hundreds of times per proof, and per-call overhead @@ -151,7 +153,11 @@ class ParallelForPool { std::this_thread::yield(); } while (!pred()) { +#ifdef __wasm__ + std::this_thread::yield(); +#else std::this_thread::sleep_for(std::chrono::microseconds(100)); +#endif } } }; @@ -191,7 +197,11 @@ void ParallelForPool::worker_loop() ++idle_spins; std::this_thread::yield(); } else { +#ifdef __wasm__ + std::this_thread::yield(); +#else std::this_thread::sleep_for(std::chrono::microseconds(100)); +#endif } } } diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp new file mode 100644 index 000000000000..f601e4460c0a --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp @@ -0,0 +1,337 @@ +// Per-worker arena layout for the round-parallel Pippenger MSM (Zone W slab). +// +// Canonical source of truth for the per-worker byte walk that was previously +// duplicated across `compute_arena_bytes_for_msm`, the live allocator inside +// `pippenger_round_parallel`, and `pippenger_bn254_arena_layout_fits_for_test`. +// The historical arena drift bugs (cluster_offsets miscount, wasm +// aligned_local overflow, NO_GLV abort, t1 abort) all traced to disagreements +// between those copies; this struct removes that class by computing the layout +// once. +// +// The constructor's layout walk mirrors the live allocator's `layout_add` +// sequence exactly, including alignment slop. The sizer's previous +// arithmetic-only formula did not honour per-allocation alignment, so it +// systematically under-counted by a few bytes per slab; the struct fixes that +// by construction. 
+// +// Phase A and Stage 6 fields overlay the same per-worker bytes because the +// parallel_for invocations are disjoint (Phase A runs on the first window +// batch, Stage 6 runs per batch thereafter, and never on the same worker +// concurrently). `per_worker_union_bytes = max(ts_fixed, pa_layout)`. + +#pragma once + +#include "barretenberg/numeric/bitop/get_msb.hpp" + +#include +#include +#include +#include +#include +#include + +namespace bb::scalar_multiplication::round_parallel_detail { + +// ============================================================================ +// Round-parallel internals exposed to the test suite. +// +// `pippenger_bn254_arena_layout_fits_for_test` is a TU-local helper that walks +// the actual Zone P / Zone W / Zone S allocator for representative inputs and +// asserts the result fits in `compute_arena_bytes_for_msm`'s promise. Its body +// lives in `scalar_multiplication.test.cpp`, which means the helpers it needs +// (`choose_window_bits`, `build_var_window_schedule`, `ChunkOutput`, +// `DEDUP_MAX_*`, `VAR_WINDOW_MAX_WINDOWS`, `compute_arena_bytes_for_msm`) need +// header-visible declarations. +// ============================================================================ + +// Per-window count cap shared by `VariableWindowSchedule` arrays and the live +// allocator's `window_sums_storage` slot. +inline constexpr size_t VAR_WINDOW_MAX_WINDOWS = 128; + +// Dedup pre-pass caps. DEDUP_MAX_CLUSTERS bounds `extra_points` at ≤ 1 MB; +// DEDUP_MAX_MEMBERS bounds the per-worker `cluster_members` slab. +inline constexpr size_t DEDUP_MAX_CLUSTERS = 16384; +inline constexpr size_t DEDUP_MAX_MEMBERS = 32768; + +// Uniform window schedule produced by `build_var_window_schedule`. Holds the +// per-window `c` value and bucket count for downstream sizing/dispatch. 
+struct VariableWindowSchedule { + size_t num_windows = 0; + std::array window_bits_per_window{}; // window_bits_w for each w + std::array bit_base{}; // B_w = Σ_{k num_buckets{}; // 2^(window_bits_w - 1) + 1 +}; + +// Per-chunk recursive-affine bucket-reduce output (Stage 6b output cell). +template struct ChunkOutput { + typename Curve::Element R{}; + typename Curve::Element L{}; + uint32_t lo = 0; + uint32_t hi = 0; + uint8_t empty = 1; +}; + +// Pick the optimal window size `c`. Native uses a cost model +// `rounds * (n + 15 * buckets)`; WASM uses a closed-form `target_load` formula. +[[nodiscard]] inline uint32_t choose_window_bits( + size_t num_points, size_t num_bits, size_t n_input, size_t num_logical_threads, bool use_rebalance) noexcept +{ + constexpr uint32_t MAX_C = 20; + uint32_t best = 2; + +#ifdef __wasm__ + static_cast(num_bits); + static_cast(use_rebalance); + const size_t target_load = (n_input > 4096) ? (num_logical_threads * 2 / 3) : (num_logical_threads / 3); + if (target_load == 0 || num_points <= target_load) { + best = 2; + } else { + const size_t ratio = num_points / target_load; + const uint32_t lg = static_cast(numeric::get_msb(ratio)); + best = lg + 1; + if (best < 2) { + best = 2; + } else if (best >= MAX_C) { + best = MAX_C - 1; + } + } +#else + static_cast(n_input); + static_cast(num_logical_threads); + static_cast(use_rebalance); + uint64_t best_cost = static_cast(-1); + for (uint32_t window_bits = 2; window_bits < MAX_C; ++window_bits) { + const uint64_t rounds = (num_bits + 2 + window_bits - 1) / window_bits; + const uint64_t buckets = (uint64_t{ 1 } << (window_bits - 1)) + 1; + const uint64_t n = num_points; + constexpr uint64_t BUCKET_ACC_COST = 15; + const uint64_t cost = rounds * (n + (buckets * BUCKET_ACC_COST)); + if (cost < best_cost) { + best_cost = cost; + best = window_bits; + } + } +#endif + + return best; +} + +// Build a uniform window schedule for the given bit budget and chosen `c`. 
+inline VariableWindowSchedule build_var_window_schedule(size_t num_bits, size_t window_bits_unsplit) noexcept +{ + VariableWindowSchedule sched{}; + + auto fill_windows = [&](size_t bits_to_cover, size_t window_bits_default, size_t out_offset) -> size_t { + size_t bits_remaining = bits_to_cover; + size_t w = out_offset; + size_t bit_offset = (w == 0) ? 0 : sched.bit_base[w - 1] + sched.window_bits_per_window[w - 1]; + while (bits_remaining > 0) { + const size_t window_bits_w = std::min(window_bits_default, bits_remaining); + sched.bit_base[w] = static_cast(bit_offset); + sched.window_bits_per_window[w] = static_cast(window_bits_w); + sched.num_buckets[w] = static_cast((size_t{ 1 } << (window_bits_w - 1)) + 1); + bit_offset += window_bits_w; + bits_remaining -= window_bits_w; + ++w; + if (w >= VAR_WINDOW_MAX_WINDOWS) { + break; + } + } + return w - out_offset; + }; + + const size_t total_bits = num_bits + 2; + sched.num_windows = fill_windows(total_bits, window_bits_unsplit, /*out_offset=*/0); + return sched; +} + +// Maximum number of independent additions batched per modular inversion in the +// affine-arithmetic group ops (used by Stage 6a/6b). Sizes per-worker +// `points_to_add`, `inversion_scratch`, and `pair_dest` arrays. +inline constexpr size_t BATCH_CAPACITY = 256; + +// Phase A's chunked tree-reduce limit. Capped so the per-worker scratch slab +// (chunk_pts + chunk_ids) stays under ~128 KB. +inline constexpr size_t DEDUP_MAX_CHUNK_MEMBERS = 2048; + +// Per-bucket-chunk metadata produced by Stage 6a, consumed by Stage 6b's +// cross-thread reduce. 
+// lo, hi — lowest / highest non-empty digit in the chunk (inclusive) +// buckets_padded — next power of two ≥ (hi - lo + 1) +// empty — 1 iff the chunk had no entries (Stage 6b skips it) +struct AffineBucketChunkInfo { + uint32_t lo = 0; + uint32_t hi = 0; + uint32_t buckets_padded = 0; + uint8_t empty = 1; +}; + +template struct PerWorkerArenaLayout { + using AffineElement = typename Curve::AffineElement; + using BaseField = typename Curve::BaseField; + + // Caps shared between sizer and allocator. Centralised here so the two + // sites can't diverge. + static constexpr size_t PHASE_A_DIRTY_SLOTS_CAP = 4096; // HT_SIZE + static constexpr size_t PHASE_A_BUCKET_REP_CAP = 256; // loose cap + static constexpr size_t PHASE_A_STAGED_CAP = 1024; // loose cap + static constexpr size_t PHASE_A_CHUNK_CAP = DEDUP_MAX_CHUNK_MEMBERS; + static constexpr size_t WORKER_SLAB_ALIGN = alignof(AffineElement); + + // Computed byte sizes (filled by constructor's layout walk). + size_t ts_fixed_layout = 0; // ThreadScratch wpb-independent fields, with align slop + size_t pa_layout = 0; // PhaseAScratch fields, with align slop + size_t per_worker_union_bytes = 0; // = align_up(max(ts_fixed_layout, pa_layout), WORKER_SLAB_ALIGN) + size_t per_worker_per_wpb_layout = 0; // Stage 6 wpb-dependent tail + size_t per_worker_bytes = 0; // = align_up(union + tail, WORKER_SLAB_ALIGN) + + // Constructor performs the canonical layout walk. `windows_per_batch` and + // `dense_stride_est` may be zero — only the wpb-independent parts then + // have meaningful values, useful for the sizer's pre-wpb-solve step. 
+ PerWorkerArenaLayout(size_t chunk_capacity, + size_t global_max_overflow_per_window, + bool dedup_active, + size_t phase_a_cluster_members_cap, + size_t phase_a_cluster_offsets_cap, + size_t windows_per_batch, + size_t dense_stride_est) noexcept + { + auto align_up = [](size_t off, size_t align) -> size_t { return (off + align - 1) & ~(align - 1); }; + auto layout_add = [&](size_t& off, size_t bytes, size_t align) { off = align_up(off, align) + bytes; }; + + // ThreadScratch fixed (curr_pts / curr_buckets / points_to_add / + // inversion_scratch / pair_dest / overflow_slots / overflow_pts). + layout_add(ts_fixed_layout, sizeof(AffineElement) * chunk_capacity, alignof(AffineElement)); + layout_add(ts_fixed_layout, sizeof(uint32_t) * chunk_capacity, alignof(uint32_t)); + layout_add(ts_fixed_layout, sizeof(AffineElement) * 2 * BATCH_CAPACITY, alignof(AffineElement)); + layout_add(ts_fixed_layout, sizeof(BaseField) * BATCH_CAPACITY, alignof(BaseField)); + layout_add(ts_fixed_layout, sizeof(uint32_t) * BATCH_CAPACITY, alignof(uint32_t)); + layout_add(ts_fixed_layout, sizeof(uint32_t) * global_max_overflow_per_window, alignof(uint32_t)); + layout_add(ts_fixed_layout, sizeof(AffineElement) * global_max_overflow_per_window, alignof(AffineElement)); + + // PhaseA (cluster_members / cluster_offsets / dirty_slots / bucket_rep + // / staged / chunk_pts / chunk_ids). Only allocated when dedup_active. 
+ if (dedup_active) { + layout_add(pa_layout, sizeof(uint32_t) * phase_a_cluster_members_cap, alignof(uint32_t)); + layout_add(pa_layout, sizeof(uint32_t) * phase_a_cluster_offsets_cap, alignof(uint32_t)); + layout_add(pa_layout, sizeof(uint16_t) * PHASE_A_DIRTY_SLOTS_CAP, alignof(uint16_t)); + layout_add(pa_layout, sizeof(uint32_t) * PHASE_A_BUCKET_REP_CAP, alignof(uint32_t)); + layout_add(pa_layout, + sizeof(std::pair) * PHASE_A_STAGED_CAP, + alignof(std::pair)); + layout_add(pa_layout, sizeof(AffineElement) * PHASE_A_CHUNK_CAP, alignof(AffineElement)); + layout_add(pa_layout, sizeof(uint32_t) * PHASE_A_CHUNK_CAP, alignof(uint32_t)); + } + + per_worker_union_bytes = align_up(std::max(ts_fixed_layout, pa_layout), WORKER_SLAB_ALIGN); + + // Stage 6 wpb-dependent tail (dense_buckets / is_present / pair + // scratch / chunk_infos). Skipped when windows_per_batch == 0 (sizer's + // pre-wpb-solve call). + if (windows_per_batch != 0) { + const size_t dense_total = windows_per_batch * dense_stride_est; + const size_t dense_pair_max = dense_total / 2; + layout_add(per_worker_per_wpb_layout, sizeof(AffineElement) * dense_total, alignof(AffineElement)); + layout_add(per_worker_per_wpb_layout, sizeof(uint8_t) * dense_total, alignof(uint8_t)); + layout_add(per_worker_per_wpb_layout, + sizeof(std::pair) * dense_pair_max, + alignof(std::pair)); + layout_add(per_worker_per_wpb_layout, sizeof(uint32_t) * dense_pair_max, alignof(uint32_t)); + layout_add(per_worker_per_wpb_layout, sizeof(BaseField) * dense_pair_max, alignof(BaseField)); + layout_add(per_worker_per_wpb_layout, + sizeof(AffineBucketChunkInfo) * windows_per_batch, + alignof(AffineBucketChunkInfo)); + } + + per_worker_bytes = align_up(per_worker_union_bytes + per_worker_per_wpb_layout, WORKER_SLAB_ALIGN); + } +}; + +// Stride upper bound for `s.dense_buckets`: next_pow2(⌈(B-1)/T⌉), with a floor of 2. 
// Computes the smallest power of two that is >= ceil((B_eff - 1) / num_threads), never
// returning less than 2. `B_eff <= 1` is treated as one bucket per thread.
// `num_threads == 0` is sized as a single thread instead of dividing by zero.
[[nodiscard]] inline size_t compute_dense_stride(size_t B_eff, size_t num_threads) noexcept
{
    const size_t threads = (num_threads == 0) ? size_t{ 1 } : num_threads;
    const size_t per_thread = (B_eff > 1) ? ((B_eff - 1 + threads - 1) / threads) : size_t{ 1 };
    // Round up to a power of two starting from the floor of 2; this yields
    // max(2, next_pow2(per_thread)) for every per_thread that has a representable result.
    size_t stride = 2;
    while (stride < per_thread) {
        stride <<= 1;
    }
    return stride;
}

// Upper bound on Σ_t buckets_per_thread[t][w] per window, evaluated as
// (B_eff - 1) + (num_threads - 1): adjacent threads may share one boundary bucket.
// Returns 0 when B_eff == 0. `num_threads == 0` is treated as a single thread so the
// subtraction cannot wrap around.
// NOTE(review): the previous wording of this comment gave the bound as "B + T - 1", which
// is one more than the expression below evaluates to — confirm which one is intended.
[[nodiscard]] inline size_t compute_bucket_partials_max(size_t B_eff, size_t num_threads) noexcept
{
    if (B_eff == 0) {
        return 0;
    }
    const size_t threads = (num_threads == 0) ? size_t{ 1 } : num_threads;
    return (B_eff - 1) + (threads - 1);
}

// Per-OS-thread Stage 6a seam overflow capacity (per-window upper bound):
// ceil(ceil(n / num_threads) / subchunk_entries_cap).
// A zero `num_threads` or `subchunk_entries_cap` is clamped to 1 rather than dividing by
// zero; clamping the cap to 1 gives the largest (most conservative) result.
[[nodiscard]] inline size_t compute_global_max_overflow_per_window(size_t n,
                                                                   size_t num_threads,
                                                                   size_t subchunk_entries_cap) noexcept
{
    const size_t threads = (num_threads == 0) ? size_t{ 1 } : num_threads;
    const size_t cap = (subchunk_entries_cap == 0) ? size_t{ 1 } : subchunk_entries_cap;
    const size_t global_max_chunk_len = (n + threads - 1) / threads;
    return (global_max_chunk_len + cap - 1) / cap;
}

// Per-window byte cost for one window in a windows-per-batch slab. Identical formula
// at three sites (sizer outer, sizer per-schedule lambda, live allocator); centralised
// here so they cannot drift.
+// +// schedule = 4·n +// HIST slot = max(4·t·B, sizeof(ChunkOutput)·t + 96·t) [H ∪ O overlay] +// DENSE slot = 65 · bucket_partials_max(B, t) [bucket_partials_dense + present] +// bucket_start = 8·(B+1) +// chunk arrays = 8·(t+1) + 8·(t+1) + 8·t + 8·t + 8·t + 16·worker + 8·t +// dense_buckets = 87·worker·stride [s.dense_buckets + aux] +template +[[nodiscard]] inline size_t compute_per_window_bytes( + size_t num_threads, size_t B_eff, size_t n, size_t dense_stride, size_t worker_total) noexcept +{ + const size_t bucket_partials_max = compute_bucket_partials_max(B_eff, num_threads); + const size_t hist_h_bytes_pw = size_t{ 4 } * num_threads * B_eff; + const size_t hist_o_bytes_pw = (sizeof(ChunkOutput) * num_threads) + (size_t{ 96 } * num_threads); + const size_t hist_slot_bytes_pw = std::max(hist_h_bytes_pw, hist_o_bytes_pw); + const size_t dense_slot_bytes_pw = size_t{ 65 } * bucket_partials_max; + return (size_t{ 4 } * n) + hist_slot_bytes_pw + dense_slot_bytes_pw + (size_t{ 8 } * (B_eff + 1)) + + (size_t{ 8 } * (num_threads + 1)) + (size_t{ 8 } * (num_threads + 1)) + (size_t{ 8 } * num_threads) + + (size_t{ 8 } * num_threads) + (size_t{ 8 } * num_threads) + (size_t{ 16 } * worker_total) + + (size_t{ 8 } * num_threads) + (size_t{ 87 } * worker_total * dense_stride); +} + +// Phase-1 prologue bytes living in the per-MSM arena (msb_per_scalar, glv_scalars, +// glv_points, per_thread_msb_hist). Two-copy duplicate eliminated. +[[nodiscard]] inline size_t compute_phase_one_prologue_bytes(size_t n, + bool use_glv, + bool inline_glv_double, + size_t profile_threads) noexcept +{ + return n // msb_per_scalar + + (use_glv ? size_t{ 32 } * n : size_t{ 0 }) // glv_scalars_storage + + (inline_glv_double ? size_t{ 64 } * n : size_t{ 0 }) // glv_points_storage + + (profile_threads * size_t{ 1024 }); // per_thread_msb_hist +} + +struct PhaseACaps { + size_t members_cap; + size_t offsets_cap; +}; + +// Phase A per-worker caps. 
`members_cap = min(DEDUP_MAX_MEMBERS, n)` is tight (each +// scalar contributes ≤ 1 cluster_member entry). `offsets_cap = cids_per_thread + 2` +// covers the leading-zero sentinel + post-last terminator. +[[nodiscard]] inline PhaseACaps compute_phase_a_caps(size_t n, size_t num_threads) noexcept +{ + return { std::min(DEDUP_MAX_MEMBERS, n), (DEDUP_MAX_CLUSTERS / num_threads) + 2 }; +} + +// Solve `wpb · per_window_bytes ≤ available_budget`, clamped to W_R and ≥ 1. +// Mirrors the three identical wpb-pickers in the sizer and live allocator. +[[nodiscard]] inline size_t solve_wpb(size_t per_window_bytes, size_t available_budget, size_t W_R) noexcept +{ + if (W_R == 0) { + return 1; + } + if (per_window_bytes == 0 || available_budget == 0) { + return std::max(1, W_R); + } + return std::min(std::max(1, available_budget / per_window_bytes), W_R); +} + +} // namespace bb::scalar_multiplication::round_parallel_detail diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp new file mode 100644 index 000000000000..d14f4404ec51 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_constantine.hpp @@ -0,0 +1,390 @@ +// Constantine-style signed-Booth window recoder for the round-parallel Pippenger MSM. +// +// Implements the carry-less `signedWindowEncoding` / `getSignedFullWindowAt` pattern from +// `constantine/math/arithmetic/bigints.nim`: each window reads c+1 bits including the +// previous window's top bit, lets that shared boundary bit substitute for an explicit +// carry, and produces a `(sign | bucket)` packed digit. Stage 1 and Stage 4 of the +// pipeline call into here on the hot path. +// +// Two parallel families live in this file: +// * scalar path — `ConstantineSliceParams` + `get_constantine_packed_digit` (uint64- +// indexed limbs). 
+// * SIMD x4 path — `ConstantineSliceParamsU32` + `store_constantine_packed_digits_x4_*` +// (uint32-indexed limbs, processes 4 scalars per call via GCC vector_size). +// +// The SIMD helpers split on slice-path (Localised / Bottom / Boundary) so the per-window +// branch is hoisted out of the per-scalar loop. `classify_slice_path_u32` returns the +// matching enum; Stage 1 / Stage 4 dispatch on it once per window. + +#pragma once + +#include +#include + +#ifdef __wasm_simd128__ +#include +#endif + +namespace bb::scalar_multiplication::round_parallel_detail { + +/** + * @brief Per-window precomputed slice parameters for the carry-less signed-Booth window + * recoding (after Constantine `signedWindowEncoding` / `getSignedFullWindowAt`, + * `constantine/math/arithmetic/bigints.nim`). Computed once per window outside the + * Stage 1 / Stage 4 inner loops; the per-(point, window) hot path is then 2 i32 + * loads + a fixed bit-twiddle, no branches and no per-iter address arithmetic. + * Carry-less because every non-bottom window's c+1-bit read shares its boundary bit + * with the previous window — the bit a non-overlapping recoder would carry. + * + * `slice_localised_to_one_u64`: true iff every bit of the c+1-bit window lives inside a + * single uint64 limb. ~75% of windows on typical 254-bit scalars with c ∈ [12, 18] + * (lookback bits at non-boundary positions) hit this and take the fast path: one load, + * one shift, one mask. The slow path is the boundary-straddling case + the synthetic- + * lookback bottom window. 
+ */ +struct ConstantineSliceParams { + uint32_t lo_mask; + uint32_t hi_mask; + uint32_t lo_limb; + uint32_t hi_limb; // == lo_limb + 1, except clamped to last valid limb at the top window + uint32_t lo_off; + uint32_t lo_bits; + bool slice_localised_to_one_u64; +}; + +/** + * @brief Compute the Constantine slice params for a window starting at absolute bit position + * `bit_offset` (= Σ_{k(lookback_bit / LIMB_BITS); + sp.lo_off = static_cast(lookback_bit & (LIMB_BITS - 1)); + sp.lo_bits = static_cast(LIMB_BITS - sp.lo_off < bits_to_read ? LIMB_BITS - sp.lo_off : bits_to_read); + const uint32_t hi_bits = static_cast(bits_to_read) - sp.lo_bits; + // window_bits+1 ≤ 32 for our windows ⇒ lo_bits ≤ 32 ⇒ mask fits in uint32. + sp.lo_mask = (uint32_t{ 1 } << sp.lo_bits) - 1; + // If the natural hi-limb read would land past the end of the scalar's storage, + // clamp `hi_limb` to a safe in-range index and mask its contribution to zero. The + // top window's hi_bits worth of bits are conceptually zero (scalar < 2^num_bits ≤ + // num_windows·window_bits). Re-reading lo_limb under a zero mask keeps the slow + // path's two unconditional limb loads branch-free. + if (static_cast(sp.lo_limb) + 1 >= num_uint64_limbs) { + sp.hi_limb = sp.lo_limb; + sp.hi_mask = 0; + } else { + sp.hi_limb = sp.lo_limb + 1; + sp.hi_mask = (uint32_t{ 1 } << hi_bits) - 1; + } + // Fast path: the full (window_bits+1)-bit window lives inside `lo_limb`. hi_bits == 0 + // captures both the in-limb case (window doesn't straddle a 64-bit boundary) and the + // clamped top-window case (above) where hi_mask was forced to 0. + sp.slice_localised_to_one_u64 = (hi_bits == 0); + } + return sp; +} + +/** + * @brief Read (window_bits+1) bits from `scalar_data` (uint64 limbs) using precomputed + * slice params and apply Constantine's signedWindowEncoding to produce a + * `(sign | bucket)` packed digit. Inner-loop body for Stage 1 / Stage 4 — + * fully inlined. 
+ * + * Takes the slice params as scalar value parameters rather than a struct reference + * so the compiler reliably holds them in registers across the inner loop. (Passing + * a const-ref to a small struct sometimes blocks the same hoisting an explicit + * unpack-then-pass guarantees; we saw exactly this regression with the variable-c + * split params before unpacking.) + * + * `slice_localised_to_one_u64` selects the single-load fast path: ~75% of windows + * on typical 254-bit scalars (window_bits ∈ [12, 18]) hit this. Because the slice + * params are loop-invariant within a window, the branch resolves once per inner- + * loop iter and the inner branch predictor pins it. + */ +[[nodiscard]] [[gnu::always_inline]] inline uint32_t get_constantine_packed_digit(const uint64_t* scalar_data, + uint32_t lo_limb, + uint32_t hi_limb, + uint32_t lo_off, + uint32_t lo_bits, + uint32_t lo_mask, + uint32_t hi_mask, + bool slice_localised_to_one_u64, + size_t window_bits) noexcept +{ + uint64_t raw_wide = 0; + if (slice_localised_to_one_u64) { + // Fast path: one load + shift + mask. hi_part vanishes (hi_mask == 0); skip it. + raw_wide = (scalar_data[lo_limb] >> lo_off) & lo_mask; + } else if (lo_mask == 0) { + // Bottom-window fast path: synthetic-zero lookback bit, so the lo_part contribution is + // always 0 (lo_mask == 0). Skip the lo limb load entirely. lo_bits == 1 here, so the + // shift plants the window_bits-bit slice at bits 1..window_bits with bit 0 = 0. + // sp_lo_mask is loop-invariant within a window but is a runtime stack value, so the + // compiler does NOT constant-fold the `(s_lo >> lo_off) & 0 = 0` path inside the + // boundary branch; this explicit check saves ~3 ALU ops per scalar on the bottom window. + raw_wide = (scalar_data[hi_limb] & hi_mask) << lo_bits; + } else { + // Slow path: window straddles a uint64 boundary. 
+ const uint64_t s_lo = scalar_data[lo_limb]; + const uint64_t s_hi = scalar_data[hi_limb]; + const uint64_t lo_part = (s_lo >> lo_off) & lo_mask; + const uint64_t hi_part = (s_hi & hi_mask) << lo_bits; + raw_wide = lo_part | hi_part; + } + // raw fits in window_bits+1 ≤ 32 bits, safe to narrow. + const uint32_t raw = static_cast(raw_wide); + + // signedWindowEncoding(raw, window_bits). raw fits in window_bits+1 bits; bit + // `window_bits` is the sign indicator. + // + // The conditional-negate trick `((encode + neg_mask) ^ neg_mask)` is the standard + // branchless idiom. We use the equivalent `(encode - neg) ^ neg_mask` to break the + // latency chain: `encode - neg` and `neg_mask = -neg` can issue in parallel (both + // depend only on `neg` / `encode`), whereas `encode + neg_mask` first waits for + // `neg_mask` to materialise. Saves one cycle on the inner-loop critical path + // (neg → neg_mask → +neg_mask → ^neg_mask → &val_mask vs neg → {neg_mask, enc_neg} + // in parallel → ^neg_mask → &val_mask). Identical result by: + // neg=0: enc_neg = encode, xored = encode ^ 0 = encode. ✓ + // neg=1: enc_neg = encode−1, xored = (encode−1) ^ −1 = ~(encode−1) = −encode. ✓ + const uint32_t neg = (raw >> window_bits) & uint32_t{ 1 }; + const uint32_t neg_mask = uint32_t{ 0 } - neg; // 0 or 0xFFFFFFFF + const uint32_t val_mask = (uint32_t{ 1 } << window_bits) - 1; + const uint32_t encode = (raw + 1) >> 1; + const uint32_t bucket_idx = ((encode - neg) ^ neg_mask) & val_mask; + + // Pack into (sign | bucket). Stage 1 uses the bucket bits for histograms; Stage 4 + // stores only the sign bit because Stage 6 recovers bucket magnitude from bucket_start. + return (neg << 31) | bucket_idx; +} + +// 128-bit SIMD-friendly 4-wide variant of get_constantine_packed_digit. Computes 4 packed +// digits in parallel via GCC's vector_size extension, which lowers to native SIMD on x86 +// (SSE2), ARM (NEON), and WASM (wasm-simd128). 
The branch on slice path is hoisted from +// the per-call site to the per-window outer loop, so Stage 1 / Stage 4 callers select the +// localised / bottom / boundary specialisation once per window. +// +// We index the scalar via a `const uint32_t*` view rather than the natural `uint64_t*`: +// each lane is one uint32, so a 128-bit SIMD register holds 4 (raw, encode, bucket, …) +// values. `scalar.data` is a `std::array` whose byte layout is identical to +// `uint32_t[8]` on every target we ship to (x86 / ARM / WASM are all little-endian, and the +// codebase already assumes this layout in many places — `from_montgomery`, `uint256_t`, +// etc.). The reinterpret_cast is the same alias pattern. +// +// Returns the four packed digits in `out[0..3]`. The caller scatters them to the histogram +// (Stage 1) or schedule (Stage 4) individually, since the consuming write is a +// non-vectorisable scatter. Switching from 2-wide uint64 to 4-wide uint32 doubles the +// compute throughput per SIMD instruction at the cost of slightly more straddle hits (the +// "localised" fast-path rate drops from ~77 % to ~50 % at c=14), but compute dominates +// per-iter cost so the net win is positive. +using SimdU32x4 = uint32_t __attribute__((vector_size(16))); + +// Helpers return `SimdU32x4` directly so the v128 stays in the SIMD register file end-to-end. +// Wrapping in a 4-uint32 struct round-tripped the v128 through 4 scalar memory slots. + +// uint32-indexed Constantine slice params, mirroring `ConstantineSliceParams` but with +// limb indices measured in 32-bit (rather than 64-bit) chunks. Computed once per window in +// `compute_constantine_slice_params_u32`; consumed by the SIMD x4 helpers below. 
struct ConstantineSliceParamsU32 {
    uint32_t lo_mask;
    uint32_t hi_mask;
    uint32_t lo_limb; // u32 limb index of the lookback bit
    uint32_t hi_limb; // == lo_limb + 1, clamped to last in-range u32 limb at the top window
    uint32_t lo_off;  // bit-offset of the lookback bit within `lo_limb`
    uint32_t lo_bits; // # bits read from `lo_limb` (also acts as the hi_part left-shift amount)
    bool slice_localised_to_one_u32;
    bool is_bottom_window;
};

/**
 * @brief Compute the uint32-indexed slice params for the window that starts at absolute bit
 *        position `bit_offset` and is `window_bits` wide.
 *
 * Bottom window (`bit_offset == 0`): there is no lookback bit, so `lo_mask` is 0 and
 * `lo_bits` is 1; the whole window is read through `hi_limb` (= limb 0) under `hi_mask`
 * and shifted left by one.
 *
 * Every other window reads `window_bits + 1` bits starting at `bit_offset - 1`. `lo_bits`
 * of them come from `lo_limb`; the remaining `hi_bits` come from the next limb. If that
 * next limb would be at or past `num_u32_limbs`, `hi_limb` is clamped to `lo_limb` and
 * `hi_mask` is zeroed so the second load stays in range and contributes nothing.
 * `slice_localised_to_one_u32` is true exactly when no bits are needed from a second limb.
 *
 * @param bit_offset    absolute bit position of the window's first bit
 * @param window_bits   window width in bits; `window_bits + 1` must not exceed 32 for the
 *                      digit to fit one uint32 lane
 * @param num_u32_limbs number of 32-bit limbs available in the scalar's storage
 */
[[nodiscard]] inline ConstantineSliceParamsU32 compute_constantine_slice_params_u32(size_t bit_offset,
                                                                                    size_t window_bits,
                                                                                    size_t num_u32_limbs) noexcept
{
    constexpr size_t LIMB_BITS_U32 = 32;

    // Mask with the low `bits` bits set. A shift by the full lane width is undefined, so
    // the 32-bit case is answered explicitly; all three masks below share this guard.
    const auto low_bits_mask = [](uint32_t bits) noexcept -> uint32_t {
        return (bits >= 32) ? ~uint32_t{ 0 } : ((uint32_t{ 1 } << bits) - 1);
    };

    ConstantineSliceParamsU32 sp{};
    if (bit_offset == 0) {
        sp.lo_limb = 0;
        sp.hi_limb = 0;
        sp.lo_off = LIMB_BITS_U32 - 1;
        sp.lo_bits = 1;
        sp.lo_mask = 0;
        sp.hi_mask = low_bits_mask(static_cast<uint32_t>(window_bits));
        sp.slice_localised_to_one_u32 = false;
        sp.is_bottom_window = true;
        return sp;
    }

    const size_t lookback_bit = bit_offset - 1;
    const uint32_t bits_to_read = static_cast<uint32_t>(window_bits + 1);
    sp.lo_limb = static_cast<uint32_t>(lookback_bit / LIMB_BITS_U32);
    sp.lo_off = static_cast<uint32_t>(lookback_bit & (LIMB_BITS_U32 - 1));

    // Bits still available in `lo_limb` from `lo_off` upwards.
    const uint32_t in_lo = static_cast<uint32_t>(LIMB_BITS_U32 - sp.lo_off);
    sp.lo_bits = (in_lo < bits_to_read) ? in_lo : bits_to_read;
    const uint32_t hi_bits = bits_to_read - sp.lo_bits;
    sp.lo_mask = low_bits_mask(sp.lo_bits);

    if (static_cast<size_t>(sp.lo_limb) + 1 >= num_u32_limbs) {
        // Next limb would be out of range: re-read lo_limb under a zero mask.
        sp.hi_limb = sp.lo_limb;
        sp.hi_mask = 0;
    } else {
        sp.hi_limb = sp.lo_limb + 1;
        sp.hi_mask = low_bits_mask(hi_bits);
    }
    sp.slice_localised_to_one_u32 = (hi_bits == 0);
    sp.is_bottom_window = false;
    return sp;
}

// Gather 4 disjoint uint32 values into one v128 via wasm v128.load32_lane.
On WASM this +// is 1 splat + 3 load32_lane (4 ops); brace-init `{a, b, c, d}` with runtime values emits +// 4 scalar i32.load + 1 splat + 3 replace_lane (8 ops). On native it falls back to brace- +// init which clang lowers to NEON ins / SSE2 pinsrd. +[[nodiscard]] [[gnu::always_inline]] inline SimdU32x4 gather_x4_u32( + const uint32_t* p0, const uint32_t* p1, const uint32_t* p2, const uint32_t* p3, uint32_t idx) noexcept +{ +#ifdef __wasm_simd128__ + v128_t v = wasm_i32x4_splat(0); + v = wasm_v128_load32_lane(p0 + idx, v, 0); + v = wasm_v128_load32_lane(p1 + idx, v, 1); + v = wasm_v128_load32_lane(p2 + idx, v, 2); + v = wasm_v128_load32_lane(p3 + idx, v, 3); + return reinterpret_cast(v); +#else + return SimdU32x4{ p0[idx], p1[idx], p2[idx], p3[idx] }; +#endif +} + +// All four mask / constant v128s (lo_mask_v, hi_mask_v, one_v, val_mask) are loop-invariant +// within a window. Callers build them ONCE per window in the outer-w loop and pass them in, +// so the inner-i compute loop has zero v128.const / splat / shl+sub for the masks. +// `neg_mask = -neg` uses GCC vector-ext unary minus which lowers to `i32x4.neg` on WASM. +// +// Helpers write the v128 result DIRECTLY into the caller's stack buffer via an aligned +// `v128.store` (or equivalent on native). No return-by-value, no temporary, no memcpy — +// the v128 register flows from the bit-pack pipeline straight into the destination buffer. 
+[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_localised(uint32_t* dst, + const uint32_t* scalar_data_0, + const uint32_t* scalar_data_1, + const uint32_t* scalar_data_2, + const uint32_t* scalar_data_3, + uint32_t lo_limb, + uint32_t lo_off, + SimdU32x4 lo_mask_v, + SimdU32x4 one_v, + SimdU32x4 val_mask, + uint32_t window_bits) noexcept +{ + const SimdU32x4 lo = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, lo_limb); + const SimdU32x4 raw = (lo >> lo_off) & lo_mask_v; + const SimdU32x4 neg = (raw >> window_bits) & one_v; + const SimdU32x4 neg_mask = -neg; + const SimdU32x4 encode = (raw + one_v) >> 1; + const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask; + const SimdU32x4 packed = (neg << 31) | bucket; +#ifdef __wasm_simd128__ + wasm_v128_store(dst, reinterpret_cast(packed)); +#else + *reinterpret_cast(dst) = packed; +#endif +} + +[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_bottom(uint32_t* dst, + const uint32_t* scalar_data_0, + const uint32_t* scalar_data_1, + const uint32_t* scalar_data_2, + const uint32_t* scalar_data_3, + uint32_t hi_limb, + uint32_t lo_bits, + SimdU32x4 hi_mask_v, + SimdU32x4 one_v, + SimdU32x4 val_mask, + uint32_t window_bits) noexcept +{ + const SimdU32x4 hi = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, hi_limb); + const SimdU32x4 raw = (hi & hi_mask_v) << lo_bits; + const SimdU32x4 neg = (raw >> window_bits) & one_v; + const SimdU32x4 neg_mask = -neg; + const SimdU32x4 encode = (raw + one_v) >> 1; + const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask; + const SimdU32x4 packed = (neg << 31) | bucket; +#ifdef __wasm_simd128__ + wasm_v128_store(dst, reinterpret_cast(packed)); +#else + *reinterpret_cast(dst) = packed; +#endif +} + +[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_boundary(uint32_t* dst, + const uint32_t* scalar_data_0, + const uint32_t* scalar_data_1, + const uint32_t* 
scalar_data_2, + const uint32_t* scalar_data_3, + uint32_t lo_limb, + uint32_t hi_limb, + uint32_t lo_off, + uint32_t lo_bits, + SimdU32x4 lo_mask_v, + SimdU32x4 hi_mask_v, + SimdU32x4 one_v, + SimdU32x4 val_mask, + uint32_t window_bits) noexcept +{ + const SimdU32x4 lo = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, lo_limb); + const SimdU32x4 hi = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, hi_limb); + const SimdU32x4 lo_part = (lo >> lo_off) & lo_mask_v; + const SimdU32x4 hi_part = (hi & hi_mask_v) << lo_bits; + const SimdU32x4 raw = lo_part | hi_part; + const SimdU32x4 neg = (raw >> window_bits) & one_v; + const SimdU32x4 neg_mask = -neg; + const SimdU32x4 encode = (raw + one_v) >> 1; + const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask; + const SimdU32x4 packed = (neg << 31) | bucket; +#ifdef __wasm_simd128__ + wasm_v128_store(dst, reinterpret_cast(packed)); +#else + *reinterpret_cast(dst) = packed; +#endif +} + +// Path-selector enum (used by Stage 1 / Stage 4 to dispatch on the SIMD specialisation +// once per window rather than once per scalar). 
+enum class ConstantineSlicePath : uint8_t { + Localised = 0, + Bottom = 1, + Boundary = 2, +}; + +[[nodiscard]] [[gnu::always_inline]] inline ConstantineSlicePath classify_slice_path_u32( + const ConstantineSliceParamsU32& sp) noexcept +{ + if (sp.is_bottom_window) { + return ConstantineSlicePath::Bottom; + } + if (sp.slice_localised_to_one_u32) { + return ConstantineSlicePath::Localised; + } + return ConstantineSlicePath::Boundary; +} + +} // namespace bb::scalar_multiplication::round_parallel_detail diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_rewrite_review_map.md b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_rewrite_review_map.md new file mode 100644 index 000000000000..076bc6525ef9 --- /dev/null +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/pippenger_rewrite_review_map.md @@ -0,0 +1,519 @@ +# Pippenger Rewrite Review Map + +This is a reviewer-oriented map of the current Pippenger rewrite stack. It groups the +optimizations by the inefficiency they are trying to exploit, the heuristic or predicate +that activates them, and the specific risks worth reviewing before treating the rewrite as +production-ready. + +## Current Status + +The stack has been rebased after Bernstein-Yang inversion landed separately in +`merge-train/barretenberg` as PR #23426. Treat Bernstein-Yang as a baseline dependency for +this review, not as part of the remaining Pippenger PR diff. When older measurements below +attribute some speedup to "Bernstein-Yang + staged Pippenger", read that as evidence that +the no-dedup path is fast; the currently reviewable Pippenger delta is the staged MSM, +recoding, batching, GLV/dedup plumbing, arena, and thread-pool changes. + +Current branch status: + +- Variable-window split is removed from the production path. +- The dedup cluster-publication bug that broke `ChonkTests.TestCircuitSizes` is fixed by + publishing only flattened clusters. 
+- The original Chonk/wasm/no-GLV arena-overflow reproductions have been rerun successfully + on the current branch: transfer_1 native, transfer_0 wasm, transfer_0 native with + `BB_MSM_NO_GLV=1`, and the dedup cap fallback assertion. +- New small and large arena regressions exposed a separate sizing drift: the pre-Phase-1 + arena sizer used the full bit budget (`254` or GLV `128`), while the live pipeline shrinks + to `effective_num_bits` before choosing `window_bits` and `windows_per_batch`. The current + fix sizes GLV MSMs and large non-GLV MSMs against the maximum reachable effective-bit + layout. +- `ecc_tests` builds after the rebase; remaining fixture-size test fallout has been local to + scalar-multiplication tests whose inputs exceeded the reduced shared fixture. +- The all-flow native/wasm matrix below is the current "do not regress" target. + +Remaining high-value review items: + +1. Keep the now-removed variable-window split out unless a new benchmark suite proves a + retuned model wins. +2. Decide whether the broad `parallel_for` rewrite belongs in this PR or should be split. +3. Remove or split unrelated build/debug/benchmark clutter before final review. +4. Review dedup as a targeted Chonk optimization, especially cap fallback tests and hint + discipline, but it is no longer the active `TestCircuitSizes` blocker. +5. Keep arena sizing under targeted regression tests for both ends of the workload spectrum: + large recursion-VK MSMs and small GLV Honk commitments. + +## Fixed Correctness Issue: Dedup Cluster Publication + +Earlier branch state failed `ChonkTests.TestCircuitSizes` with: + +```text +Assertion failed: (cluster_offsets_size == num_clus +Expected: 8193 +``` + +This pointed at the dedup Phase A bookkeeping, not at Chonk itself. 
+ +In `dedup_phase_a_worker_hash`, `clusters_opened` is incremented when a singleton is promoted +inside the hash table, before the cluster is flattened into `cluster_members` and +`cluster_offsets`: + +- promotion: `clusters_opened++` +- flattening may stop early when `cluster_members_size + this_cluster_members > cluster_members_cap` +- the old invariant assumed every opened cluster was flattened: + `cluster_offsets_size == num_clusters + 1` + +So when the member cap was hit, `clusters_opened` could count clusters that were deliberately +left unflattened. The fix is to publish `num_clusters = cluster_offsets_size - 1`, i.e. the +number of flattened clusters that actually have `cluster_offsets` entries. Promoted but +unflattened entries then have no redirect and fall through to normal Pippenger as intended. + +## Optimization Inventory + +| Area | Inefficiency targeted | Activation / heuristic | Main code | Review risks | +| --- | --- | --- | --- | --- | +| Constantine signed-window recoding | Carry propagation and branchy per-window scalar decoding | Always used in round-parallel path; precomputes per-window slice params and selects bottom/localized/boundary paths | `compute_constantine_slice_params*`, `get_constantine_packed_digit`, SIMD x4 helpers | Boundary-bit correctness, top-window masking, endian/aliasing assumptions for `uint32_t` scalar view | +| Window-size selection | Bad `c` gives too many rounds or too many buckets | Native cost model `rounds * (n + 15 * buckets)`; WASM closed form using `target_load` from logical thread count | `choose_window_bits`, `window_bits_tuning_oversub_factor` | Platform calibration, small/large crossover, whether `n` should be post-GLV working scalars or original points | +| GLV split | Halve scalar bit length at cost of doubling point count | `n_input <= 2^13` native, `n_input <= 2^16` WASM, or caller supplies external GLV table | `GLV_SMALL_N_THRESHOLD`, `glv_threshold`, GLV split/double path | Sign convention for phi 
point, input scalar mutation/restoration asymmetry, memory pressure at crossover | +| Effective bit budget | Avoid windows above the actual largest scalar MSB | After Phase 1, `effective_num_bits` is highest non-empty `msb_hist` bin | Phase 1 `msb_hist` and `effective_num_bits` | Off-by-one in histogram bins; interaction with GLV halves and zero sentinel | +| Trivial MSM fallback | Pippenger scaffolding dominates very sparse or tiny active sets | `pts_per_thread < MIN_PTS_PER_THREAD_FOR_PIPPENGER` (`24`) after zero counting | `trivial_msm_threaded`; constant in header | Correct Montgomery lifecycle before `trivial_msm_threaded`; preserving `PolynomialSpan::start_index` semantics | +| Variable-window split | Mixed scalar sizes waste high-bit windows on small scalars | Removed after traced Chonk runs showed a net regression | deleted `choose_var_window_split` cost model and upper-region dispatch | Keep deleted unless a new benchmark suite proves a retuned split model wins | +| Round-parallel pipeline | Legacy per-thread work balance and repeated bucket reductions | Main path after dispatch: stages 1-7 over window batches sized by arena budget | staged pipeline in `pippenger_round_parallel` | Race-free cursor reuse, per-window capacity, Stage 1 and Stage 4 decode equivalence | +| SIMD digit extraction | Scalar decoding is compute-heavy and non-vectorized | `SIMD_BATCH = 64`; 4-wide `uint32_t` vector helpers selected by per-window path | x4 Constantine digit helpers and Stage 1/4 decode loops | Strict aliasing/layout assumptions, tail handling, all-included mask path | +| In-place histogram/prefix reuse | Avoid separate bucket-total and cursor buffers | `digit_cursors` is counts in Stage 1, per-thread offsets in Stage 2, scatter cursors in Stage 4 | Stage 1-4 `digit_cursors` reuse | Stage ordering, no read-after-overwrite mistakes, capacity and bucket 0 handling | +| Dedup pre-pass | Duplicate scalar values in witness/permutation polynomials cause repeated base-point 
additions | Explicit `dedup_hint`; long scalars only (`msb >= c_threshold`); caps: 16,384 clusters and 32,768 members | `dedup_phase_a_worker_hash`; hints wired through `CommitmentKey` | Fixed cap-publication bug; still review cap fallback tests, duplicate detection by one-limb fingerprint plus memcmp, and GLV interaction | +| Dedup patching | Keep hot Stage 4 loop dedup-free after first batch | First batch emits ordinary schedule, Phase A populates redirects, `dedup_patch_schedule_window` compacts skips; later batches omit skips up front | `dedup_patch_schedule_window`; Stage 1/4 dedup-known paths | First-batch vs later-batch equivalence, sign preservation on redirects, no stale redirects for capped-out clusters | +| Arena zoning | Reduce allocator churn and WASM fragmentation; bound resident scratch | `compute_arena_bytes_for_msm`, `BATCH_MEM_BUDGET = 32 MiB`, Zone P/W/S layout | arena sizer and Zone P/W/S layout in `pippenger_round_parallel` | Sizer and allocator formulas must stay exactly mirrored; must dominate runtime `effective_num_bits` layouts for GLV and non-GLV; absolute alignment; zero-initialization assumptions | +| Per-worker scratch overlay | Avoid summing all scratch lifetimes into memory budget | Phase A and Stage 6 scratch share Zone W union because they run in separate parallel phases | Phase A and Stage 6 Zone W scratch allocation | No overlapping lifetimes; worker id equals task id assumption; later refactors can violate this silently | +| Recursive affine bucket reduction | Replace projective bucket suffix sums with batched affine additions/doublings | Stage 6b always rebalances bucket ranges; stride is power-of-two; trivial stride <= 2 fallback | `recursive_affine_bucket_reduce_strided`; Stage 6b | Algebraic equivalence of `R`/`L`; batch-affine breakeven fallback; handling sparse windows and empty chunks | +| Dense bucket partials | Avoid sorted scans during cross-thread merge | Stage 6a writes dense per-thread bucket rows; Stage 6b looks up 
overlapping digit ranges directly | Stage 6a dense partials; Stage 6b merge | Boundary buckets shared by original chunks, overflow buffer sizing, present bitmap reset coverage | +| Batched MSM sharing | Chonk commits many MSMs over the same SRS prefix | Batch driver runs one MSM at a time but shares GLV-doubled SRS buffer and one max-sized arena | `pippenger_round_parallel_batched` | Pointer-range grouping assumes shared contiguous SRS allocation; no cross-MSM scalar scheduling is actually batched | + +## Dedup-Specific Review Checklist + +Dedup is now a targeted secondary optimization rather than the active Chonk blocker. It is +enabled only through hints, and public-transfer traces show the hints are concentrated on +duplicate-heavy Honk wires, `Z_PERM`, and small ECCVM polynomials. Review it as a separate +feature before judging the whole rewrite. + +1. Confirm the hinted call sites are the intended duplicate-heavy polynomials, not blanket + activation. Hints enter via `CommitmentKey::commit`, `batch_commit`, and `BatchBuilder`. +2. Keep cap fallback mechanically correct: flattened cluster count, `cluster_offsets_size`, + published redirects, and `extra_points` must describe the same set of clusters. + `clusters_opened` is diagnostic only and may include clusters that intentionally fall + through to normal Pippenger. +3. Add or strengthen tests where the cap is hit by many small clusters, not only one giant + cluster. The existing cap/carry test describes a mega-cluster shape, which would not catch + opened-but-unflattened many-cluster drift. +4. Check first-batch versus later-batch equivalence: Phase A is based on the first emitted + schedule, and redirects are reused for later windows after schedule patching. +5. Check GLV interaction: after GLV, duplicate scalar halves may not correspond to duplicate + original scalars, and points are `[P, phi(P)]`. 
Dedup is still algebraically valid if it + aggregates points attached to equal working scalar values, but tests should cover it. + +## Suggested Review Order + +1. Keep correctness green on the current branch, especially Chonk flow tests, wasm prove, + `BB_MSM_NO_GLV=1`, UltraHonk small-range tests, recursion-VK tests, and dedup + cap/fallback tests. +2. Lock down algebraic equivalence tests for the staged pipeline using random scalars, + sparse scalars, duplicate-heavy scalars, and GLV threshold boundaries. +3. Review memory safety after correctness: arena sizing mirrors, effective-bit schedule + sizing, worker scratch lifetimes, overflow bounds, and capacity assumptions. +4. Audit PR scope: split or remove benchmark/debug/build clutter and decide whether the global + thread-pool rewrite belongs with Pippenger. +5. Treat benchmark numbers as meaningful only after the scope and correctness questions above + are settled. Remaining calibrated constants include `GLV_SMALL_N_THRESHOLD`, + `BATCH_CAPACITY`, and the 32 MiB arena budget. + +## Independent Clutter / Split-Out Candidates + +Some changes in the branch are not intrinsically part of the Pippenger arithmetic rewrite. +They either change unrelated runtime behavior or add development scaffolding that makes the +review harder. Treat these as candidates for removal or separate PRs unless a bench proves +they are required for the headline result. 
+ +| File / area | Change | Why it is clutter or too broad | Suggested disposition | +| --- | --- | --- | --- | +| `barretenberg/cpp/CMakePresets.json` | Removes the `WASI_SDK_PREFIX=/opt/wasi-sdk` default from the `wasm-threads` preset | Build-system regression; no MSM performance value | Revert in this PR | +| `barretenberg/cpp/src/barretenberg/bbapi/bbapi_chonk.cpp` | Adds `BB_SKIP_SANITY_VERIFY` | Benchmark/debug convenience that weakens the default prove path's self-check | Remove or keep only in a benchmark harness | +| `barretenberg/cpp/src/barretenberg/sumcheck/sumcheck_round.hpp` | Adds one `BB_BENCH_NAME` inside sumcheck | Profiling annotation outside MSM/commitment code | Move to profiling-only cleanup if desired | +| `barretenberg/cpp/src/barretenberg/vm2/constraining/prover.cpp` | Removes `AVM_MAX_MSM_BATCH_SIZE` batching control | Changes AVM prover behavior as a side effect of commitment batching | Revert unless the new commitment API requires it and AVM is measured | +| `barretenberg/cpp/src/barretenberg/benchmark/pippenger_bench/*` | Deletes `thread_scaling`, adds `small_msm_matrix`, rewrites `pippenger.bench` | Useful development tooling, but it expands review surface | Split into benchmark/support PR or keep only minimal reproducible benches | + +The global `parallel_for` rewrite in `barretenberg/cpp/src/barretenberg/common/thread.cpp` is +not simple clutter, but it is too broad for a Pippenger PR unless it is necessary for the +measured win. It changes scheduling for every `parallel_for` caller in barretenberg: sumcheck, +translator, VM2, ECCVM, and non-MSM prover code can all regress independently. Test this by +reverting/isolating the thread-pool rewrite and rerunning the native public-transfer bench. If +the MSM rewrite keeps most of the win, split the thread-pool change out. + +Similarly, `barretenberg/cpp/cmake/threading.cmake` adding `-msimd128` may support the wasm +SIMD copy path, but it changes wasm runtime requirements. 
Keep it only with a separate wasm +compatibility justification and benchmarks; otherwise remove it from the native-focused +Pippenger rewrite. + +Dedup hint plumbing in Oink, ECCVM, and Translator is not independent clutter, but it is +speculative. Keep only hints whose labels show meaningful `duplicate_excess / size` under +`BB_COMMITMENT_DEDUP_TRACE=1`; remove blanket hints that do not pay. + +## Instrumentation + +The branch has local MSM tracing and ablation switches in `scalar_multiplication.cpp`: + +- `BB_MSM_TRACE=1` emits one `BB_MSM_TRACE {...}` line per MSM. +- `BB_COMMITMENT_DEDUP_TRACE=1` emits one `BB_COMMITMENT_DEDUP_TRACE {...}` line per + commitment candidate, including Chonk polynomial labels when the commitment goes through a + batch. +- `BB_IPA_TRACE=1` emits the IPA opening size ladder: one start line and one line per IPA + reduction round. +- `BB_MSM_NO_GLV=1` disables inline and shared batched GLV. +- `BB_MSM_NO_DEDUP=1` ignores dedup hints and sizes the arena accordingly. + +Useful trace fields: + +- `n_input`, `n_working`, `n_active` +- `use_glv`, `external_glv` +- `dedup_hint`, `dedup_active`, `dedup_clusters`, `dedup_ms` +- `effective_num_bits`, `window_bits`, `windows_per_batch` +- `phase1_ms`, `pipeline_ms`, `total_ms` + +For the `ecdsar1+transfer_0_recursions+sponsored_fpc` flow, compare the full branch against: + +```bash +BB_MSM_TRACE=1 +BB_MSM_TRACE=1 BB_MSM_NO_GLV=1 +BB_MSM_TRACE=1 BB_MSM_NO_DEDUP=1 +BB_MSM_TRACE=1 BB_MSM_NO_GLV=1 BB_MSM_NO_DEDUP=1 +``` + +The fastest way to answer the current attribution question is to group trace lines by +`curve`, `n_input`, `use_glv`, and `dedup_clusters`. If the large `2^19` BN254 MSMs +still improve with `use_glv=false` and `dedup_clusters=0`, the staged Pippenger path is +likely a real contributor. If the wins concentrate in `n_input <= 8192` or duplicate-heavy +calls, the headline should be narrowed to GLV, fallback, and dedup-heavy workloads. 
+ +For dedup attribution by Chonk polynomial, run the same flow with: + +```bash +BB_MSM_TRACE=1 BB_COMMITMENT_DEDUP_TRACE=1 BB_IPA_TRACE=1 +``` + +`BB_COMMITMENT_DEDUP_TRACE` reports exact duplicate density only for dedup-hinted +polynomials, so it should stay cheap enough to use on full Chonk flows while answering which +labels are actually responsible for the dedup win. Group by `label`, `size`, and +`duplicate_excess`; the labels with the largest `duplicate_excess / size` should line up with +the MSM trace lines that have large `dedup_clusters`. + +`BB_IPA_TRACE` has no dedup stats because IPA scalars are challenge-derived and call +`pippenger_unsafe` without a duplicate hint. Its purpose is to correlate the Grumpkin IPA +round ladder with `BB_MSM_TRACE` and `batch_mul_with_endomorphism` timings, especially the +`2^15 -> ... -> 1` sequence in ECCVM IPA. + +## Empirical Results + +### `ecdsar1+transfer_0_recursions+sponsored_fpc`, native (clang20-no-avm, 16 threads) + +Historical measurement on branch `lde/zacs-pippenger` before the Bernstein-Yang rebase, +compared with baseline `merge-train/barretenberg` (`4da6ab07f2c`), EC2 single run. The flow +matrix below includes later reruns after instrumentation, variable-split removal, and the +dedup cap publication fix. Because Bernstein-Yang has since landed separately, use these +numbers for workload attribution, not as a clean PR-vs-current-base diff. 
+ +Native Chonk flow matrix: + +| Flow | Circuits | Baseline `ChonkAPI::prove` | Branch `ChonkAPI::prove` | Status | +| --- | --- | --- | --- | --- | +| `ecdsar1+transfer_0_recursions+sponsored_fpc` | 9 | 4.48 s | 3.43 s median | -23.4% | +| `ecdsar1+transfer_1_recursions+private_fpc` | 17 | 7.75 s | 6.10 s | -21.3% | + +| Stage | Baseline | Branch | Delta | +| --- | --- | --- | --- | +| `ChonkAPI::prove` (total) | 4.48 s | 3.46 s | -22.8% | +| `OinkProver::prove` (8 calls, avg/iter) | 891.5 ms (111.4 ms) | 568.6 ms (71.1 ms) | -36.2% | +| `Goblin::prove_eccvm` | 829.5 ms | 574.2 ms | -30.8% | +| `IPA::compute_opening_proof` | 292.1 ms | 170.0 ms | -41.8% | +| `MSM::batch_multi_scalar_mul` (oink, 38 calls) | 1.06 s (27.9 ms) | 659 ms (17.3 ms) | -37.8% | +| `CommitmentKey::commit` (oink wires, 53 calls) | 263.4 ms (4.97 ms) | 151.3 ms (2.85 ms) | -42.6% | +| `CommitmentKey::commit` (z_perm, 5 calls) | 189.2 ms (37.8 ms) | 133.7 ms (26.7 ms) | -29.4% | +| `batch_mul_with_endomorphism` (IPA, 15 calls) | 180.7 ms (12.05 ms) | 108.9 ms (7.26 ms) | -39.7% | +| `ChonkLoad` (msgpack decode, no MSM) | 100.1 ms | 106.8 ms | +6.7% (noise) | + +`IPA::compute_opening_proof` runs on random IPA challenge scalars with no `dedup_hint`, +so its -42% historical delta is attributable to the no-dedup path: round-parallel pipeline, +Bernstein-Yang inversion, and batch-affine bucket accumulation. Since Bernstein-Yang is now +in the base branch, current review should focus on the remaining Pippenger-side pieces of +that no-dedup path. The per-call oink-commit delta (-43%) is roughly the same magnitude, +implying dedup adds at most a few percent over the no-dedup baseline on this workload, not +the 20-30% earlier guess. + +### Native ablations, same flow + +All runs are single-run EC2 native (`clang20-no-avm`, 16 threads), comparing against the +uninstrumented branch wallclock of 3.46 s. 
The first ablation set was collected before the +dedup publication fix; the `BB_MSM_NO_GLV=1` abort is historical and has since been rerun +successfully. + +| Run | `ChonkAPI::prove` | Delta vs branch | Implication | +| --- | --- | --- | --- | +| Branch, uninstrumented | 3.46 s | baseline | Full rewrite result | +| `BB_MSM_NO_DEDUP=1` | 3.57 s | +0.11 s (+3.2%) | Dedup saves about 110 ms | +| `BB_MSM_NO_GLV=1 BB_MSM_NO_DEDUP=1` | 3.61 s | +0.15 s (+4.3%) | GLV adds about 40 ms on top of dedup | +| `BB_MSM_NO_GLV=1` | historical abort | - | Historical arena/cap symptom; current branch proves this path | + +Attribution against the full baseline-to-branch delta (`4.48 s -> 3.46 s`, 1.02 s saved): + +| Source | Approx saved | Share of baseline wallclock | Share of branch win | +| --- | --- | --- | --- | +| Dedup | 110 ms | ~2.5% | ~12% | +| GLV | 40 ms | ~1% | ~3% | +| Non-dedup, non-GLV rewrite | 870 ms | ~19.5% | ~85% | + +This materially changes the review posture: the rewrite's native win on this flow does not +stand or fall on dedup or GLV. The actual headline is the no-dedup, non-GLV path: staged +affine bucket reduction, batch-affine arithmetic, round-parallel scaffolding, Constantine +recoding, plus Bernstein-Yang in the historical baseline comparison. Since Bernstein-Yang is +now in merge-train, the remaining review should focus on the staged Pippenger machinery. The +no-dedup IPA evidence above is still useful: IPA drops 122 ms historically without duplicate +stripping. + +The old `BB_MSM_NO_GLV=1` abort hit the same `aligned_local + bytes <= bound_bytes` arena +assertion class as the wasm crash, but it no longer reproduces on the current branch. Treat +it as evidence for the fixed dedup cap / removed split-path sizing work, not as an open +arena blocker. + +### Triple-traced public-transfer ablation + +Same `ecdsar1+transfer_0_recursions+sponsored_fpc` native flow with +`BB_MSM_TRACE=1 BB_COMMITMENT_DEDUP_TRACE=1 BB_IPA_TRACE=1`. 
The extra per-coefficient +duplicate sort raises logging overhead to about 5%, so these deltas are relative to the +traced branch baseline of 3.66 s, not the uninstrumented 3.46 s. + +| Run | `ChonkAPI::prove` | Delta vs traced branch | Implication | +| --- | --- | --- | --- | +| Traced branch | 3.66 s | baseline | Full branch with tracing | +| `BB_MSM_NO_VAR_SPLIT=1` | 3.64 s | -20 ms | Variable split was a small wallclock regression before removal | +| `BB_MSM_NO_DEDUP=1` | 3.75 s | +90 ms | Dedup saves about 90 ms under tracing | + +Dedup payload by hinted label, sorted by `zero_count + duplicate_excess` ("bucket adds +avoided"): + +| Label | Calls | Total n | Zeros | Real dup excess | Avoided | Avoided / n | +| --- | --- | --- | --- | --- | --- | --- | +| `W_4` | 9 | 444,229 | 188,073 | 87,968 | 276,041 | 62.1% | +| `W_O` | 9 | 444,229 | 196,970 | 75,721 | 272,691 | 61.4% | +| `W_R` | 9 | 444,229 | 141,131 | 131,493 | 272,624 | 61.4% | +| `W_L` | 9 | 444,229 | 111,274 | 159,766 | 271,040 | 61.0% | +| (unlabeled) commit path | 2 | 163,838 | 1 | 87,969 | 87,970 | 53.7% | +| `Z_PERM` | 9 | 444,229 | 1 | 69,576 | 69,577 | 15.7% | +| ECCVM `MSM_X*` / `MSM_Y*` | 1 each | 4,953 each | ~1,100 | ~3,000 | ~4,000 | 67-84% | +| ECCVM `PRECOMPUTE_DX/DY` | 1 each | 4,952 each | 1,085 | 3,494 | 4,579 | 92% | +| ECCVM `TRANSCRIPT_*` accumulators | 1 each | 4,952 each | 4,147-4,478 | 142-763 | 4,610-4,910 | 93-99% | + +The wires are the dominant target: `W_L/R/O/4` account for about 1.09M of 1.31M avoided +bucket additions across the prove, roughly 83% of the dedup payload. `Z_PERM` is the smallest +hinted Honk polynomial by density, but it has essentially no zeros; its 15.7% comes from real +constant-product stretches, not padding. The ECCVM hints are tiny in aggregate but high +density; transcript accumulator hints are mostly a single large zero cluster, so a simpler +zero-strip path may be cheaper there than the full dedup state machine.
+ +Structural zeros versus real repeats in the main Honk polynomials: + +| Label | Zero share | Real-dup share | +| --- | --- | --- | +| `W_L` | 25% | 36% | +| `W_R` | 32% | 30% | +| `W_O` | 44% | 17% | +| `W_4` | 42% | 20% | +| `Z_PERM` | 0% | 16% | + +This means dedup is not just an expensive zero-stripper. Wires are a mix of sparse padding and +genuine value reuse; `W_L` and `W_R` have more real duplicates than zeros, and `Z_PERM` is +purely real repeats. + +Order-joined MSM timing reproduces the dedup wallclock delta at the MSM level: + +| `n_input` bucket | Calls | Dedup-active calls | `NO_DEDUP - baseline` total_ms | Avg `dedup_clusters` | +| --- | --- | --- | --- | --- | +| 256-1k | 14 | 0 | -1 ms | - | +| 1k-4k | 27 | 0 | -7 ms | - | +| 4k-16k | 85 | 21 | +19 ms | 984 | +| 16k-64k | 37 | 21 | +29 ms | 1,931 | +| 64k-128k | 35 | 21 | +55 ms | 5,111 | +| 128k+ | 3 | 0 | -8 ms | - | +| Total heavy MSMs | 201 | 63 | +87 ms | - | + +About 63% of the dedup gain is in the 64k-128k bucket, exactly the Honk wire/z_perm commits. +The 4k-16k bucket contributes a smaller but real payoff from the ECCVM polynomials. + +Variable-window split looks like an anti-optimization on this Chonk flow: + +| Bucket | Calls | `split=true` in baseline | `NO_VAR_SPLIT - baseline` total_ms | +| --- | --- | --- | --- | +| 16k-64k | 37 | 14 | -17 ms | +| 64k-128k | 35 | 16 | -16 ms | +| Others | 129 | 1 | -11 ms | +| Total heavy MSMs | 201 | 31 | -44 ms | + +The predictor fires 31 times and loses about 1.4 ms per split decision. The current rule +accepts a split when predicted cost is at most 85% of unsplit; on this workload the predictor +is either overestimating split savings or the unsplit path has become fast enough that this +margin was too generous. The variable split path has since been removed from the branch. + +IPA structure from the same trace: one Grumpkin IPA opening uses `poly_length=32768`, 15 +rounds, 30 Pippenger calls, and 15 `batch_mul_with_endomorphism` calls. 
The round ladder is +`16384 -> ... -> 1`. None of these calls has a dedup hint, so the IPA part of the +historical speedup is entirely non-dedup: Bernstein-Yang inversion plus staged affine bucket +reduction, round-parallel pipeline, and batch-affine arithmetic. After the BY rebase, only +the staged Pippenger pieces remain part of this PR's diff. + +Updated attribution for this flow: + +| Component | Approx effect | Review implication | +| --- | --- | --- | +| Non-dedup, non-GLV, non-var-split Pippenger path | ~960 ms historical saved including BY | Main headline; BY is now baseline, so focus review on remaining staged MSM machinery | +| Dedup | ~90 ms saved | Real and well targeted; mostly Honk wires | +| GLV | ~40 ms saved | Small contributor from prior ablation | +| Variable-window split | ~44 ms regression | Removed; keep it out unless a new benchmark proves otherwise | + +Concrete actions from this trace: + +1. Keep `choose_var_window_split` removed unless a new benchmark suite justifies rebuilding it. +2. Keep dedup as a targeted Chonk optimization; the cap-publication bug is fixed, but tests + should still cover cap fallback shapes. +3. Consider replacing the ECCVM transcript accumulator dedup case with a cheaper zero-heavy + path if it remains measurable after the correctness work. + +### `ecdsar1+transfer_1_recursions+private_fpc`, native + +Baseline `merge-train/barretenberg` (`4da6ab07f2c`) proves this flow in 7.75 s. The current +branch, after variable-split removal and the dedup cap publication fix, proves it in 6.10 s +single-run: a 1.65 s / 21.3% speedup. + +An earlier branch state aborted before timing could be collected: + +```text +aligned_local + bytes <= bound_bytes +1.70 MB needed vs 1.21 MB cap +``` + +This flow is roughly "more of the same" compared with transfer_0: 17 circuits vs 9 circuits, +and baseline wallclock scales from 4.48 s to 7.75 s. 
Per-circuit baseline time is slightly +lower on transfer_1 (456 ms vs 498 ms), so the private-recursive flow is not a qualitatively +different workload. The current branch now proves this larger real Chonk workload, so the +historical native speedup signal holds beyond the shorter public-transfer flow. + +Baseline slices: + +| Stage | Baseline time | Calls x avg | +| --- | --- | --- | +| `Chonk::accumulate_and_fold` | 4.12 s | 16 x 257.7 ms | +| Dominant Mega `OinkProver::prove` | 2.14 s | 16 x 133.5 ms | +| `commit_to_wires` | 855.8 ms | 17 x 50.3 ms | +| `commit_to_z_perm` | 782.4 ms | 17 x 46.0 ms | +| `commit_to_lookup_counts_and_w4` | 387.5 ms | 17 x 22.8 ms | +| `commit_to_logderiv_inverses` | 225.2 ms | 17 x 13.2 ms | +| `HypernovaFoldingProver::sumcheck` | 894.3 ms | 16 x 55.9 ms | +| `Goblin::prove_eccvm` | 995.0 ms | - | +| `IPA::compute_opening_proof` | 276.3 ms | - | +| `BatchedHonkTranslatorProver::prove` | 944.5 ms | - | +| `MSM::batch_multi_scalar_mul` (top context) | 2.25 s | 70 x 32.1 ms | + +The prior abort is now best treated as a removed-path/cap-publication correctness symptom, +not proof that the whole unsplit arena model is broken. Variable-split removal deleted the +split-specific sizing branch, and the dedup cap fix prevents promoted-but-unflattened +clusters from being published. + +### `BB_MSM_TRACE=1` aggregates, same flow + +525 MSM calls captured. Logging overhead 3.46 -> 3.52 s (~2%). 
+ +| Path | Calls | Total | Avg | +| --- | --- | --- | --- | +| `pippenger_round_parallel` (heavy) | 201 | 1186 ms | 5.90 ms | +| `trivial_pre` / `trivial_post_profile` | 312 | ~0 ms | 0 | +| `empty` | 12 | 0 ms | 0 | + +Heavy-path breakdown by `n_input`: + +| `n_input` | Calls | Total | Avg | Dedup-active calls | Avg `dedup_clusters` | +| --- | --- | --- | --- | --- | --- | +| 256-1k | 14 | 9 ms | 0.64 ms | 0 | - | +| 1k-4k | 27 | 29 ms | 1.07 ms | 0 | - | +| 4k-16k | 85 | 90 ms | 1.06 ms | 21 | 985 | +| 16k-64k | 37 | 336 ms | 9.08 ms | 21 | 1930 | +| **64k-128k** | **35** | **543 ms** | **15.51 ms** | **21** | **5111** | +| 128k+ | 3 | 179 ms | 59.67 ms | 0 | - | + +Observations: + +- The 64k-128k bucket dominates wallclock (543 ms = 15% of total prove). 5111 clusters on + 88-128k inputs corresponds to ~4-6% cluster density - matches the "few huge clusters" + shape from structural-padding zeros and constant z_perm regions. +- Dedup fires on 63 of 201 heavy calls, distributed as exactly 21 in each of the 4k-16k, + 16k-64k, 64k-128k buckets. That is 7 dup-hinted commits per prover stage x 3 prover + stages, i.e. wires + z_perm getting consistent dedup activation. No + `dedup_hint=true,dedup_active=false` cases were observed on this flow. +- 128k+ MSMs (ECCVM/IPA SRS commits) correctly run without dedup; their scalars are + challenges and zero-padding does not appear. +- Trace currently reports `dedup_clusters` but not `dedup_members_flattened` / + `dedup_members_dropped`. Adding those would make cap-fallback behavior directly observable + rather than relying only on code reading and targeted tests. + +### Arena-overflow reproductions and current diagnosis + +Earlier branch states had several `aligned_local + bytes <= bound_bytes` or dedup-layout +assertions. The first group is closed, but later CI found a second arena-sizing bug that is +independent of variable split and dedup publication. 
+ +| Reproduction | Symptom | Current branch outcome | +| --- | --- | --- | +| transfer_0 native + `BB_MSM_NO_GLV=1` | Arena assertion during ablation | Proves in 3.47 s | +| transfer_0 wasm | ~8% arena overflow, 674 KB needed vs 624 KB cap | Proves in 8.71 s | +| transfer_1 native, no flags | ~40% arena overflow, 1.70 MB needed vs 1.21 MB cap | Proves in 6.16 s / 6.10 s single-runs | +| dedup cap fallback | `cluster_offsets_size == num_clusters + 1` drift | Fixed by publishing only flattened clusters | +| `HonkRecursionConstraintTestWithoutPredicate/2.GenerateVKFromConstraints` | large BN254 non-GLV arena assertion, schedule allocation `26,454,272` bytes vs `25,505,329` Zone S cap | Fixed by sizing large non-GLV MSMs against max reachable `effective_num_bits` layout | +| `RangeTests/0.LimbedRangeConstraint133Bits` | small BN254 GLV arena assertion, `507,712` bytes vs `488,933` cap | Fixed by applying the same effective-bit layout sizing to GLV MSMs | + +Current diagnosis: there are at least three distinct fixed correctness issues in the arena / +dedup area, not one generic failure mode. Variable-split removal closed the old split-path +sizing branch, the dedup publication fix closed promoted-but-unflattened clusters, and the +latest arena fix makes the pre-Phase-1 sizer dominate the runtime `effective_num_bits` +schedule choice. Arena zoning remains a top review area because every future Zone P/W/S +allocation change must update both the sizer and the typed allocator layout. + +### Two preset/cmake regressions noted while reproducing + +Outside MSM code itself, the branch silently changed wasm/cmake behavior: + +- `CMakePresets.json` removed the `WASI_SDK_PREFIX=/opt/wasi-sdk` default from the + `wasm-threads` preset environment block. Builds now fail with + a header-not-found `#include` error unless `WASI_SDK_PREFIX` is exported externally. +- `cmake/threading.cmake` added `-msimd128` for WASM multithreaded builds. 
Hot loops + (Phase 5a sched -> pts copy) depend on `v128.load/store` at runtime, so any older + V8/wasmtime would now fail differently. The bench machine runs wasmtime 43, which is + fine; production wasm consumers should be checked. + +### Full bench matrix: all 11 IVC flows x {native, wasm} x {baseline, branch} + +Single-run, EC2 16 threads. Native: `clang20-no-avm`. WASM: `wasm-threads` + wasmtime 43 +with `-W threads=y -W shared-memory=y -S threads=y`. Branch state for these numbers has +variable-split removed and the dedup cap publication fix. Baseline is historical +`merge-train/barretenberg` (`4da6ab07f2c`), so after the Bernstein-Yang rebase the matrix is +best used as the workload coverage and "do not regress" target rather than a clean diff +against today's merge-train. All numbers are `ChonkAPI::prove` wallclock in seconds. + +| Flow | Base nat | Branch nat | Native delta | Base wasm | Branch wasm | WASM delta | +| --- | ---: | ---: | ---: | ---: | ---: | ---: | +| `deploy_ecdsar1+sponsored_fpc` | 5.47 | 4.27 | -21.9% | 14.83 | 10.88 | -26.6% | +| `deploy_schnorr+sponsored_fpc` | 5.19 | 3.99 | -23.1% | 14.04 | 10.15 | -27.7% | +| `ecdsar1+amm_add_liquidity_1_recursions+sponsored_fpc` | 8.69 | 6.97 | -19.8% | 23.64 | 18.11 | -23.4% | +| `ecdsar1+deploy_tokenContract_with_registration+sponsored_fpc` | 5.82 | 4.58 | -21.3% | 15.66 | 11.74 | -25.0% | +| **`ecdsar1+storage_proof_7_layers+sponsored_fpc`** | **13.60** | **11.96** | **-12.1%** | **43.28** | **37.11** | **-14.3%** | +| `ecdsar1+token_bridge_claim_private+sponsored_fpc` | 5.19 | 4.07 | -21.6% | 14.00 | 10.41 | -25.6% | +| `ecdsar1+transfer_0_recursions+private_fpc` | 6.98 | 5.54 | -20.6% | 19.02 | 14.26 | -25.0% | +| `ecdsar1+transfer_0_recursions+sponsored_fpc` | 4.48 | 3.46 | -22.8% | 11.92 | 8.71 | -26.9% | +| `ecdsar1+transfer_1_recursions+private_fpc` | 7.74 | 6.16 | -20.4% | 20.99 | 15.84 | -24.5% | +| `ecdsar1+transfer_1_recursions+sponsored_fpc` | 5.10 | 3.96 | -22.4% | 13.67 | 10.09 | 
-26.2% | +| `schnorr+deploy_tokenContract_with_registration+sponsored_fpc` | 5.55 | 4.32 | -22.2% | 14.99 | 11.08 | -26.1% | +| **Sum** | **73.81** | **59.28** | **-19.7%** | **206.04** | **158.38** | **-23.1%** | + diff --git a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp index a1bd31c80db4..10cf1141652b 100644 --- a/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp +++ b/barretenberg/cpp/src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.cpp @@ -1,5 +1,7 @@ #include "./scalar_multiplication.hpp" +#include "./pippenger_arena_layout.hpp" +#include "./pippenger_constantine.hpp" #include "barretenberg/common/assert.hpp" #include "barretenberg/common/thread.hpp" #include "barretenberg/ecc/curves/bn254/bn254.hpp" @@ -16,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -115,434 +118,12 @@ template return static_cast(lo_slice | (hi_slice << lo_bits)); } -/** - * @brief Per-window precomputed slice parameters for the carry-less signed-Booth window - * recoding (after Constantine `signedWindowEncoding` / `getSignedFullWindowAt`, - * `constantine/math/arithmetic/bigints.nim`). Computed once per window outside the - * Stage 1 / Stage 4 inner loops; the per-(point, window) hot path is then 2 i32 - * loads + a fixed bit-twiddle, no branches and no per-iter address arithmetic. - * Carry-less because every non-bottom window's c+1-bit read shares its boundary bit - * with the previous window — the bit a non-overlapping recoder would carry. - */ -// Precomputed per-window slice params for Constantine's signed-Booth recoder. -// -// `slice_localised_to_one_u64`: true iff every bit of the c+1-bit window lives inside a -// single uint64 limb. 
~75% of windows on typical 254-bit scalars with c ∈ [12, 18] -// (lookback bits at non-boundary positions) hit this and take the fast path: one load, -// one shift, one mask. The slow path is the boundary-straddling case + the synthetic- -// lookback bottom window. -struct ConstantineSliceParams { - uint32_t lo_mask; - uint32_t hi_mask; - uint32_t lo_limb; - uint32_t hi_limb; // == lo_limb + 1, except clamped to last valid limb at the top window - uint32_t lo_off; - uint32_t lo_bits; - bool slice_localised_to_one_u64; -}; - -/** - * @brief Compute the Constantine slice params for a window starting at absolute bit position - * `bit_offset` (= Σ_{k(lookback_bit / LIMB_BITS); - sp.lo_off = static_cast(lookback_bit & (LIMB_BITS - 1)); - sp.lo_bits = static_cast(LIMB_BITS - sp.lo_off < bits_to_read ? LIMB_BITS - sp.lo_off : bits_to_read); - const uint32_t hi_bits = static_cast(bits_to_read) - sp.lo_bits; - // window_bits+1 ≤ 32 for our windows ⇒ lo_bits ≤ 32 ⇒ mask fits in uint32. - sp.lo_mask = (uint32_t{ 1 } << sp.lo_bits) - 1; - // If the natural hi-limb read would land past the end of the scalar's storage, - // clamp `hi_limb` to a safe in-range index and mask its contribution to zero. The - // top window's hi_bits worth of bits are conceptually zero (scalar < 2^num_bits ≤ - // num_windows·window_bits). Re-reading lo_limb under a zero mask keeps the slow - // path's two unconditional limb loads branch-free. - if (static_cast(sp.lo_limb) + 1 >= num_uint64_limbs) { - sp.hi_limb = sp.lo_limb; - sp.hi_mask = 0; - } else { - sp.hi_limb = sp.lo_limb + 1; - sp.hi_mask = (uint32_t{ 1 } << hi_bits) - 1; - } - // Fast path: the full (window_bits+1)-bit window lives inside `lo_limb`. hi_bits == 0 - // captures both the in-limb case (window doesn't straddle a 64-bit boundary) and the - // clamped top-window case (above) where hi_mask was forced to 0. 
- sp.slice_localised_to_one_u64 = (hi_bits == 0); - } - return sp; -} - -/** - * @brief Read (window_bits+1) bits from `scalar_data` (uint64 limbs) using precomputed - * slice params and apply Constantine's signedWindowEncoding to produce a - * `(sign | bucket)` packed digit. Inner-loop body for Stage 1 / Stage 4 — - * fully inlined. - * - * Takes the slice params as scalar value parameters rather than a struct reference - * so the compiler reliably holds them in registers across the inner loop. (Passing - * a const-ref to a small struct sometimes blocks the same hoisting an explicit - * unpack-then-pass guarantees; we saw exactly this regression with the variable-c - * split params before unpacking.) - * - * `slice_localised_to_one_u64` selects the single-load fast path: ~75% of windows - * on typical 254-bit scalars (window_bits ∈ [12, 18]) hit this. Because the slice - * params are loop-invariant within a window, the branch resolves once per inner- - * loop iter and the inner branch predictor pins it. - */ -[[nodiscard]] [[gnu::always_inline]] inline uint32_t get_constantine_packed_digit(const uint64_t* scalar_data, - uint32_t lo_limb, - uint32_t hi_limb, - uint32_t lo_off, - uint32_t lo_bits, - uint32_t lo_mask, - uint32_t hi_mask, - bool slice_localised_to_one_u64, - size_t window_bits) noexcept -{ - uint64_t raw_wide = 0; - if (slice_localised_to_one_u64) { - // Fast path: one load + shift + mask. hi_part vanishes (hi_mask == 0); skip it. - raw_wide = (scalar_data[lo_limb] >> lo_off) & lo_mask; - } else if (lo_mask == 0) { - // Bottom-window fast path: synthetic-zero lookback bit, so the lo_part contribution is - // always 0 (lo_mask == 0). Skip the lo limb load entirely. lo_bits == 1 here, so the - // shift plants the window_bits-bit slice at bits 1..window_bits with bit 0 = 0. 
- // sp_lo_mask is loop-invariant within a window but is a runtime stack value, so the - // compiler does NOT constant-fold the `(s_lo >> lo_off) & 0 = 0` path inside the - // boundary branch; this explicit check saves ~3 ALU ops per scalar on the bottom window. - raw_wide = (scalar_data[hi_limb] & hi_mask) << lo_bits; - } else { - // Slow path: window straddles a uint64 boundary. - const uint64_t s_lo = scalar_data[lo_limb]; - const uint64_t s_hi = scalar_data[hi_limb]; - const uint64_t lo_part = (s_lo >> lo_off) & lo_mask; - const uint64_t hi_part = (s_hi & hi_mask) << lo_bits; - raw_wide = lo_part | hi_part; - } - // raw fits in window_bits+1 ≤ 32 bits, safe to narrow. - const uint32_t raw = static_cast(raw_wide); - - // signedWindowEncoding(raw, window_bits). raw fits in window_bits+1 bits; bit - // `window_bits` is the sign indicator. - // - // The conditional-negate trick `((encode + neg_mask) ^ neg_mask)` is the standard - // branchless idiom. We use the equivalent `(encode - neg) ^ neg_mask` to break the - // latency chain: `encode - neg` and `neg_mask = -neg` can issue in parallel (both - // depend only on `neg` / `encode`), whereas `encode + neg_mask` first waits for - // `neg_mask` to materialise. Saves one cycle on the inner-loop critical path - // (neg → neg_mask → +neg_mask → ^neg_mask → &val_mask vs neg → {neg_mask, enc_neg} - // in parallel → ^neg_mask → &val_mask). Identical result by: - // neg=0: enc_neg = encode, xored = encode ^ 0 = encode. ✓ - // neg=1: enc_neg = encode−1, xored = (encode−1) ^ −1 = ~(encode−1) = −encode. ✓ - const uint32_t neg = (raw >> window_bits) & uint32_t{ 1 }; - const uint32_t neg_mask = uint32_t{ 0 } - neg; // 0 or 0xFFFFFFFF - const uint32_t val_mask = (uint32_t{ 1 } << window_bits) - 1; - const uint32_t encode = (raw + 1) >> 1; - const uint32_t bucket_idx = ((encode - neg) ^ neg_mask) & val_mask; - - // Pack into (sign | bucket). 
Stage 1 uses the bucket bits for histograms; Stage 4 - // stores only the sign bit because Stage 6 recovers bucket magnitude from bucket_start. - return (neg << 31) | bucket_idx; -} - -// 128-bit SIMD-friendly 4-wide variant of get_constantine_packed_digit. Computes 4 packed -// digits in parallel via GCC's vector_size extension, which lowers to native SIMD on x86 -// (SSE2), ARM (NEON), and WASM (wasm-simd128). The branch on slice path is hoisted from -// the per-call site to the per-window outer loop, so Stage 1 / Stage 4 callers select the -// localised / bottom / boundary specialisation once per window. -// -// We index the scalar via a `const uint32_t*` view rather than the natural `uint64_t*`: -// each lane is one uint32, so a 128-bit SIMD register holds 4 (raw, encode, bucket, …) -// values. `scalar.data` is a `std::array` whose byte layout is identical to -// `uint32_t[8]` on every target we ship to (x86 / ARM / WASM are all little-endian, and the -// codebase already assumes this layout in many places — `from_montgomery`, `uint256_t`, -// etc.). The reinterpret_cast is the same alias pattern. -// -// Returns the four packed digits in `out[0..3]`. The caller scatters them to the histogram -// (Stage 1) or schedule (Stage 4) individually, since the consuming write is a -// non-vectorisable scatter. Switching from 2-wide uint64 to 4-wide uint32 doubles the -// compute throughput per SIMD instruction at the cost of slightly more straddle hits (the -// "localised" fast-path rate drops from ~77 % to ~50 % at c=14), but compute dominates -// per-iter cost so the net win is positive. -using SimdU32x4 = uint32_t __attribute__((vector_size(16))); - -// Helpers return `SimdU32x4` directly so the v128 stays in the SIMD register file end-to-end. -// Wrapping in a 4-uint32 struct round-tripped the v128 through 4 scalar memory slots. 
- -// uint32-indexed Constantine slice params, mirroring `ConstantineSliceParams` but with -// limb indices measured in 32-bit (rather than 64-bit) chunks. Computed once per window in -// `compute_constantine_slice_params_u32`; consumed by the SIMD x4 helpers below. -struct ConstantineSliceParamsU32 { - uint32_t lo_mask; - uint32_t hi_mask; - uint32_t lo_limb; // u32 limb index of the lookback bit - uint32_t hi_limb; // == lo_limb + 1, clamped to last in-range u32 limb at the top window - uint32_t lo_off; // bit-offset of the lookback bit within `lo_limb` - uint32_t lo_bits; // # bits read from `lo_limb` (also acts as the hi_part left-shift amount) - bool slice_localised_to_one_u32; - bool is_bottom_window; -}; - -[[nodiscard]] inline ConstantineSliceParamsU32 compute_constantine_slice_params_u32(size_t bit_offset, - size_t window_bits, - size_t num_u32_limbs) noexcept -{ - constexpr size_t LIMB_BITS_U32 = 32; - ConstantineSliceParamsU32 sp; - if (bit_offset == 0) { - sp.lo_limb = 0; - sp.hi_limb = 0; - sp.lo_off = LIMB_BITS_U32 - 1; - sp.lo_bits = 1; - sp.lo_mask = 0; - sp.hi_mask = (uint32_t{ 1 } << window_bits) - 1; - sp.slice_localised_to_one_u32 = false; - sp.is_bottom_window = true; - } else { - const size_t lookback_bit = bit_offset - 1; - const size_t bits_to_read = window_bits + 1; - sp.lo_limb = static_cast(lookback_bit / LIMB_BITS_U32); - sp.lo_off = static_cast(lookback_bit & (LIMB_BITS_U32 - 1)); - const uint32_t in_lo = static_cast(LIMB_BITS_U32 - sp.lo_off); - sp.lo_bits = (in_lo < static_cast(bits_to_read)) ? in_lo : static_cast(bits_to_read); - const uint32_t hi_bits = static_cast(bits_to_read) - sp.lo_bits; - sp.lo_mask = (sp.lo_bits == LIMB_BITS_U32) ? 
~uint32_t{ 0 } : ((uint32_t{ 1 } << sp.lo_bits) - 1); - if (static_cast(sp.lo_limb) + 1 >= num_u32_limbs) { - sp.hi_limb = sp.lo_limb; - sp.hi_mask = 0; - } else { - sp.hi_limb = sp.lo_limb + 1; - sp.hi_mask = (uint32_t{ 1 } << hi_bits) - 1; - } - sp.slice_localised_to_one_u32 = (hi_bits == 0); - sp.is_bottom_window = false; - } - return sp; -} - -// Gather 4 disjoint uint32 values into one v128 via wasm v128.load32_lane. On WASM this -// is 1 splat + 3 load32_lane (4 ops); brace-init `{a, b, c, d}` with runtime values emits -// 4 scalar i32.load + 1 splat + 3 replace_lane (8 ops). On native it falls back to brace- -// init which clang lowers to NEON ins / SSE2 pinsrd. -[[nodiscard]] [[gnu::always_inline]] inline SimdU32x4 gather_x4_u32( - const uint32_t* p0, const uint32_t* p1, const uint32_t* p2, const uint32_t* p3, uint32_t idx) noexcept -{ -#ifdef __wasm_simd128__ - v128_t v = wasm_i32x4_splat(0); - v = wasm_v128_load32_lane(p0 + idx, v, 0); - v = wasm_v128_load32_lane(p1 + idx, v, 1); - v = wasm_v128_load32_lane(p2 + idx, v, 2); - v = wasm_v128_load32_lane(p3 + idx, v, 3); - return reinterpret_cast(v); -#else - return SimdU32x4{ p0[idx], p1[idx], p2[idx], p3[idx] }; -#endif -} - -// All four mask / constant v128s (lo_mask_v, hi_mask_v, one_v, val_mask) are loop-invariant -// within a window. Callers build them ONCE per window in the outer-w loop and pass them in, -// so the inner-i compute loop has zero v128.const / splat / shl+sub for the masks. -// `neg_mask = -neg` uses GCC vector-ext unary minus which lowers to `i32x4.neg` on WASM. -// -// Helpers write the v128 result DIRECTLY into the caller's stack buffer via an aligned -// `v128.store` (or equivalent on native). No return-by-value, no temporary, no memcpy — -// the v128 register flows from the bit-pack pipeline straight into the destination buffer. 
-[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_localised(uint32_t* dst, - const uint32_t* scalar_data_0, - const uint32_t* scalar_data_1, - const uint32_t* scalar_data_2, - const uint32_t* scalar_data_3, - uint32_t lo_limb, - uint32_t lo_off, - SimdU32x4 lo_mask_v, - SimdU32x4 one_v, - SimdU32x4 val_mask, - uint32_t window_bits) noexcept -{ - const SimdU32x4 lo = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, lo_limb); - const SimdU32x4 raw = (lo >> lo_off) & lo_mask_v; - const SimdU32x4 neg = (raw >> window_bits) & one_v; - const SimdU32x4 neg_mask = -neg; - const SimdU32x4 encode = (raw + one_v) >> 1; - const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask; - const SimdU32x4 packed = (neg << 31) | bucket; -#ifdef __wasm_simd128__ - wasm_v128_store(dst, reinterpret_cast(packed)); -#else - *reinterpret_cast(dst) = packed; -#endif -} +// Constantine signed-Booth window recoder (scalar + SIMD x4 paths) lives in +// pippenger_constantine.hpp. 
-[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_bottom(uint32_t* dst, - const uint32_t* scalar_data_0, - const uint32_t* scalar_data_1, - const uint32_t* scalar_data_2, - const uint32_t* scalar_data_3, - uint32_t hi_limb, - uint32_t lo_bits, - SimdU32x4 hi_mask_v, - SimdU32x4 one_v, - SimdU32x4 val_mask, - uint32_t window_bits) noexcept -{ - const SimdU32x4 hi = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, hi_limb); - const SimdU32x4 raw = (hi & hi_mask_v) << lo_bits; - const SimdU32x4 neg = (raw >> window_bits) & one_v; - const SimdU32x4 neg_mask = -neg; - const SimdU32x4 encode = (raw + one_v) >> 1; - const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask; - const SimdU32x4 packed = (neg << 31) | bucket; -#ifdef __wasm_simd128__ - wasm_v128_store(dst, reinterpret_cast(packed)); -#else - *reinterpret_cast(dst) = packed; -#endif -} - -[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_boundary(uint32_t* dst, - const uint32_t* scalar_data_0, - const uint32_t* scalar_data_1, - const uint32_t* scalar_data_2, - const uint32_t* scalar_data_3, - uint32_t lo_limb, - uint32_t hi_limb, - uint32_t lo_off, - uint32_t lo_bits, - SimdU32x4 lo_mask_v, - SimdU32x4 hi_mask_v, - SimdU32x4 one_v, - SimdU32x4 val_mask, - uint32_t window_bits) noexcept -{ - const SimdU32x4 lo = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, lo_limb); - const SimdU32x4 hi = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, hi_limb); - const SimdU32x4 lo_part = (lo >> lo_off) & lo_mask_v; - const SimdU32x4 hi_part = (hi & hi_mask_v) << lo_bits; - const SimdU32x4 raw = lo_part | hi_part; - const SimdU32x4 neg = (raw >> window_bits) & one_v; - const SimdU32x4 neg_mask = -neg; - const SimdU32x4 encode = (raw + one_v) >> 1; - const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask; - const SimdU32x4 packed = (neg << 31) | bucket; -#ifdef __wasm_simd128__ - 
wasm_v128_store(dst, reinterpret_cast(packed)); -#else - *reinterpret_cast(dst) = packed; -#endif -} - -// Path-selector enum (used by Stage 1 / Stage 4 to dispatch on the SIMD specialisation -// once per window rather than once per scalar). -enum class ConstantineSlicePath : uint8_t { - Localised = 0, - Bottom = 1, - Boundary = 2, -}; - -[[nodiscard]] [[gnu::always_inline]] inline ConstantineSlicePath classify_slice_path_u32( - const ConstantineSliceParamsU32& sp) noexcept -{ - if (sp.is_bottom_window) { - return ConstantineSlicePath::Bottom; - } - if (sp.slice_localised_to_one_u32) { - return ConstantineSlicePath::Localised; - } - return ConstantineSlicePath::Boundary; -} - -[[nodiscard]] inline uint32_t choose_window_bits( - size_t num_points, size_t num_bits, size_t n_input, size_t num_logical_threads, bool use_rebalance) noexcept -{ - constexpr uint32_t MAX_C = 20; - uint32_t best = 2; - -#ifdef __wasm__ - // Closed-form for wasm: c = ⌊log2(num_points / target_load)⌋ + 1, where target_load is - // num_logical_threads × 2/3 above n_input=4096 and × 1/3 below — the per-bucket density - // that keeps the batched-affine drains amortised in each regime. - static_cast(num_bits); - static_cast(use_rebalance); - const size_t target_load = (n_input > 4096) ? (num_logical_threads * 2 / 3) : (num_logical_threads / 3); - if (target_load == 0 || num_points <= target_load) { - best = 2; - } else { - // ⌊log2(num_points / target_load)⌋ + 1 - const size_t ratio = num_points / target_load; - const uint32_t lg = static_cast(numeric::get_msb(ratio)); - best = lg + 1; - if (best < 2) { - best = 2; - } else if (best >= MAX_C) { - best = MAX_C - 1; - } - } -#else - // Native: linear cost model `cost = rounds · (n + 15·B)` with BUCKET_ACC_COST=15. - // The closed-form WASM formula above has not been recalibrated for native — keep the - // existing native model until that's done. 
- static_cast(n_input); - static_cast(num_logical_threads); - static_cast(use_rebalance); - uint64_t best_cost = static_cast(-1); - for (uint32_t window_bits = 2; window_bits < MAX_C; ++window_bits) { - const uint64_t rounds = (num_bits + 2 + window_bits - 1) / window_bits; - const uint64_t buckets = (uint64_t{ 1 } << (window_bits - 1)) + 1; - const uint64_t n = num_points; - constexpr uint64_t BUCKET_ACC_COST = 15; - const uint64_t cost = rounds * (n + (buckets * BUCKET_ACC_COST)); - if (cost < best_cost) { - best_cost = cost; - best = window_bits; - } - } -#endif - - return best; -} - -// Variable-window-bits Pippenger schedule. SPLIT mode covers bits [0, b_star) with `window_bits_lo` -// windows iterated by every non-zero scalar, and bits [b_star, NUM_BITS) with -// `window_bits_hi < window_bits_lo` windows iterated by `idx_large` only (scalars whose msb sits in -// the upper region). NO_SPLIT mode is a single region of uniform window-bits. -inline constexpr size_t VAR_WINDOW_MAX_WINDOWS = 128; - -// Above this N, GLV's 2× point-count cost outweighs the windows-halved benefit. The -// crossover is platform-specific: WASM keeps GLV up to 2^16 (V8/wasmtime's branchless -// bias-decode is slow enough that halving num_windows still pays at large N), while -// native's faster decode makes the 2× point-count dominate above 2^13. Empirically -// calibrated against chonk-prove fixtures — see the call sites for the original sweep -// notes. -#ifdef __wasm__ -inline constexpr size_t GLV_SMALL_N_THRESHOLD = size_t{ 1 } << 16; -#else -inline constexpr size_t GLV_SMALL_N_THRESHOLD = size_t{ 1 } << 13; -#endif +// `choose_window_bits` and `build_var_window_schedule` are defined inline in +// `pippenger_arena_layout.hpp` so the test suite can build identical schedules. +// `VAR_WINDOW_MAX_WINDOWS` and `VariableWindowSchedule` likewise live there. // Sentinel value for `msb_per_scalar[i]` when scalar i is zero. 
uint8_t fits the 254 valid msb // positions (0..253) plus this sentinel; matching `msb_hist` bin layout uses bin 0 = zero count @@ -593,7 +174,8 @@ inline constexpr uint32_t DEDUP_INVALID_EXTRA = ~uint32_t{ 0 }; // branch never fires anyway (the end-of-loop drain catches the residue). Keeping it // constexpr lets the compiler turn the per-iter `if (pair_count >= BATCH_CAPACITY)` into // a compare-against-immediate and fold the drain-trigger condition into the loop shape. -inline constexpr size_t BATCH_CAPACITY = 256; +// `BATCH_CAPACITY` is defined in `pippenger_arena_layout.hpp` so the layout struct can +// reference it without depending on this TU. inline int msb_of_2limb(uint64_t lo, uint64_t hi) noexcept { @@ -630,251 +212,10 @@ inline void record_msb(int msb, uint8_t& dst, std::array& th_hist ++th_hist[static_cast(msb) + 1]; } -struct VariableWindowSchedule { - size_t W_lo = 0; // # of lower windows (use window_bits_lo) - size_t W_hi = 0; // # of upper windows (use window_bits_hi); 0 → NO_SPLIT - size_t num_windows = 0; // = W_lo + W_hi - std::array window_bits_per_window{}; // window_bits_w for each w - std::array bit_base{}; // B_w = Σ_{k num_buckets{}; // 2^(window_bits_w - 1) + 1 -}; - -// One window range. The driver iterates each region's windows in batches. Bundles the -// per-region numerics that bind the for-loop bounds + the lambda call args; the per-region -// msb-filter behaviour is selected via the `bool is_upper` argument to run_batch (kept as -// a separate flag for codegen reasons — clang constant-folds the literal `false` / `true` -// at the call site through the inlined lambda body, eliding the upper-only branch from the -// lower region's hot loops; a `uint8_t threshold` field on this struct does not get the -// same treatment and costs ~6% Stage 6a wall on chonk). 
-struct RegionView { - size_t window_start = 0; // first window index in the global schedule - size_t window_count = 0; // # of windows owned by this region - size_t window_bits_R = 0; // typical c (matches window_bits_per_window for all but possibly the last window) - size_t B_R = 0; // typical bucket count = (1 << (window_bits_R - 1)) + 1 - size_t capacity_R = 0; // schedule capacity per window (= n for lower, n_large for upper) - size_t n_iter = 0; // # of scalar indices iterated (= n for both regions post-C2) - size_t windows_per_batch = 0; -}; - -inline size_t optimal_window_bits_for(size_t n_points, - size_t num_bits, - size_t n_input, - size_t num_logical_threads) noexcept -{ - return static_cast( - choose_window_bits(n_points, num_bits, n_input, num_logical_threads, /*use_rebalance=*/true)); -} - -inline uint64_t predict_schedule_cost( - size_t n, size_t n_large, size_t W_lo, size_t W_hi, size_t window_bits_lo, size_t window_bits_hi, size_t T) noexcept -{ - // ALPHA_PER_WINDOW bills the per-window parallel-for dispatch + barrier overhead. - // Without it, the model under-penalises split shapes with many narrow upper windows - // (e.g. W_hi=57 / window_bits_hi=2 against a tiny n_large), which would regress real wall. - // - // Trivial-stride penalty: when B <= 2T+1, recursive_affine_bucket_reduce_strided - // short-circuits to per-window Jacobian and gives up cross-window batched-affine - // inversion amortisation. Per-pair work is similar but the per-window fixed cost - // (chunk_infos check, is_present scan, dispatch) dominates when each task only has - // 1-2 buckets per window. Bill 1.6× the bucket cost in that regime. 
- constexpr uint64_t ALPHA_SCAN = 1; - constexpr uint64_t ALPHA_BUCKET = 4; - constexpr uint64_t ALPHA_PER_WINDOW = 256; - constexpr uint64_t TRIVIAL_STRIDE_PENALTY_NUM = 8; // 1.6× - constexpr uint64_t TRIVIAL_STRIDE_PENALTY_DEN = 5; - auto bucket_cost_with_penalty = [T](size_t W, size_t window_bits) -> uint64_t { - if (W == 0) { - return 0; - } - const uint64_t B = (uint64_t{ 1 } << (window_bits - 1)) + 1; - const uint64_t base = static_cast(W) * B; - // Trivial-stride threshold: stride = next_pow2(⌈(B-1)/T⌉) ≤ 2 ⇔ B - 1 ≤ T (after the - // ceiling) ⇔ B ≤ T + 1 to give stride 1, or B ≤ 2T to give stride 2. The actual cutoff - // uses next_pow2 rounding: ⌈(B-1)/T⌉ ≤ 2 means (B-1) ≤ 2T, so B ≤ 2T + 1. - if (B <= 2 * static_cast(T) + 1) { - return (base * TRIVIAL_STRIDE_PENALTY_NUM) / TRIVIAL_STRIDE_PENALTY_DEN; - } - return base; - }; - const uint64_t scan_lo = static_cast(n) * W_lo; - const uint64_t scan_hi = static_cast(n_large) * W_hi; - const uint64_t scan = scan_lo + scan_hi; - const uint64_t bucket_lo = bucket_cost_with_penalty(W_lo, window_bits_lo); - const uint64_t bucket_hi = bucket_cost_with_penalty(W_hi, window_bits_hi); - const uint64_t bucket = T * (bucket_lo + bucket_hi); - const uint64_t per_window = T * ALPHA_PER_WINDOW * (W_lo + W_hi); - return (ALPHA_SCAN * scan) + (ALPHA_BUCKET * bucket) + per_window; -} - /** - * @brief Pick (b_star, window_bits_lo, window_bits_hi) for SPLIT mode. Returns is_split=false when no candidate - * on the bit-position grid beats the unsplit cost by enough margin (predicted ≤ 85% of - * unsplit) to clear the cost-model's residual variance. + * @brief Build a uniform window schedule. 
*/ -struct VariableWindowSplitDecision { - bool is_split = false; - size_t b_star = 0; - size_t window_bits_lo = 0; - size_t window_bits_hi = 0; -}; - -inline VariableWindowSplitDecision choose_var_window_split(const std::array& msb_hist, - size_t n, - size_t num_bits, - size_t n_input, - size_t num_logical_threads) noexcept -{ - VariableWindowSplitDecision out{}; - if (n == 0 || num_bits == 0 || num_bits > 254) { - return out; - } - // msb_hist bin layout: bin 0 = zero-scalar count, bin (k+1) = scalars with msb == k. - auto cdf_ge = [&](size_t b) -> uint64_t { - uint64_t s = 0; - const size_t lo = std::min(b + 1, 256); - for (size_t i = lo; i < 256; ++i) { - s += msb_hist[i]; - } - return s; - }; - // idx_large includes scalars with msb >= b - 1 (the boundary bit needs to be included so - // the upper region cancels the negative-signed digit the lower region's last window emits). - // The cost model must see the same n_large the runtime will iterate. - auto cdf_ge_boundary = [&](size_t b) -> uint64_t { - const size_t bb = (b == 0) ? 
0 : b - 1; - return cdf_ge(bb); - }; - const uint64_t n_active_u = static_cast(n) - msb_hist[0]; - const size_t window_bits_unsplit = optimal_window_bits_for(n, num_bits, n_input, num_logical_threads); - const size_t W_unsplit = (num_bits + 2 + window_bits_unsplit - 1) / window_bits_unsplit; - const uint64_t cost_unsplit = - predict_schedule_cost(n, 0, W_unsplit, 0, window_bits_unsplit, window_bits_unsplit, num_logical_threads); - - uint64_t best_cost = cost_unsplit; - size_t best_b = 0; - size_t best_window_bits_lo = 0; - size_t best_window_bits_hi = 0; - bool found = false; - - static constexpr std::array SPLIT_GRID = { 16, 32, 48, 64, 80, 96, 112, - 128, 144, 160, 176, 192, 208, 224 }; - for (size_t b : SPLIT_GRID) { - if (b == 0 || b >= num_bits) { - continue; - } - const uint64_t n_large_u = cdf_ge_boundary(b); - if (n_large_u >= n_active_u) { - continue; - } - const uint64_t n_small_active_u = n_active_u - n_large_u; - if (n_large_u == 0 || n_small_active_u == 0) { - continue; - } - // The upper region must be the minority population, the lower region must hold at - // least 10% of n, and the upper region must have enough scalars (≥ 64 absolute and - // ≥ 5% of n_active) to amortise its per-window dispatch overhead. - if (n_large_u * 2 > static_cast(n)) { - continue; - } - if (n_small_active_u * 10 < static_cast(n)) { - continue; - } - constexpr uint64_t MIN_N_LARGE_ABS = 64; - if (n_large_u < MIN_N_LARGE_ABS || n_large_u * 20 < n_active_u) { - continue; - } - // window_bits_lo's bit budget must drop materially below baseline (≥ 32 bits left for the - // upper region) for the split to be worth considering. 
- if (b + 32 > num_bits) { - continue; - } - const size_t window_bits_lo = optimal_window_bits_for(n, b, n_input, num_logical_threads); - const size_t window_bits_hi = - optimal_window_bits_for(static_cast(n_large_u), num_bits - b, n_input, num_logical_threads); - if (window_bits_lo == 0 || window_bits_hi == 0 || window_bits_hi >= window_bits_lo) { - continue; - } - const size_t W_lo = (b + window_bits_lo - 1) / window_bits_lo; - const size_t W_hi = ((num_bits - b) + window_bits_hi - 1) / window_bits_hi; - if (W_lo + W_hi > VAR_WINDOW_MAX_WINDOWS) { - continue; - } - const uint64_t cost = predict_schedule_cost( - n, static_cast(n_large_u), W_lo, W_hi, window_bits_lo, window_bits_hi, num_logical_threads); - if (cost < best_cost) { - best_cost = cost; - best_b = b; - best_window_bits_lo = window_bits_lo; - best_window_bits_hi = window_bits_hi; - found = true; - } - } - - // Require the predicted SPLIT cost to be ≤ 85% of unsplit, so marginal candidates inside - // the cost-model's residual variance don't fire. - if (!found || best_cost * 100 > cost_unsplit * 85) { - return out; - } - out.is_split = true; - out.b_star = best_b; - out.window_bits_lo = best_window_bits_lo; - out.window_bits_hi = best_window_bits_hi; - return out; -} - -/** - * @brief Build a VariableWindowSchedule from the split decision (or NO_SPLIT default with uniform c). - * For NO_SPLIT, all `num_windows` windows use the unsplit c; W_lo = num_windows, W_hi = 0. - * For SPLIT, the lower region uses window_bits_lo for all windows except possibly the last (which - * gets the remainder b_star - (W_lo - 1) * window_bits_lo); upper region similarly. 
- */ -inline VariableWindowSchedule build_var_window_schedule(const VariableWindowSplitDecision& decision, - size_t num_bits, - size_t window_bits_unsplit) noexcept -{ - VariableWindowSchedule sched{}; - - auto fill_region = [&](size_t bits_in_region, size_t window_bits_R, size_t out_offset) -> size_t { - size_t bits_remaining = bits_in_region; - size_t w = out_offset; - size_t bit_offset = (w == 0) ? 0 : sched.bit_base[w - 1] + sched.window_bits_per_window[w - 1]; - while (bits_remaining > 0) { - const size_t window_bits_w = std::min(window_bits_R, bits_remaining); - sched.bit_base[w] = static_cast(bit_offset); - sched.window_bits_per_window[w] = static_cast(window_bits_w); - sched.num_buckets[w] = static_cast((size_t{ 1 } << (window_bits_w - 1)) + 1); - bit_offset += window_bits_w; - bits_remaining -= window_bits_w; - ++w; - if (w >= VAR_WINDOW_MAX_WINDOWS) { - break; - } - } - return w - out_offset; - }; - - if (!decision.is_split) { - // NUM_BITS + 2 to match the existing num_windows formula (+2 accommodates the carry-less - // top bit of the Constantine recoder). - const size_t total_bits = num_bits + 2; - sched.W_lo = fill_region(total_bits, window_bits_unsplit, /*out_offset=*/0); - sched.W_hi = 0; - } else { - // Split region has b_star covered by window_bits_lo; remaining (num_bits + 2 - b_star) by window_bits_hi. - const size_t total_bits = num_bits + 2; - const size_t lower_bits = std::min(decision.b_star, total_bits); - sched.W_lo = fill_region(lower_bits, decision.window_bits_lo, /*out_offset=*/0); - const size_t upper_bits = total_bits - lower_bits; - if (upper_bits > 0) { - sched.W_hi = fill_region(upper_bits, decision.window_bits_hi, /*out_offset=*/sched.W_lo); - } - } - sched.num_windows = sched.W_lo + sched.W_hi; - return sched; -} - -// Forward declaration of AffineBucketChunkInfo so ThreadScratch can hold a vector of them. -struct AffineBucketChunkInfo; +// `AffineBucketChunkInfo` is defined in `pippenger_arena_layout.hpp` (included above). 
/** * @brief Per-thread scratch: VIEWS into the per-MSM arena. Each `std::span` is rebound at @@ -929,6 +270,45 @@ template struct ThreadScratch { std::span chunk_infos; }; +struct MsmArena { + std::unique_ptr local_owner; // NOLINT(cppcoreguidelines-avoid-c-arrays) + std::byte* data = nullptr; + uintptr_t base_addr = 0; + size_t capacity = 0; + size_t cursor = 0; + + MsmArena(size_t required_bytes, std::span external_arena) + { + if (!external_arena.empty() && required_bytes <= external_arena.size()) { + data = external_arena.data(); + capacity = external_arena.size(); + } else { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays) + local_owner = std::make_unique_for_overwrite(required_bytes); + data = local_owner.get(); + capacity = required_bytes; + } + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) + base_addr = reinterpret_cast(data); + } + + template std::span alloc(size_t count) { return bump_alloc(count, cursor, capacity, 0); } + + template std::span bump_alloc(size_t count, size_t& local_cursor, size_t bound, size_t base_offset) + { + const size_t align = alignof(T); + const uintptr_t cur_addr = base_addr + base_offset + local_cursor; + const uintptr_t aligned_addr = (cur_addr + align - 1) & ~(uintptr_t{ align } - 1); + const size_t aligned_local = static_cast(aligned_addr - (base_addr + base_offset)); + const size_t bytes = count * sizeof(T); + BB_ASSERT_LTE(aligned_local + bytes, bound); + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) + T* p = reinterpret_cast(data + base_offset + aligned_local); + local_cursor = aligned_local + bytes; + return std::span{ p, count }; + } +}; + template inline void drain_batch(ThreadScratch& s, size_t pair_count) noexcept { if (pair_count == 0) { @@ -1132,36 +512,14 @@ void reduce_chunk(ThreadScratch& s, tree_reduce_in_place(s, valid_len); } -/** - * @brief Per-window outputs of Stage 6 bucket accumulation (tree reduce + recursive affine bucket reduction). 
- * - * R / L are group elements; `lo` / `hi` are the lowest/highest non-empty digit in the - * chunk; `empty == 1` iff the chunk had no non-empty digits. - */ -template struct ChunkOutput { - typename Curve::Element R{}; - typename Curve::Element L{}; - uint32_t lo = 0; - uint32_t hi = 0; - uint8_t empty = 1; -}; +// `ChunkOutput` (Stage 6 per-chunk bucket-reduce output) is defined in +// `pippenger_arena_layout.hpp` so the test suite can size the Zone S slot the +// same way the live allocator does. -/** - * @brief Round-trip cell describing one chunk's contribution to the cross-window - * recursive affine bucket reduction. Filled by the densification loop and consumed - * by the four phases. - * - * `lo`, `hi` = lowest / highest non-empty digit in the chunk (inclusive). - * `buckets_padded` = next power of two ≥ (hi - lo + 1); the chunk's dense bucket layout has - * exactly this many slots, indexed 0..buckets_padded-1 (slot i = digit lo + i). - * `empty` = 1 iff the chunk had no entries (len == 0); the algorithm skips it entirely. - */ -struct AffineBucketChunkInfo { - uint32_t lo = 0; - uint32_t hi = 0; - uint32_t buckets_padded = 0; - uint8_t empty = 1; -}; +// `AffineBucketChunkInfo` is defined in `pippenger_arena_layout.hpp` (forward declared +// above at line ~674 for ThreadScratch). It describes one chunk's contribution to the +// cross-window recursive affine bucket reduction (lo/hi digit bounds, buckets_padded, +// empty flag). /** * @brief Inline filter for one (dst, src) candidate pair, called from each phase's @@ -1911,14 +1269,8 @@ inline size_t dedup_tree_reduce_in_place(typename Curve::AffineElement* pts, // All phases ≤ 4 MB regardless of input shape. The caps degrade gracefully: when hit // we leave un-deduped scalars on the standard pippenger path (still correct, just // less savings). 
-inline constexpr size_t DEDUP_MAX_CLUSTERS = 16384; // extra_points ≤ 1 MB -inline constexpr size_t DEDUP_MAX_MEMBERS = 32768; // total cluster member rows -// Phase A's chunked tree-reduce limit. Capped at SUBCHUNK_ENTRIES_CAP so the per-worker -// chunk_pts/chunk_ids slab matches the Stage 6a per-worker scratch and overlaps cleanly -// in later arena-layout phases. Outer-loop iteration count rises ~4× vs the historical -// 8192 cap, but the dominant amortisation (inside tree_reduce over BATCH_CAPACITY=256) -// is unaffected. -inline constexpr size_t DEDUP_MAX_CHUNK_MEMBERS = 2048; // chunk_pts ≤ 128 KB during tree-reduce +// `DEDUP_MAX_CLUSTERS`, `DEDUP_MAX_MEMBERS`, and `DEDUP_MAX_CHUNK_MEMBERS` are defined +// in `pippenger_arena_layout.hpp` so the test harness can size the matching slabs. static_assert(DEDUP_MAX_CLUSTERS <= size_t{ SCHEDULE_INDEX_MASK } + 1, "dedup extra-point ids must fit in the schedule payload"); @@ -2177,12 +1529,11 @@ size_t dedup_phase_a_worker_hash(const uint32_t* schedule_w0, } } // MSM::PhaseA/cluster_scan - // After the per-bucket loop we have `clusters_opened` total clusters, each - // with members in cluster_members starting at cluster_offsets[k]. The - // cluster_offsets vector already contains end-offsets for each cluster - // because we push_back'd at promotion time. Convert it into the same - // [start, end) layout the existing tree-reduce + publish loops expect. - const size_t num_clusters = clusters_opened; + // Only flattened clusters are published. `clusters_opened` counts every promoted + // hash-table singleton, including clusters later skipped because cluster_members_cap + // would be exceeded. Skipped clusters intentionally fall through the normal Pippenger + // path because they never get redirect_lookup entries. 
+ const size_t num_clusters = cluster_offsets_size - 1; if (num_clusters == 0) { return 0; } @@ -2326,6 +1677,11 @@ template } } // namespace + +// PerWorkerArenaLayout (and its dependencies BATCH_CAPACITY, DEDUP_MAX_CHUNK_MEMBERS, +// AffineBucketChunkInfo) lives in `pippenger_arena_layout.hpp`. Used by the sizer +// below, the live allocator in `pippenger_round_parallel`, and the arena-layout +// regression test. } // namespace round_parallel_detail /** @@ -2456,12 +1812,10 @@ typename Curve::Element trivial_msm_threaded(PolynomialSpan -inline size_t compute_arena_bytes_for_msm(size_t n_input, - bool external_glv_provided, - bool dedup_active = false) noexcept +size_t compute_arena_bytes_for_msm(size_t n_input, bool external_glv_provided, bool dedup_active) noexcept { using ScalarField = typename Curve::ScalarField; constexpr size_t FULL_NUM_BITS = ScalarField::modulus.get_msb() + 1; @@ -2500,121 +1854,42 @@ inline size_t compute_arena_bytes_for_msm(size_t n_input, constexpr size_t BATCH_MEM_BUDGET = 32ULL * 1024ULL * 1024ULL; - const size_t dense_stride_est = std::max( - 2, std::bit_ceil((num_buckets > 1) ? ((num_buckets - 1 + num_threads - 1) / num_threads) : size_t{ 1 })); - const size_t bucket_partials_per_window_max = (num_buckets > 0) ? (num_buckets - 1 + num_threads - 1) : 0; - // num_threads sizes the per-task arrays; worker_total sizes the per-OS-thread scratch // (FIFO-shared by every task that lands on that OS thread). const size_t worker_total_for_budget = num_threads; - // HIST slot — overlays two non-coexisting lifetime classes within one byte slab per - // window: - // H (S1-S4): digit_cursors - // O (S6b-S7): chunk_outputs + window_partial_sums - // H is dead before O is born (Stage 4 cursor ends before Stage 6b first writes - // chunk_outputs / window_partial_sums). Slot per-window = max(H, O). 
- // - // D-class (bucket_partials_dense + bucket_partials_present) used to overlay this - // slot at the D-region offset, but a 10× interleaved WASM Chonk bench showed Stage 6a - // regressed +1.29% (t=+58) because of L1-cache aliasing on the - // `dense[slot]/present[slot]` writes when D sat at the HIST-overlaid offset (see trace - // report at https://gist.github.com/AztecBot/8cc506ff429bdf5104fa02104c0e731b). D-class - // now has its own dedicated Zone-S slot below. - // - // Match the tight calc in `pippenger_round_parallel` (which uses B_eff); here - // num_buckets is the conservative upper bound on B_eff before the SPLIT decision. - // `digit_cursors` is a single per-(w, t, d) uint32 buffer that holds three roles - // across epoch H: Stage 1 fills it with bucket counts, Stage 2 overwrites each slot - // with that bucket's exclusive prefix-sum offset, and Stage 4 advances each (w, t) - // slice in place as its scatter cursor. One buffer, three meanings — bytes are not - // duplicated. Stage 2 also writes each digit's per-window total directly into - // bucket_start_all[w][d+1] (its own Zone S slot, sized as B_eff+1 per window), so - // Stage 3 can prefix-sum in place without a separate bucket_total_counts buffer. - const size_t hist_h_bytes_pw = (size_t{ 4 } * num_threads * num_buckets); // digit_cursors - const size_t hist_o_bytes_pw = (sizeof(round_parallel_detail::ChunkOutput) * num_threads) // chunk_outputs - + (size_t{ 96 } * num_threads); // window_partial_sums - const size_t hist_slot_bytes_pw = std::max(hist_h_bytes_pw, hist_o_bytes_pw); - // DENSE slot — dedicated Zone-S slot for the D-class buffers, isolated from the HIST - // slot's offset to avoid the L1 alias hot-spot on Stage 6a scatter writes. 
- const size_t dense_slot_bytes_pw = - (size_t{ 65 } * bucket_partials_per_window_max); // bucket_partials_dense + bucket_partials_present - - const size_t per_window_bytes = - (size_t{ 4 } * n) // schedule - + hist_slot_bytes_pw // HIST slot (H ∪ O) - + dense_slot_bytes_pw // DENSE slot (D) - + (size_t{ 8 } * (num_buckets + 1)) // bucket_start_all - + (size_t{ 8 } * (num_threads + 1)) // chunk_start_all - + (size_t{ 8 } * (num_threads + 1)) // chunk_bucket_lo_all - + (size_t{ 8 } * num_threads) // chunk_bucket_hi_all - + (size_t{ 8 } * num_threads) // orig_thread_lo - + (size_t{ 8 } * num_threads) // orig_thread_hi - + (size_t{ 16 } * worker_total_for_budget) // chunk_infos (per-OS-thread) - + (size_t{ 8 } * num_threads) // bucket_partials_offsets - + (size_t{ 87 } * worker_total_for_budget * dense_stride_est); // s.dense_buckets + aux - - // Per-thread overflow scratch: bounded above by ceil(max_chunk_len / SUBCHUNK_CAP) - // entries, each holding a uint32_t slot index + an AffineElement. + const size_t dense_stride_est = round_parallel_detail::compute_dense_stride(num_buckets, num_threads); + + // Pre-schedule conservative per-window cost: uses `num_buckets` (= 2^(c-1)+1) as the + // B upper bound. The lambda below recomputes once the actual schedule is built. 
+ const size_t per_window_bytes = round_parallel_detail::compute_per_window_bytes( + num_threads, num_buckets, n, dense_stride_est, worker_total_for_budget); + constexpr size_t SUBCHUNK_ENTRIES_CAP_LOCAL = 2048; - const size_t global_max_chunk_len = (n + num_threads - 1) / num_threads; const size_t global_max_overflow_per_window = - (global_max_chunk_len + SUBCHUNK_ENTRIES_CAP_LOCAL - 1) / SUBCHUNK_ENTRIES_CAP_LOCAL; - const size_t per_thread_overflow_bytes = (size_t{ 4 } + size_t{ 64 }) * global_max_overflow_per_window; - - constexpr size_t PER_THREAD_CHUNK_CAPACITY_BYTES = - (size_t{ 2048 } * size_t{ 64 }) + (size_t{ 2048 } * size_t{ 4 }) + - (size_t{ 2 } * size_t{ 256 } * size_t{ 64 }) + (size_t{ 256 } * size_t{ 32 }) + (size_t{ 256 } * size_t{ 4 }) + - size_t{ 328 }; - - // Phase 1 prologue bytes that live in the per-MSM arena (rather than on the heap): - // - msb_per_scalar : n bytes - // - glv_scalars_storage : n * 32 bytes (when use_glv) - // - glv_points_storage : n * 64 bytes (when use_glv && inline-doubling path) - // - per_thread_msb_hist : profile_threads * 1024 bytes (256 * uint32_t per thread) - // The PhaseA scratch slab (one per worker) is only allocated when dedup is active. - // See `pippenger_round_parallel` for the mirrored allocation site. + round_parallel_detail::compute_global_max_overflow_per_window(n, num_threads, SUBCHUNK_ENTRIES_CAP_LOCAL); + const bool inline_glv_double = use_glv && !external_glv_provided; const size_t profile_threads = std::max(1, bb::get_num_cpus()); - const size_t phase_one_prologue_bytes = n // msb_per_scalar - + (use_glv ? size_t{ 32 } * n : size_t{ 0 }) // glv_scalars_storage - + (inline_glv_double ? size_t{ 64 } * n : size_t{ 0 }) // glv_points_storage - + (profile_threads * size_t{ 1024 }); // per_thread_msb_hist - - // Per-worker PhaseA scratch slab (only allocated when dedup_active). 
Each cap is - // documented at the PhaseAScratch struct definition; they collectively cap the - // worst-case worker working set at ~160 KiB so the slab overlaps cleanly with the - // Stage 6a per-worker scratch in later arena-layout phases. - constexpr size_t PHASE_A_DIRTY_SLOTS_CAP = 4096; // HT_SIZE - constexpr size_t PHASE_A_BUCKET_REP_CAP = 256; // loose cap - constexpr size_t PHASE_A_STAGED_CAP = 1024; // loose cap - constexpr size_t PHASE_A_CHUNK_CAP = round_parallel_detail::DEDUP_MAX_CHUNK_MEMBERS; - // Per-worker cluster_members cap: n is a hard upper bound on cluster_members across - // all workers (each scalar contributes to at most one cluster_member entry), so - // min(DEDUP_MAX_MEMBERS, n) is exact and tighter than the constant for small-n MSMs. - // The publish-flatten step enforces this cap algorithmically: clusters that would - // overflow are skipped and fall through to the standard Stage 4/6a path with their - // original signed digits. - const size_t phase_a_cluster_members_cap = std::min(round_parallel_detail::DEDUP_MAX_MEMBERS, n); - // Per-worker cluster_offsets cap: clusters_opened is hard-capped at - // cids_per_thread = DEDUP_MAX_CLUSTERS / num_threads per worker; cluster_offsets - // holds clusters_opened + 1 entries. The +2 covers the leading-zero sentinel and - // the post-last terminator slot. 
- const size_t phase_a_cluster_offsets_cap = (round_parallel_detail::DEDUP_MAX_CLUSTERS / num_threads) + 2; - const size_t phase_a_per_worker_bytes = (size_t{ 4 } * phase_a_cluster_members_cap) // cluster_members (uint32) - + (size_t{ 4 } * phase_a_cluster_offsets_cap) // cluster_offsets (uint32) - + (size_t{ 2 } * PHASE_A_DIRTY_SLOTS_CAP) // dirty_slots (uint16) - + (size_t{ 4 } * PHASE_A_BUCKET_REP_CAP) // bucket_rep (uint32) - + (size_t{ 8 } * PHASE_A_STAGED_CAP) // staged (pair) - + (sizeof(typename Curve::AffineElement) * PHASE_A_CHUNK_CAP) // chunk_pts - + (size_t{ 4 } * PHASE_A_CHUNK_CAP); // chunk_ids - - // Zone W per-worker UNION: ThreadScratch's wpb-independent fields and PhaseAScratch - // overlay the SAME per-worker bytes (Stage 6a, Stage 6b, and Phase A run in disjoint - // parallel_for invocations on each worker). The union size is the max of either layout, - // not the sum — see the Arena zone layout block in `pippenger_round_parallel`. - const size_t ts_fixed_bytes = PER_THREAD_CHUNK_CAPACITY_BYTES + per_thread_overflow_bytes; - const size_t worker_union_bytes = - dedup_active ? std::max(ts_fixed_bytes, phase_a_per_worker_bytes) : ts_fixed_bytes; + const size_t phase_one_prologue_bytes = + round_parallel_detail::compute_phase_one_prologue_bytes(n, use_glv, inline_glv_double, profile_threads); + + const auto phase_a_caps = round_parallel_detail::compute_phase_a_caps(n, num_threads); + const size_t phase_a_cluster_members_cap = phase_a_caps.members_cap; + const size_t phase_a_cluster_offsets_cap = phase_a_caps.offsets_cap; + + // Zone W per-worker UNION via the canonical layout walk. Stage 6a, Stage 6b, and + // Phase A overlay the same per-worker bytes; the struct returns the max-of-layouts + // (the Stage 6 wpb-dependent tail is added below once `windows_per_batch` is known). + // Passing `windows_per_batch = 0` here skips the tail — we only need the union bytes + // for the fixed_overhead → wpb solve. 
+ const round_parallel_detail::PerWorkerArenaLayout union_layout(/*chunk_capacity=*/SUBCHUNK_ENTRIES_CAP_LOCAL, + global_max_overflow_per_window, + dedup_active, + phase_a_cluster_members_cap, + phase_a_cluster_offsets_cap, + /*windows_per_batch=*/0, + /*dense_stride_est=*/0); + const size_t worker_union_bytes = union_layout.per_worker_union_bytes; const size_t fixed_overhead = (worker_union_bytes * worker_total_for_budget) + (size_t{ 96 } * round_parallel_detail::VAR_WINDOW_MAX_WINDOWS) // window_sums_storage @@ -2622,29 +1897,39 @@ inline size_t compute_arena_bytes_for_msm(size_t n_input, + phase_one_prologue_bytes; // wpb fallback when fixed_overhead has eaten the BATCH_MEM_BUDGET headroom: the inline - // `pick_wpb` in `pippenger_round_parallel` returns `W_R` (the whole region) — running - // every window in a single batch — when `available_budget == 0`. The arena sizer must - // mirror that fallback exactly or the post-decision (P + W + S) cursor overflows the - // pre-Phase-1 buffer. Previously this branch returned `wpb = 1` and relied on a - // `worst_case_arena = BATCH_MEM_BUDGET + 32K` floor, but that floor is wrong: with large - // num_threads the fixed_overhead alone already exceeds BATCH_MEM_BUDGET and the floor - // does not cover `fixed_overhead + num_windows * per_window_bytes`. Bumping wpb to - // num_windows here makes the conservative_arena formula track the inline path's tight - // calc to within the per_window_bytes alignment slop. - size_t windows_per_batch = 0; - if (BATCH_MEM_BUDGET <= fixed_overhead) { - windows_per_batch = num_windows; - } else { - const size_t available_budget = BATCH_MEM_BUDGET - fixed_overhead; - windows_per_batch = std::max(1, available_budget / per_window_bytes); - } - windows_per_batch = std::min(windows_per_batch, num_windows); + // `solve_wpb` in `pippenger_round_parallel` returns `W_R` (the whole region) — running + // every window in a single batch — when `available_budget == 0`. 
Previously the sizer + // returned `wpb = 1` and relied on a `worst_case_arena = BATCH_MEM_BUDGET + 32K` floor; + // that floor failed for large num_threads where fixed_overhead alone exceeds the budget. + const size_t available_budget_outer = + (BATCH_MEM_BUDGET > fixed_overhead) ? (BATCH_MEM_BUDGET - fixed_overhead) : size_t{ 0 }; + const size_t windows_per_batch = + round_parallel_detail::solve_wpb(per_window_bytes, available_budget_outer, num_windows); // Dedup state lives in the arena (allocated post-Phase-1, retained through Stage 6a). // Worst-case sizes: redirect_lookup is one uint32 per working scalar (4n bytes); // extra_points is the fixed DEDUP_MAX_CLUSTERS cap (≈1 MB) regardless of n. const size_t dedup_bytes = dedup_active ? ((size_t{ 4 } * n) + (size_t{ sizeof(typename Curve::AffineElement) } * round_parallel_detail::DEDUP_MAX_CLUSTERS)) : size_t{ 0 }; + auto arena_bytes_for_window_layout = [&](size_t bit_budget) { + const size_t wb = round_parallel_detail::choose_window_bits( + n, bit_budget, n_input, num_logical_threads_for_c, /*use_rebalance=*/true); + const auto layout_sched = round_parallel_detail::build_var_window_schedule(bit_budget, wb); + size_t B_eff_layout = (size_t{ 1 } << (wb - 1)) + 1; + for (size_t w = 0; w < layout_sched.num_windows; ++w) { + B_eff_layout = std::max(B_eff_layout, static_cast(layout_sched.num_buckets[w])); + } + const size_t dense_stride_layout = round_parallel_detail::compute_dense_stride(B_eff_layout, num_threads); + const size_t per_window_bytes_layout = round_parallel_detail::compute_per_window_bytes( + num_threads, B_eff_layout, n, dense_stride_layout, worker_total_for_budget); + + const size_t available_budget = + (BATCH_MEM_BUDGET > fixed_overhead) ? 
(BATCH_MEM_BUDGET - fixed_overhead) : size_t{ 0 }; + const size_t wpb = round_parallel_detail::solve_wpb( + per_window_bytes_layout, available_budget, static_cast(layout_sched.num_windows)); + return fixed_overhead + (wpb * per_window_bytes_layout) + 32768 + dedup_bytes; + }; + // Tight return: the arena holds `fixed_overhead + wpb · per_window_bytes` of typed // buffers plus a 32 KiB alignment pad and the dedup state (when active). Sizing // tightly — rather than padding up to BATCH_MEM_BUDGET — matters for many-MSM flows @@ -2652,10 +1937,19 @@ inline size_t compute_arena_bytes_for_msm(size_t n_input, // `make_unique_for_overwrite` mmap/munmaps the buffer above glibc's // M_MMAP_THRESHOLD; a 32 MiB floor here would tax every MSM with the page-fault // first-touch cost regardless of how much of the arena the small MSM actually uses. - return fixed_overhead + (windows_per_batch * per_window_bytes) + 32768 + dedup_bytes; -} + size_t arena_bytes = fixed_overhead + (windows_per_batch * per_window_bytes) + 32768 + dedup_bytes; -} // namespace + // The live pipeline shrinks NUM_BITS to the observed max scalar bit before choosing + // window_bits. GLV MSMs and large non-GLV MSMs can therefore select a different + // schedule/zone layout than the full-bit pre-sizer. Keep the common Chonk wire/IPA + // non-GLV sizes on the original tight path. + if (use_glv || n_input >= (size_t{ 1 } << 17)) { + for (size_t bit_budget = 1; bit_budget <= NUM_BITS; ++bit_budget) { + arena_bytes = std::max(arena_bytes, arena_bytes_for_window_layout(bit_budget)); + } + } + return arena_bytes; +} // Round-parallel Pippenger MSM. 
// `external_glv_doubled` — optional caller-supplied [P_0, φP_0, …, P_{n-1}, φP_{n-1}] @@ -2714,15 +2008,15 @@ typename Curve::Element pippenger_round_parallel(PolynomialSpan scalars; std::span points; - const bool inline_glv_double = use_glv && external_glv_doubled.empty(); + const bool inline_glv_double = use_glv && !external_glv_provided; // Activation gate: caller-supplied hint opts this MSM into the dedup pre-pass. // Hint-driven so polynomials with low duplicate density (PC counters, range checks) @@ -2744,61 +2038,21 @@ typename Curve::Element pippenger_round_parallel(PolynomialSpan(n_input, !external_glv_doubled.empty(), dedup_active); - std::unique_ptr local_arena_owner; // NOLINT(cppcoreguidelines-avoid-c-arrays) - std::byte* arena_data = nullptr; - size_t arena_capacity = 0; - if (!external_arena.empty() && arena_total_bytes <= external_arena.size()) { - arena_data = external_arena.data(); - arena_capacity = external_arena.size(); - } else { - // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays) - local_arena_owner = std::make_unique_for_overwrite(arena_total_bytes); - arena_data = local_arena_owner.get(); - arena_capacity = arena_total_bytes; - } - // make_unique_for_overwrite only guarantees __STDCPP_DEFAULT_NEW_ALIGNMENT__ - // (typically 16 on x86_64), but Element / AffineElement are alignas(32) / alignas(64). - // Aligning the cursor isn't enough — the resulting pointer inherits the base's - // misalignment — so align in absolute address space. AVX vmovdqa against an Element* - // allocation otherwise raises #GP / SIGSEGV when the base is only 16-byte aligned. - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) - const auto arena_base_addr = reinterpret_cast(arena_data); - // The bump cursor below allocates the Phase 1 prologue slabs (Zone P prefix). 
Once - // Phase 1 finishes and the var-window split decision is made (T, B_eff, dense_stride, - // wpb), we freeze the prologue cursor and partition the remaining arena into named - // zones — see the Arena zone layout block further down. - size_t arena_cursor = 0; - auto bump_alloc_within = - [&](size_t count, size_t& cursor, size_t bound_bytes, size_t base_offset) -> std::span { - const size_t align = alignof(T); - const uintptr_t cur_addr = arena_base_addr + base_offset + cursor; - const uintptr_t aligned_addr = (cur_addr + align - 1) & ~(uintptr_t{ align } - 1); - const size_t aligned_local = static_cast(aligned_addr - (arena_base_addr + base_offset)); - const size_t bytes = count * sizeof(T); - BB_ASSERT_LTE(aligned_local + bytes, bound_bytes); - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) - T* p = reinterpret_cast(arena_data + base_offset + aligned_local); - cursor = aligned_local + bytes; - return std::span{ p, count }; - }; - auto arena_alloc = [&](size_t count) -> std::span { - return bump_alloc_within.template operator()(count, arena_cursor, arena_capacity, 0); - }; + const size_t arena_total_bytes = compute_arena_bytes_for_msm(n_input, external_glv_provided, dedup_active); + round_parallel_detail::MsmArena arena(arena_total_bytes, external_arena); // --------------------------------------------------------------------------------------- // Phase 1 — convert scalars from Montgomery, optionally GLV-split, populate msb buffer. - // The msb_per_scalar buffer feeds Item 1 (max-msb num_windows) and idx_large building; + // The msb_per_scalar buffer feeds max-msb num_windows selection; // per-thread msb_hist counts (bin 0 = zero, bin k+1 = msb == k) feed the n_active gate - // and the cost model in choose_split. + // and the active-scalar gate. 
// // When dedup is active the per-scalar dedup work (hash + linear-probe shared atomic // table, per-thread dup_pair recording) is fused into the same per-thread loop so @@ -2808,9 +2062,9 @@ typename Curve::Element pippenger_round_parallel(PolynomialSpan(1, bb::get_num_cpus()); - auto msb_per_scalar = arena_alloc.template operator()(n); - auto per_thread_msb_hist = arena_alloc.template operator()>(profile_threads); - // arena_alloc returns uninitialised memory; the histograms must be zero-initialised so + auto msb_per_scalar = arena.template alloc(n); + auto per_thread_msb_hist = arena.template alloc>(profile_threads); + // MsmArena::alloc returns uninitialised memory; the histograms must be zero-initialised so // record_msb's increments land on a clean slate. std::fill_n(per_thread_msb_hist.data(), profile_threads, std::array{}); @@ -2820,9 +2074,9 @@ typename Curve::Element pippenger_round_parallel(PolynomialSpan glv_scalars_storage; std::span glv_points_storage; if (use_glv) { - glv_scalars_storage = arena_alloc.template operator()(n); + glv_scalars_storage = arena.template alloc(n); if (inline_glv_double) { - glv_points_storage = arena_alloc.template operator()(n); + glv_points_storage = arena.template alloc(n); } else { BB_ASSERT_EQ(external_glv_doubled.size(), n); } @@ -2911,14 +2165,12 @@ typename Curve::Element pippenger_round_parallel(PolynomialSpan 1;) { --bin; @@ -2935,9 +2187,8 @@ typename Curve::Element pippenger_round_parallel(PolynomialSpan dedup_state; - // choose_var_window_split returns is_split=false for inputs that don't beat the unsplit cost - // model — the typical NO_SPLIT path then degenerates to a single-region uniform-window schedule. 
- auto var_window_decision = round_parallel_detail::choose_var_window_split( - msb_hist, n, effective_num_bits, n_input, num_logical_threads_for_c); - if (const char* force = std::getenv("VAR_WINDOW_FORCE_SPLIT")) { - size_t fb = 0; - size_t force_window_bits_lo = 0; - size_t force_window_bits_hi = 0; - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg, cert-err34-c, hicpp-vararg) — debug env-var parse - if (std::sscanf(force, "%zu,%zu,%zu", &fb, &force_window_bits_lo, &force_window_bits_hi) == 3 && fb > 0 && - fb < effective_num_bits) { - var_window_decision.is_split = true; - var_window_decision.b_star = fb; - var_window_decision.window_bits_lo = force_window_bits_lo; - var_window_decision.window_bits_hi = force_window_bits_hi; - } - } - // SPLIT iterates [0, n) in both regions with a per-region msb filter at the Stage 1 / - // Stage 4 inner loop — no idx_large vector is materialised. `upper_iter_threshold_msb` - // is captured here, BEFORE Item 4 may override `var_window_decision.b_star`: Item 4 collapses - // W_lo=0 but the upper region must still iterate the *original* large-scalar set. - // - // Threshold is msb >= b_star - 1 (NOT >= b_star). The Booth recoder shares bit - // (b_star - 1) between the lower region's last window and the upper region's first; - // excluding msb == b_star - 1 leaves the lower window's negative-signed digit - // uncancelled and the result drifts by 2^b_star. - size_t upper_iter_threshold_msb = 0; - size_t n_large = 0; - if (var_window_decision.is_split) { - const size_t b_star = var_window_decision.b_star; - upper_iter_threshold_msb = (b_star == 0) ? 0 : b_star - 1; - // n_large = scalars with msb >= upper_iter_threshold_msb. The msb_hist bin layout - // is bin (k+1) = scalars with msb == k (bin 0 = zero count). The boundary inclusion - // criterion msb >= b_star - 1 ⇔ msb_bin >= b_star, so we sum bins [b_star..255]. - // For b_star == 0 (no constraint), we sum bins [1..255] = all non-zero scalars. 
- const size_t lo_bin = (b_star == 0) ? 1 : b_star; - for (size_t b = lo_bin; b < 256; ++b) { - n_large += static_cast(msb_hist[b]); - } - } - - // If SPLIT fired but idx_small's per-thread slice is too thin to amortise pippenger's - // per-window pipeline, peel idx_small off into a straus_msm partial sum and rewrite the - // schedule as "idx_large only, full bit coverage" (b_star = 0 collapses W_lo to 0). - Element peeled_small_partial = Curve::Group::point_at_infinity; - bool peeled_small_active = false; - if (var_window_decision.is_split) { - BB_ASSERT_LTE(n_large, n_active_early); - const size_t n_small = n_active_early - n_large; - const size_t max_threads_for_check = bb::get_num_cpus(); - const size_t threads_for_check = std::max(1, std::min(n_small, max_threads_for_check)); - const size_t small_pts_per_thread = (n_small + threads_for_check - 1) / threads_for_check; - if (n_small > 0 && small_pts_per_thread < MIN_PTS_PER_THREAD_FOR_PIPPENGER) { - const size_t b_star_orig = var_window_decision.b_star; - const size_t threshold_orig = (b_star_orig == 0) ? 0 : b_star_orig - 1; - std::vector small_scalars_mont; - std::vector small_points; - small_scalars_mont.reserve(n_small); - small_points.reserve(n_small); - for (size_t i = 0; i < n; ++i) { - const uint8_t m = msb_per_scalar[i]; - if (m == MSB_ZERO_SENTINEL || static_cast(m) >= threshold_orig) { - continue; - } - // Honour dedup redirect for small-set scalars: cluster reps fetch from - // `extra_points[cid]` (the combined cluster point — must use the aggregate - // here, otherwise the small-set peel silently drops every duplicate's - // contribution). Non-reps are skipped — their points are already inside - // the rep's aggregate, processing them again would double-count. - // Phase A runs LATER in the pipeline (inside the batch loop, after arena - // setup), so when this peel runs `redirect_lookup` is unallocated and the - // dedup branch falls through to the plain path. 
The empty() guard is what - // makes that fall-through safe. - if (dedup_active && !dedup_state.redirect_lookup.empty()) { - const uint32_t r = dedup_state.redirect_lookup[i]; - if (r != round_parallel_detail::DEDUP_INVALID_EXTRA) { - if ((r & round_parallel_detail::DEDUP_SKIP_BIT) != 0) { - continue; - } - ScalarField s = scalars[i]; - s.self_to_montgomery_form(); - small_scalars_mont.push_back(s); - small_points.push_back( - dedup_state.extra_points[r & round_parallel_detail::SCHEDULE_INDEX_MASK]); - continue; - } - } - ScalarField s = scalars[i]; - s.self_to_montgomery_form(); - small_scalars_mont.push_back(s); - small_points.push_back(points[i]); - } - std::span sscs(small_scalars_mont.data(), small_scalars_mont.size()); - std::span spts(small_points.data(), small_points.size()); - PolynomialSpan ssp(0, sscs); - peeled_small_partial = trivial_msm_threaded(ssp, spts); - peeled_small_active = true; - - const size_t window_bits_large = round_parallel_detail::optimal_window_bits_for( - n_large, effective_num_bits, n_input, num_logical_threads_for_c); - var_window_decision.is_split = true; - var_window_decision.b_star = 0; - var_window_decision.window_bits_lo = window_bits_large; - var_window_decision.window_bits_hi = window_bits_large; - } - } - - const auto sched = - round_parallel_detail::build_var_window_schedule(var_window_decision, effective_num_bits, window_bits); + // Variable-window split was removed from the production path after Chonk traces showed + // it regressing this rewrite. Keep the schedule uniform and run one region over all + // non-zero scalars. 
+ const auto sched = round_parallel_detail::build_var_window_schedule(effective_num_bits, window_bits); BB_ASSERT_LTE(sched.num_windows, round_parallel_detail::VAR_WINDOW_MAX_WINDOWS, - "variable-window schedule exceeds compile-time max window count"); + "window schedule exceeds compile-time max window count"); using round_parallel_detail::BATCH_CAPACITY; constexpr size_t MIN_BATCH_CAPACITY = 32; @@ -3084,168 +2230,54 @@ typename Curve::Element pippenger_round_parallel(PolynomialSpan window_bits_unsplit when the lower region's bit budget makes a wider window - // optimal, so we can't assume num_buckets is the maximum. + // Per-(w, t) slot stride must fit the widest schedule window. size_t B_eff = num_buckets; for (size_t w = 0; w < sched.num_windows; ++w) { B_eff = std::max(B_eff, static_cast(sched.num_buckets[w])); } - // s.dense_buckets stride upper bound. Used both for the budget calculation and the - // arena allocation. Stage 6 always rebalances now: stride = next_pow2(⌈(B-1)/T⌉) - // where each Stage-6b task owns a uniform bucket-index slice. - const size_t dense_stride_est = - std::max(2, std::bit_ceil((B_eff > 1) ? ((B_eff - 1 + num_threads - 1) / num_threads) : size_t{ 1 })); - // Σ_t buckets_per_thread[t][w] per window. Each thread's slice covers a contiguous - // bucket-index range; adjacent threads may share a boundary bucket (counted twice - // in the sum). With T threads and T−1 possible shared boundaries, the sum is at - // most B + T − 1. For typical (uniform-random) scalar distributions, the sum is - // ≈ B; this bound is tight to within T. - const size_t bucket_partials_per_window_max = (B_eff > 0) ? (B_eff - 1 + num_threads - 1) : 0; - - // Per-region per-window bytes — schedule capacity differs by region. 
Lower iterates - // [0, n) directly so capacity_lo = n; upper also iterates [0, n) but only n_large - // entries pass the msb-threshold filter, so capacity_hi = n_large is a tight upper - // bound on the number of schedule entries Stage 4 will emit per upper-region window. - // Other per-window dimensions (digit_cursors/B_eff, bucket_partials, dense_buckets - // stride) are shared across regions and use B_eff. We size each region's - // per_window_bytes accurately so windows_per_batch_R can be picked per-region; the - // upper region can fit MUCH more windows per batch when n_large << n. const size_t worker_total_for_budget = num_threads; - // HIST slot — two non-coexisting lifetime classes share one byte slab per window: - // H (S1-S4): digit_cursors - // O (S6b-S7): chunk_outputs + window_partial_sums - // H dies before O is born (Stage 4's cursor advance ends before Stage 6b first writes - // chunk_outputs / window_partial_sums). Slot per-window = max(H, O). - // - // D-class (bucket_partials_dense + bucket_partials_present) used to overlay this - // slot at the D-region offset, but a 10× interleaved WASM Chonk bench showed Stage 6a - // regressed +1.29% (t=+58) due to L1 cache aliasing on the `dense[slot]/present[slot]` - // scatter writes when D sat at the HIST-overlaid offset (trace report: - // https://gist.github.com/AztecBot/8cc506ff429bdf5104fa02104c0e731b). D-class has its - // own dedicated Zone-S DENSE slot below; HIST keeps only H ↔ O. - // - // The single `digit_cursors` buffer carries the per-(w, t, d) Stage 1 counts AND the - // Stage 2 prefix-sum offsets (Stage 2 overwrites each slot with the offset Stage 4 - // needs as a cursor), so H sizes as one uint32 per (w, t, d). Phase 5 additionally - // folds the per-window per-digit totals into bucket_start_all[w][d+1] (its own Zone S - // slot, sized B_eff+1 per window) so Stage 3 can prefix-sum in place without a - // separate bucket_total_counts buffer. 
At chonk (T=32, c=12, B_eff=2049): - // H ≈ 4·32·2049 ≈ 256 KiB/window - // O ≈ (sizeof(ChunkOutput)+96)·32 ≈ 5 KiB/window - // D ≈ 65·2080 ≈ 135 KiB/window (in its own DENSE slot) - // so HIST_SLOT ≈ 256 KiB/window — H-bound. Per-window swing total grows by - // D_pw - max(0, D_pw - (H_pw - O_pw)) ≈ 135 KiB/window vs the pre-fix layout; this - // additional swing is paid for by isolating the Stage 6a scatter from the H/O bytes. - const size_t hist_h_bytes_pw_shared = (size_t{ 4 } * num_threads * B_eff); // digit_cursors - const size_t hist_o_bytes_pw_shared = - (sizeof(round_parallel_detail::ChunkOutput) * num_threads) // chunk_outputs - + (size_t{ 96 } * num_threads); // window_partial_sums - const size_t hist_slot_bytes_pw_shared = std::max(hist_h_bytes_pw_shared, hist_o_bytes_pw_shared); - // DENSE slot — dedicated Zone-S slot for the D-class buffers, isolated from the HIST - // slot's offset to avoid the L1 alias hot-spot on Stage 6a scatter writes. - const size_t dense_slot_bytes_pw_shared = - (size_t{ 65 } * bucket_partials_per_window_max); // bucket_partials_dense + bucket_partials_present - - const size_t per_window_bytes_shared = - hist_slot_bytes_pw_shared // HIST slot (H ∪ O) - + dense_slot_bytes_pw_shared // DENSE slot (D) - + (size_t{ 8 } * (B_eff + 1)) // bucket_start_all - + (size_t{ 8 } * (num_threads + 1)) // chunk_start_all - + (size_t{ 8 } * (num_threads + 1)) // chunk_bucket_lo_all - + (size_t{ 8 } * num_threads) // chunk_bucket_hi_all - + (size_t{ 8 } * num_threads) // orig_thread_lo - + (size_t{ 8 } * num_threads) // orig_thread_hi - + (size_t{ 16 } * worker_total_for_budget) // chunk_infos - + (size_t{ 8 } * num_threads) // bucket_partials_offsets - + (size_t{ 87 } * worker_total_for_budget * dense_stride_est); // s.dense_buckets + aux - - // Per-region schedule contribution: capacity_R uint32 entries per (window, region). - const size_t capacity_lo = n; // lower iterates [0, n) - const size_t capacity_hi = (sched.W_hi > 0) ? 
n_large : size_t{ 0 }; // upper emits at most n_large entries - const size_t per_window_bytes_lo = (size_t{ 4 } * capacity_lo) + per_window_bytes_shared; - const size_t per_window_bytes_hi = (size_t{ 4 } * capacity_hi) + per_window_bytes_shared; - - constexpr size_t PER_THREAD_CHUNK_CAPACITY_BYTES = - // SUBCHUNK_ENTRIES_CAP=2048, BATCH_CAPACITY=256: - (size_t{ 2048 } * size_t{ 64 }) // curr_pts (AffineElement) = 131072 - + (size_t{ 2048 } * size_t{ 4 }) // curr_buckets (uint32_t) = 8192 - + (size_t{ 2 } * size_t{ 256 } * size_t{ 64 }) // points_to_add = 32768 - + (size_t{ 256 } * size_t{ 32 }) // inversion_scratch (BaseField) = 8192 - + (size_t{ 256 } * size_t{ 4 }) // pair_dest (uint32_t) = 1024 - + size_t{ 328 }; // ThreadScratch struct overhead = 328 - // Per-OS-thread Stage 6a seam overflow scratch: at most ceil(max_chunk_len / SUBCHUNK_CAP) - // entries × (uint32 slot index + AffineElement). Scales with logical-task chunk size, - // not OS-thread count. - const size_t global_max_chunk_len_for_budget = (n + num_threads - 1) / num_threads; + const size_t dense_stride_est = round_parallel_detail::compute_dense_stride(B_eff, num_threads); + const size_t bucket_partials_per_window_max = + round_parallel_detail::compute_bucket_partials_max(B_eff, num_threads); + const size_t capacity_lo = n; + const size_t per_window_bytes_lo = round_parallel_detail::compute_per_window_bytes( + num_threads, B_eff, n, dense_stride_est, worker_total_for_budget); + const size_t global_max_overflow_per_window_for_budget = - (global_max_chunk_len_for_budget + SUBCHUNK_ENTRIES_CAP - 1) / SUBCHUNK_ENTRIES_CAP; - const size_t per_thread_overflow_bytes = (size_t{ 4 } + size_t{ 64 }) * global_max_overflow_per_window_for_budget; - - // Phase 1 prologue bytes living in the per-MSM arena — mirrors the formula in - // `compute_arena_bytes_for_msm`. Anyone adding a per-MSM arena buffer must update both - // sites or `windows_per_batch` drifts off the BATCH_MEM_BUDGET. 
- const size_t phase_one_prologue_bytes = n // msb_per_scalar - + (use_glv ? size_t{ 32 } * n : size_t{ 0 }) // glv_scalars_storage - + (inline_glv_double ? size_t{ 64 } * n : size_t{ 0 }) // glv_points_storage - + (profile_threads * size_t{ 1024 }); // per_thread_msb_hist - - // Per-worker PhaseA scratch slab (one per worker, allocated only when dedup_active). - // See `round_parallel_detail::PhaseAScratch` for cap rationale. - constexpr size_t PHASE_A_DIRTY_SLOTS_CAP = 4096; - constexpr size_t PHASE_A_BUCKET_REP_CAP = 256; - constexpr size_t PHASE_A_STAGED_CAP = 1024; - constexpr size_t PHASE_A_CHUNK_CAP = round_parallel_detail::DEDUP_MAX_CHUNK_MEMBERS; - // Per-worker cluster_members cap: n is a hard upper bound on cluster_members across - // all workers (each scalar contributes to at most one cluster_member entry), so - // min(DEDUP_MAX_MEMBERS, n) is exact and tighter than the constant for small-n MSMs. - // The publish-flatten step enforces this cap algorithmically: clusters that would - // overflow are skipped and fall through to the standard Stage 4/6a path with their - // original signed digits. - const size_t phase_a_cluster_members_cap = std::min(round_parallel_detail::DEDUP_MAX_MEMBERS, n); - // Per-worker cluster_offsets cap: clusters_opened is hard-capped at - // cids_per_thread = DEDUP_MAX_CLUSTERS / num_threads per worker; cluster_offsets - // holds clusters_opened + 1 entries. The +2 covers the leading-zero sentinel and - // the post-last terminator slot. 
- const size_t phase_a_cluster_offsets_cap = (round_parallel_detail::DEDUP_MAX_CLUSTERS / num_threads) + 2; - const size_t phase_a_per_worker_bytes = - (size_t{ 4 } * phase_a_cluster_members_cap) + (size_t{ 4 } * phase_a_cluster_offsets_cap) + - (size_t{ 2 } * PHASE_A_DIRTY_SLOTS_CAP) + (size_t{ 4 } * PHASE_A_BUCKET_REP_CAP) + - (size_t{ 8 } * PHASE_A_STAGED_CAP) + (sizeof(AffineElement) * PHASE_A_CHUNK_CAP) + - (size_t{ 4 } * PHASE_A_CHUNK_CAP); - - // Zone W per-worker UNION (see Arena zone layout block below). Stage 6a / Stage 6b - // ThreadScratch fixed fields and PhaseAScratch overlay the SAME per-worker bytes; the - // worker's slab consumes max(ts_fixed, phase_a) bytes, not the sum, because the three - // stages run in disjoint parallel_for invocations on each worker. - const size_t ts_fixed_bytes = PER_THREAD_CHUNK_CAPACITY_BYTES + per_thread_overflow_bytes; - const size_t worker_union_bytes_for_budget = - dedup_active ? std::max(ts_fixed_bytes, phase_a_per_worker_bytes) : ts_fixed_bytes; + round_parallel_detail::compute_global_max_overflow_per_window(n, num_threads, SUBCHUNK_ENTRIES_CAP); + + const size_t phase_one_prologue_bytes = + round_parallel_detail::compute_phase_one_prologue_bytes(n, use_glv, inline_glv_double, profile_threads); + + const auto phase_a_caps = round_parallel_detail::compute_phase_a_caps(n, num_threads); + const size_t phase_a_cluster_members_cap = phase_a_caps.members_cap; + const size_t phase_a_cluster_offsets_cap = phase_a_caps.offsets_cap; + + // Zone W per-worker UNION via the canonical layout walk. The wpb-dependent Stage 6 + // tail is added separately after `windows_per_batch` is solved; here we only need + // the union bytes for the fixed_overhead → wpb budget. 
+ const round_parallel_detail::PerWorkerArenaLayout budget_layout( + /*chunk_capacity=*/SUBCHUNK_ENTRIES_CAP, + global_max_overflow_per_window_for_budget, + dedup_active, + phase_a_cluster_members_cap, + phase_a_cluster_offsets_cap, + /*windows_per_batch=*/0, + /*dense_stride_est=*/0); + const size_t worker_union_bytes_for_budget = budget_layout.per_worker_union_bytes; const size_t fixed_overhead = (worker_union_bytes_for_budget * worker_total_for_budget) + (size_t{ 96 } * round_parallel_detail::VAR_WINDOW_MAX_WINDOWS) // window_sums_storage + (size_t{ 8 } * (num_threads + 1)) // rebalanced_bucket_lo_partition + phase_one_prologue_bytes; - // Solve `wpb_R · per_window_bytes_R ≤ BATCH_MEM_BUDGET − fixed_overhead` per region. - // For sparse upper regions per_window_bytes_hi is much smaller so wpb_hi can be much - // bigger, fitting the entire upper region in one batch and amortising parallel_for - // dispatch over the whole region. NO_SPLIT runs only the lower region (W_hi = 0). + // Solve `wpb · per_window_bytes ≤ BATCH_MEM_BUDGET − fixed_overhead`. const size_t available_budget = (BATCH_MEM_BUDGET > fixed_overhead) ? (BATCH_MEM_BUDGET - fixed_overhead) : size_t{ 0 }; - auto pick_wpb = [&](size_t per_window_bytes_R, size_t W_R) -> size_t { - if (W_R == 0) { - return 1; - } - if (per_window_bytes_R == 0 || available_budget == 0) { - return std::max(1, W_R); - } - return std::min(std::max(1, available_budget / per_window_bytes_R), W_R); - }; - const size_t windows_per_batch_lo = pick_wpb(per_window_bytes_lo, sched.W_lo); - const size_t windows_per_batch_hi = pick_wpb(per_window_bytes_hi, sched.W_hi); - const size_t windows_per_batch = std::max(windows_per_batch_lo, windows_per_batch_hi); + const size_t windows_per_batch_lo = + round_parallel_detail::solve_wpb(per_window_bytes_lo, available_budget, sched.num_windows); + const size_t windows_per_batch = windows_per_batch_lo; // Per-thread chunk-capacity scratch sizing. 
A thread's per-window slice is split into // sub-chunks of at most SUBCHUNK_ENTRIES_CAP entries. Worst-case overflow per @@ -3274,7 +2306,7 @@ typename Curve::Element pippenger_round_parallel(PolynomialSpan size_t { return (off + align - 1) & ~(align - 1); }; auto layout_add = [&](size_t& off, size_t bytes, size_t align) { off = align_up(off, align) + bytes; }; - // ThreadScratch fixed (curr_pts/curr_buckets/points_to_add/inversion_scratch/pair_dest/ - // overflow_slots/overflow_pts). Mirrors the alloc order below. - size_t ts_fixed_layout = 0; - layout_add(ts_fixed_layout, sizeof(AffineElement) * chunk_capacity, alignof(AffineElement)); - layout_add(ts_fixed_layout, sizeof(uint32_t) * chunk_capacity, alignof(uint32_t)); - layout_add(ts_fixed_layout, sizeof(AffineElement) * 2 * BATCH_CAPACITY, alignof(AffineElement)); - layout_add(ts_fixed_layout, sizeof(BaseField) * BATCH_CAPACITY, alignof(BaseField)); - layout_add(ts_fixed_layout, sizeof(uint32_t) * BATCH_CAPACITY, alignof(uint32_t)); - layout_add(ts_fixed_layout, sizeof(uint32_t) * global_max_overflow_per_window, alignof(uint32_t)); - layout_add(ts_fixed_layout, sizeof(AffineElement) * global_max_overflow_per_window, alignof(AffineElement)); - - // PhaseA layout (cluster_members/cluster_offsets/dirty_slots/bucket_rep/staged/chunk_pts/chunk_ids). 
- size_t pa_layout = 0; - if (dedup_active) { - layout_add(pa_layout, sizeof(uint32_t) * phase_a_cluster_members_cap, alignof(uint32_t)); - layout_add(pa_layout, sizeof(uint32_t) * phase_a_cluster_offsets_cap, alignof(uint32_t)); - layout_add(pa_layout, sizeof(uint16_t) * PHASE_A_DIRTY_SLOTS_CAP, alignof(uint16_t)); - layout_add(pa_layout, sizeof(uint32_t) * PHASE_A_BUCKET_REP_CAP, alignof(uint32_t)); - layout_add(pa_layout, - sizeof(std::pair) * PHASE_A_STAGED_CAP, - alignof(std::pair)); - layout_add(pa_layout, sizeof(AffineElement) * PHASE_A_CHUNK_CAP, alignof(AffineElement)); - layout_add(pa_layout, sizeof(uint32_t) * PHASE_A_CHUNK_CAP, alignof(uint32_t)); - } - - // Per-worker union: ThreadScratch fixed and PhaseA overlay the same bytes. Stage 6's - // wpb-dependent fields (dense_buckets / is_present / pair scratch / chunk_infos) sit - // immediately after the union, so each worker's slab = union + wpb-dependent tail. - // Use the worst-case AffineElement alignment between regions to avoid mid-slab - // misalignment when the next worker begins. - constexpr size_t WORKER_SLAB_ALIGN = alignof(AffineElement); - const size_t per_worker_union_bytes = align_up(std::max(ts_fixed_layout, pa_layout), WORKER_SLAB_ALIGN); - - // wpb-dependent per-worker tail (Stage 6 only — PhaseA has no per-wpb part). 
- size_t per_worker_per_wpb_layout = 0; - { - const size_t dense_total = windows_per_batch * dense_stride_est; - const size_t dense_pair_max = dense_total / 2; - layout_add(per_worker_per_wpb_layout, sizeof(AffineElement) * dense_total, alignof(AffineElement)); - layout_add(per_worker_per_wpb_layout, sizeof(uint8_t) * dense_total, alignof(uint8_t)); - layout_add(per_worker_per_wpb_layout, - sizeof(std::pair) * dense_pair_max, - alignof(std::pair)); - layout_add(per_worker_per_wpb_layout, sizeof(uint32_t) * dense_pair_max, alignof(uint32_t)); - layout_add(per_worker_per_wpb_layout, sizeof(BaseField) * dense_pair_max, alignof(BaseField)); - layout_add(per_worker_per_wpb_layout, - sizeof(round_parallel_detail::AffineBucketChunkInfo) * windows_per_batch, - alignof(round_parallel_detail::AffineBucketChunkInfo)); - } - const size_t per_worker_bytes = align_up(per_worker_union_bytes + per_worker_per_wpb_layout, WORKER_SLAB_ALIGN); + // Per-worker layout via the canonical walk (single source of truth shared with + // `compute_arena_bytes_for_msm`). Pre-wpb-solve usage there passes wpb=0; here we + // pass the actual windows_per_batch so the Stage 6 wpb-dependent tail is included. + const round_parallel_detail::PerWorkerArenaLayout worker_layout(chunk_capacity, + global_max_overflow_per_window, + dedup_active, + phase_a_cluster_members_cap, + phase_a_cluster_offsets_cap, + windows_per_batch, + dense_stride_est); + constexpr size_t WORKER_SLAB_ALIGN = round_parallel_detail::PerWorkerArenaLayout::WORKER_SLAB_ALIGN; + const size_t per_worker_union_bytes = worker_layout.per_worker_union_bytes; + const size_t per_worker_bytes = worker_layout.per_worker_bytes; // Zone P extra (post-decision permanent state): window_sums + dedup state. Sized // with the strict alignment a bump cursor would apply. 
@@ -3384,12 +2379,12 @@ typename Curve::Element pippenger_round_parallel(PolynomialSpan(arena_base_addr & (WORKER_SLAB_ALIGN - 1)); + // absolute address `arena.data + bytes_P` is 64-aligned. + const size_t arena_base_misalign = static_cast(arena.base_addr & (WORKER_SLAB_ALIGN - 1)); const size_t bytes_P_min = align_up(bytes_P_prefix, alignof(Element)) + bytes_P_extra_layout; const size_t bytes_P = align_up(bytes_P_min + arena_base_misalign, WORKER_SLAB_ALIGN) - arena_base_misalign; // bytes_W: per_worker_bytes is already rounded to WORKER_SLAB_ALIGN, so consecutive @@ -3399,8 +2394,8 @@ typename Curve::Element pippenger_round_parallel(PolynomialSpan(size_t count) -> std::span { - return bump_alloc_within.template operator()(count, zone_P_cursor, bytes_P, 0); + return arena.template bump_alloc(count, zone_P_cursor, bytes_P, 0); }; auto zone_S_alloc = [&](size_t count) -> std::span { - return bump_alloc_within.template operator()(count, zone_S_cursor, bytes_S_total, bytes_P + bytes_W); + return arena.template bump_alloc(count, zone_S_cursor, bytes_S_total, bytes_P + bytes_W); }; - // Zone W is carved into per-worker slabs directly via `bump_alloc_within` below — each + // Zone W is carved into per-worker slabs directly via `MsmArena::bump_alloc` below — each // worker gets its own (cursor, bound) pair, so a single zone-wide allocator would not // capture the per-worker discipline. - // The pre-Phase-1 `arena_alloc` cursor is retired here — every subsequent allocation + // The pre-Phase-1 `MsmArena::alloc` cursor is retired here — every subsequent allocation // routes through `zone_P_alloc`, the per-worker Zone W allocators, or `zone_S_alloc`. 
// Zone W: per-worker union slab — Stage6a/6b ThreadScratch and PhaseA fields overlay the @@ -3429,8 +2424,7 @@ typename Curve::Element pippenger_round_parallel(PolynomialSpan(size_t count) -> std::span { - return bump_alloc_within.template operator()( - count, ts_fixed_cur, per_worker_union_bytes, bytes_P + slab_base); + return arena.template bump_alloc(count, ts_fixed_cur, per_worker_union_bytes, bytes_P + slab_base); }; s.curr_pts = ts_fixed_alloc.template operator()(chunk_capacity); s.curr_buckets = ts_fixed_alloc.template operator()(chunk_capacity); @@ -3446,17 +2440,17 @@ typename Curve::Element pippenger_round_parallel(PolynomialSpan(size_t count) -> std::span { - return bump_alloc_within.template operator()( - count, pa_cur, per_worker_union_bytes, bytes_P + slab_base); + return arena.template bump_alloc(count, pa_cur, per_worker_union_bytes, bytes_P + slab_base); }; auto& ps = phase_a_scratch[t]; + using PWAL = round_parallel_detail::PerWorkerArenaLayout; ps.cluster_members = pa_alloc.template operator()(phase_a_cluster_members_cap); ps.cluster_offsets = pa_alloc.template operator()(phase_a_cluster_offsets_cap); - ps.dirty_slots = pa_alloc.template operator()(PHASE_A_DIRTY_SLOTS_CAP); - ps.bucket_rep = pa_alloc.template operator()(PHASE_A_BUCKET_REP_CAP); - ps.staged = pa_alloc.template operator()>(PHASE_A_STAGED_CAP); - ps.chunk_pts = pa_alloc.template operator()(PHASE_A_CHUNK_CAP); - ps.chunk_ids = pa_alloc.template operator()(PHASE_A_CHUNK_CAP); + ps.dirty_slots = pa_alloc.template operator()(PWAL::PHASE_A_DIRTY_SLOTS_CAP); + ps.bucket_rep = pa_alloc.template operator()(PWAL::PHASE_A_BUCKET_REP_CAP); + ps.staged = pa_alloc.template operator()>(PWAL::PHASE_A_STAGED_CAP); + ps.chunk_pts = pa_alloc.template operator()(PWAL::PHASE_A_CHUNK_CAP); + ps.chunk_ids = pa_alloc.template operator()(PWAL::PHASE_A_CHUNK_CAP); } // Stage 6 wpb-dependent fields — tail of the per-worker slab, BEYOND the union. 
Bound @@ -3464,7 +2458,7 @@ typename Curve::Element pippenger_round_parallel(PolynomialSpan(size_t count) -> std::span { - return bump_alloc_within.template operator()(count, ts_tail_cur, per_worker_bytes, bytes_P + slab_base); + return arena.template bump_alloc(count, ts_tail_cur, per_worker_bytes, bytes_P + slab_base); }; const size_t dense_total = windows_per_batch * dense_stride_est; const size_t dense_pair_max = dense_total / 2; @@ -3480,9 +2474,7 @@ typename Curve::Element pippenger_round_parallel(PolynomialSpan(schedule_total); // ----- HIST slot ------------------------------------------------------------------ @@ -3638,15 +2630,12 @@ typename Curve::Element pippenger_round_parallel(PolynomialSpan(windows_per_batch * num_threads); auto orig_thread_hi = zone_S_alloc.template operator()(windows_per_batch * num_threads); - // Zone P: window_sums (Stage 7 cross-region accumulator — survives the whole MSM). - // SPLIT can produce more windows than the unsplit num_windows (small window_bits_hi → many - // tight upper windows). Sizing to the compile-time VAR_WINDOW_MAX_WINDOWS cap (12 KiB) - // avoids a per-region resize. + // Zone P: window_sums (Stage 7 accumulator — survives the whole MSM). auto window_sums = zone_P_alloc.template operator()(VAR_WINDOW_WINDOW_SUMS_CAP); std::fill_n(window_sums.begin(), VAR_WINDOW_WINDOW_SUMS_CAP, Curve::Group::point_at_infinity); - // Zone P: dedup state — written by Phase A, read through Stage 6a of every batch and - // (when SPLIT fires) the upper region, so it must outlive every batch. + // Zone P: dedup state — written by Phase A and read through Stage 6a of every batch, + // so it must outlive every batch. // - redirect_lookup: parallel-filled with DEDUP_INVALID_EXTRA below before Phase A reads it. // - extra_points: no init needed; Phase A writes per-thread cid ranges, and consumers // only read indices Phase A actually populated. @@ -3673,32 +2662,16 @@ typename Curve::Element pippenger_round_parallel(PolynomialSpan