From d28a6bad209d49883bda2c05645d193be296e738 Mon Sep 17 00:00:00 2001 From: stargazerZJ <53366576+stargazerZJ@users.noreply.github.com> Date: Sat, 2 May 2026 00:26:05 +0800 Subject: [PATCH] Add scaling_bench: encode_batch vs worker-pool comparison (#1900) Standalone benchmark binary (`harness = false`) that compares `Tokenizer::encode_batch` against an explicit `rayon::ThreadPoolBuilder` + `par_iter()` worker pool over `Tokenizer::encode`, on the same data, and reports the ratio of their wall times. Deep-dive bench for maintainers and contributors working on the parallel-encode path; not wired into CI because the regression it measures only manifests on machines with many cores (>=16) under realistic batch shapes, which the `ubuntu-latest` benchmark runner cannot provide. The existing `ci_benchmark.rs::concurrent-4t` group remains the in-CI watchdog. Two synthesized workloads (random-letters, repeated-words) stress the encode hot path and the BPE-cache + memory-pressure path respectively; `--input ` overrides with a real corpus. No new dependencies (hand-rolled CLI parser); no library changes; no CI changes. Co-Authored-By: Claude Opus 4.7 (1M context) --- tokenizers/Cargo.toml | 4 + tokenizers/benches/scaling_bench.rs | 534 ++++++++++++++++++++++++++++ 2 files changed, 538 insertions(+) create mode 100644 tokenizers/benches/scaling_bench.rs diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml index d2841313e..07b2ef60e 100644 --- a/tokenizers/Cargo.toml +++ b/tokenizers/Cargo.toml @@ -66,6 +66,10 @@ harness = false name = "ci_benchmark" harness = false +[[bench]] +name = "scaling_bench" +harness = false + [dependencies] rand = "0.9" onig = { version = "6.5.1", default-features = false, optional = true } diff --git a/tokenizers/benches/scaling_bench.rs b/tokenizers/benches/scaling_bench.rs new file mode 100644 index 000000000..9a7be7c1d --- /dev/null +++ b/tokenizers/benches/scaling_bench.rs @@ -0,0 +1,534 @@ +//! Scaling bench for the `encode_batch` vs manual worker-pool comparison +//! described in https://github.com/huggingface/tokenizers/issues/1900. +//! +//! This is a deep-dive benchmark, not a CI watchdog. It is structured as a +//! standalone binary (with `harness = false`) rather than a criterion bench +//! because the regression it measures only manifests on machines with many +//! cores (≥16, ideally ≥64) under realistic batch shapes (large documents, +//! batch sizes ≥ 1024, hundreds of documents). The default GitHub Actions +//! `ubuntu-latest` runner has 4 vCPUs, where the asymmetry is small (≤1.2×) +//! and indistinguishable from noise; running the bench there would not +//! produce a useful regression signal. Maintainers run this manually on a +//! large machine; the existing `ci_benchmark.rs::concurrent-4t` group is +//! the in-CI watchdog. +//! +//! Two methods, both via the public Tokenizer API: +//! +//! * **`worker-pool`** — explicit `rayon::ThreadPoolBuilder` + `par_iter()` +//! over `Tokenizer::encode`. Each `Encoding` is consumed (token count +//! read, then dropped) inside the closure, so it is allocated and freed +//! on the same worker thread. +//! +//! * **`encode-batch`** — stock `Tokenizer::encode_batch` per chunk of +//! `--batch-size`. The returned `Vec` is iterated on the main +//! thread and dropped there. +//! +//! Headline metric: `encode_batch_elapsed / worker_pool_elapsed`. On a +//! 128-core x86_64 box with glibc, against current `main` (which already +//! includes the per-thread BPE cache fix from #2028), this is ~2×. After +//! @sebpop's planned `Encoding`-recycling work lands, it should drop to +//! ~1×. See https://github.com/stargazerZJ/tokenizers-1900-repro for a +//! standalone reproducer with drop-site A/B, allocator-swap, and +//! `MALLOC_ARENA_MAX` experiments separating the contributing effects. +//! +//! Two workloads (run by default; pick one with `--workload`): +//! +//! * **`random-letters`** — synthesized random a-zA-Z with no whitespace, +//! exercising the BPE merge path (low cache hit rate). +//! * **`repeated-words`** — synthesized short pseudo-words separated by +//! spaces, exercising the BPE cache (high cache hit rate). +//! +//! The two regimes respond differently to fixes; running only one risks +//! over-fitting future PRs to one and silently regressing the other. +//! +//! Run it: +//! +//! ```text +//! cargo bench --bench scaling_bench -- +//! cargo bench --bench scaling_bench -- --workers 32 --batch-size 1024 --count 1000 --length 8192 +//! cargo bench --bench scaling_bench -- --quick # small, finishes in < 30s +//! cargo bench --bench scaling_bench -- --workload repeated-words +//! cargo bench --bench scaling_bench -- --input data/big.txt # real corpus +//! ``` +//! +//! Allocator caveat: the magnitude of the ratio depends heavily on the +//! system allocator. On glibc the gap is widest; jemalloc roughly halves it; +//! mimalloc is workload-dependent. The *direction* of the asymmetry is +//! stable across allocators — that is the regression signal. + +use rayon::prelude::*; +use std::path::PathBuf; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::Arc; +use std::time::Instant; +use tokenizers::Tokenizer; + +// --------------------------------------------------------------------------- +// Defaults +// --------------------------------------------------------------------------- + +/// Default tokenizer (already fetched by `make bench` into `data/`). +const DEFAULT_TOKENIZER: &str = "data/llama-3-tokenizer.json"; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum Method { + WorkerPool, + EncodeBatch, + Both, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum Workload { + RandomLetters, + RepeatedWords, + Both, +} + +#[derive(Debug)] +struct Args { + tokenizer: PathBuf, + workers: usize, + batch_size: usize, + count: usize, + length: usize, + word_vocab_size: usize, + word_len: usize, + seed: u64, + method: Method, + workload: Workload, + input: Option, +} + +impl Args { + fn defaults() -> Self { + Self { + tokenizer: PathBuf::from(DEFAULT_TOKENIZER), + // 0 = num_cpus (resolved later) + workers: 0, + batch_size: 1024, + count: 500, + length: 51200, + word_vocab_size: 2048, + word_len: 8, + seed: 42, + method: Method::Both, + workload: Workload::Both, + input: None, + } + } + + fn quick() -> Self { + Self { + count: 100, + length: 8192, + batch_size: 256, + ..Self::defaults() + } + } + + fn parse() -> Result { + // First pass: was --quick passed? If so, start from quick defaults. + let raw: Vec = std::env::args().skip(1).collect(); + let mut args = if raw.iter().any(|a| a == "--quick") { + Self::quick() + } else { + Self::defaults() + }; + + let mut i = 0; + while i < raw.len() { + let a = &raw[i]; + let take_value = |label: &str, i: &mut usize| -> Result { + *i += 1; + raw.get(*i) + .cloned() + .ok_or_else(|| format!("{label} needs a value")) + }; + match a.as_str() { + "-h" | "--help" => { + print_help(); + std::process::exit(0); + } + "--quick" => { /* handled above */ } + "-t" | "--tokenizer" => { + args.tokenizer = PathBuf::from(take_value("--tokenizer", &mut i)?); + } + "-w" | "--workers" => { + args.workers = take_value("--workers", &mut i)? + .parse() + .map_err(|e| format!("--workers: {e}"))?; + } + "-b" | "--batch-size" => { + args.batch_size = take_value("--batch-size", &mut i)? + .parse() + .map_err(|e| format!("--batch-size: {e}"))?; + } + "--count" => { + args.count = take_value("--count", &mut i)? + .parse() + .map_err(|e| format!("--count: {e}"))?; + } + "--length" => { + args.length = take_value("--length", &mut i)? + .parse() + .map_err(|e| format!("--length: {e}"))?; + } + "--word-vocab-size" => { + args.word_vocab_size = take_value("--word-vocab-size", &mut i)? + .parse() + .map_err(|e| format!("--word-vocab-size: {e}"))?; + } + "--word-len" => { + args.word_len = take_value("--word-len", &mut i)? + .parse() + .map_err(|e| format!("--word-len: {e}"))?; + } + "--seed" => { + args.seed = take_value("--seed", &mut i)? + .parse() + .map_err(|e| format!("--seed: {e}"))?; + } + "--method" => { + args.method = match take_value("--method", &mut i)?.as_str() { + "worker-pool" | "worker_pool" => Method::WorkerPool, + "encode-batch" | "encode_batch" => Method::EncodeBatch, + "both" => Method::Both, + other => { + return Err(format!( + "--method: {other:?} (want worker-pool|encode-batch|both)" + )) + } + }; + } + "--workload" => { + args.workload = match take_value("--workload", &mut i)?.as_str() { + "random-letters" | "random_letters" => Workload::RandomLetters, + "repeated-words" | "repeated_words" => Workload::RepeatedWords, + "both" => Workload::Both, + other => { + return Err(format!( + "--workload: {other:?} \ + (want random-letters|repeated-words|both)" + )) + } + }; + } + "--input" => { + args.input = Some(PathBuf::from(take_value("--input", &mut i)?)); + } + // Criterion-compatibility: ignore noise from `cargo bench`. + "--bench" | "--test" | "--save-baseline" | "--baseline" => { + if a == "--save-baseline" || a == "--baseline" { + // these take values + let _ = take_value(a, &mut i); + } + } + other => return Err(format!("unknown argument: {other}")), + } + i += 1; + } + + if args.workers == 0 { + args.workers = std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(8); + } + Ok(args) + } +} + +fn print_help() { + eprintln!( + "scaling_bench — encode_batch vs worker-pool comparison for tokenizers#1900\n\ + \n\ + USAGE:\n \ + cargo bench --bench scaling_bench -- [OPTIONS]\n\ + \n\ + OPTIONS:\n \ + -t, --tokenizer Path to tokenizer.json [default: {DEFAULT_TOKENIZER}]\n \ + -w, --workers Worker threads (0 = num_cpus) [default: num_cpus]\n \ + -b, --batch-size Batch size for encode_batch [default: 1024]\n \ + --count Number of synthetic documents [default: 500]\n \ + --length Bytes per synthetic document [default: 51200]\n \ + --word-vocab-size Vocab size (repeated-words workload) [default: 2048]\n \ + --word-len Word length (repeated-words workload) [default: 8]\n \ + --seed PRNG seed [default: 42]\n \ + --method worker-pool|encode-batch|both [default: both]\n \ + --workload random-letters|repeated-words|both [default: both]\n \ + --input Use file lines instead of synthetic data\n \ + --quick Smaller defaults (~30s on a many-core box)\n \ + --help Show this help\n\ + \n\ + The headline metric is encode_batch_elapsed / worker_pool_elapsed.\n\ + See https://github.com/huggingface/tokenizers/issues/1900 for context\n\ + and https://github.com/stargazerZJ/tokenizers-1900-repro for the\n\ + standalone reproducer with allocator-swap and drop-site experiments." + ); +} + +// --------------------------------------------------------------------------- +// Synthetic data generators +// --------------------------------------------------------------------------- + +fn generate_random_letters(count: usize, length: usize, seed: u64) -> Vec { + use rand::{Rng, SeedableRng}; + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + eprintln!("[scaling_bench] generating {count} random-letters strings × {length} bytes"); + (0..count) + .map(|_| { + (0..length) + .map(|_| { + let idx: u8 = rng.random_range(0u8..52); + if idx < 26 { + (b'a' + idx) as char + } else { + (b'A' + idx - 26) as char + } + }) + .collect() + }) + .collect() +} + +fn generate_repeated_words( + count: usize, + length: usize, + seed: u64, + vocab_size: usize, + word_len: usize, +) -> Vec { + use rand::{Rng, SeedableRng}; + let mut rng = rand::rngs::StdRng::seed_from_u64(seed); + eprintln!( + "[scaling_bench] generating {count} repeated-words strings × {length} bytes \ + (vocab={vocab_size}, word_len={word_len})" + ); + let vocab: Vec = (0..vocab_size) + .map(|_| { + (0..word_len) + .map(|_| { + let idx: u8 = rng.random_range(0u8..52); + if idx < 26 { + (b'a' + idx) as char + } else { + (b'A' + idx - 26) as char + } + }) + .collect() + }) + .collect(); + + (0..count) + .map(|_| { + let mut s = String::with_capacity(length + word_len + 1); + while s.len() < length { + if !s.is_empty() { + s.push(' '); + } + s.push_str(&vocab[rng.random_range(0..vocab.len())]); + } + s.truncate(length); + s + }) + .collect() +} + +fn load_input_file(path: &std::path::Path, target_count: usize) -> Result, String> { + let data = std::fs::read_to_string(path).map_err(|e| format!("read {path:?}: {e}"))?; + let lines: Vec = data.lines().map(|s| s.to_string()).collect(); + if lines.is_empty() { + return Err(format!("{path:?}: no lines")); + } + eprintln!( + "[scaling_bench] loaded {} lines from {path:?}; cycling to reach {target_count} docs", + lines.len() + ); + Ok((0..target_count) + .map(|i| lines[i % lines.len()].clone()) + .collect()) +} + +// --------------------------------------------------------------------------- +// Stats accumulator +// --------------------------------------------------------------------------- + +struct Stats { + docs: AtomicUsize, + tokens: AtomicU64, + bytes: AtomicU64, + start: Instant, +} + +impl Stats { + fn new() -> Self { + Self { + docs: AtomicUsize::new(0), + tokens: AtomicU64::new(0), + bytes: AtomicU64::new(0), + start: Instant::now(), + } + } + fn add(&self, t: usize, b: usize) { + self.docs.fetch_add(1, Ordering::Relaxed); + self.tokens.fetch_add(t as u64, Ordering::Relaxed); + self.bytes.fetch_add(b as u64, Ordering::Relaxed); + } + fn report(&self, label: &str) -> f64 { + let elapsed = self.start.elapsed().as_secs_f64(); + let d = self.docs.load(Ordering::Relaxed); + let t = self.tokens.load(Ordering::Relaxed); + let b = self.bytes.load(Ordering::Relaxed) as f64; + println!( + "[{label}] docs={d} ({:.1}/s) tokens={t} ({:.1}/s) \ + bytes={:.2}MiB ({:.2}MiB/s) elapsed={:.2}s", + d as f64 / elapsed, + t as f64 / elapsed, + b / (1024.0 * 1024.0), + b / (1024.0 * 1024.0) / elapsed, + elapsed, + ); + elapsed + } +} + +// --------------------------------------------------------------------------- +// Methods +// --------------------------------------------------------------------------- + +/// Manual rayon worker-pool: encode in parallel, consume + drop each +/// `Encoding` inside the worker closure (alloc + free on the same thread). +fn run_worker_pool( + tokenizer: &Tokenizer, + texts: &[String], + num_workers: usize, +) -> Result { + let stats = Stats::new(); + let pool = rayon::ThreadPoolBuilder::new() + .num_threads(num_workers) + .build() + .map_err(|e| format!("rayon pool: {e}"))?; + + pool.install(|| { + texts.par_iter().for_each(|text| { + if let Ok(enc) = tokenizer.encode(text.as_str(), false) { + stats.add(enc.get_ids().len(), text.len()); + // enc dropped here, on the worker thread that allocated it. + } + }); + }); + + Ok(stats.report("worker_pool")) +} + +/// Stock `Tokenizer::encode_batch` per chunk; iterate the returned vec on +/// main and drop there. +fn run_encode_batch( + tokenizer: &Tokenizer, + texts: &[String], + batch_size: usize, +) -> Result { + let stats = Stats::new(); + for chunk in texts.chunks(batch_size) { + let inputs: Vec<&str> = chunk.iter().map(|s| s.as_str()).collect(); + let encs = tokenizer + .encode_batch(inputs, false) + .map_err(|e| format!("encode_batch: {e}"))?; + for (i, e) in encs.iter().enumerate() { + stats.add(e.get_ids().len(), chunk[i].len()); + } + // encs drops here, on main; this is the cross-thread free site. + } + Ok(stats.report("encode_batch")) +} + +// --------------------------------------------------------------------------- +// Driver +// --------------------------------------------------------------------------- + +fn run_workload( + label: &str, + args: &Args, + tokenizer: &Tokenizer, + texts: &[String], +) -> Result<(), String> { + println!("\n=== workload: {label} ==="); + let mut wp: Option = None; + let mut eb: Option = None; + + if matches!(args.method, Method::WorkerPool | Method::Both) { + wp = Some(run_worker_pool(tokenizer, texts, args.workers)?); + } + if matches!(args.method, Method::EncodeBatch | Method::Both) { + eb = Some(run_encode_batch(tokenizer, texts, args.batch_size)?); + } + + if let (Some(wp), Some(eb)) = (wp, eb) { + println!( + "[ratio:{label}] encode_batch / worker_pool = {:.2}× \ + ({eb:.2}s / {wp:.2}s)", + eb / wp, + ); + } + Ok(()) +} + +fn main() -> Result<(), String> { + let args = Args::parse().map_err(|e| { + eprintln!("{e}\n"); + print_help(); + e + })?; + + println!( + "[scaling_bench] tokenizer={:?} workers={} batch_size={} count={} length={} method={:?} workload={:?}", + args.tokenizer, args.workers, args.batch_size, args.count, args.length, args.method, args.workload, + ); + if !args.tokenizer.exists() { + return Err(format!( + "tokenizer file not found: {:?}\n\ + Run `make bench` from the tokenizers/ directory to fetch it, \ + or pass --tokenizer .", + args.tokenizer + )); + } + + let tokenizer = + Tokenizer::from_file(&args.tokenizer).map_err(|e| format!("load tokenizer: {e}"))?; + let tokenizer = Arc::new(tokenizer); + + let workloads: &[Workload] = match args.workload { + Workload::Both => &[Workload::RandomLetters, Workload::RepeatedWords], + single => std::slice::from_ref(match &single { + Workload::RandomLetters => &Workload::RandomLetters, + Workload::RepeatedWords => &Workload::RepeatedWords, + Workload::Both => unreachable!(), + }), + }; + + for &w in workloads { + let (label, texts) = match (w, &args.input) { + (_, Some(path)) => ("input-file", load_input_file(path, args.count)?), + (Workload::RandomLetters, _) => ( + "random-letters", + generate_random_letters(args.count, args.length, args.seed), + ), + (Workload::RepeatedWords, _) => ( + "repeated-words", + generate_repeated_words( + args.count, + args.length, + args.seed, + args.word_vocab_size, + args.word_len, + ), + ), + (Workload::Both, _) => unreachable!(), + }; + run_workload(label, &args, &tokenizer, &texts)?; + // If --input is set, the workload distinction is moot — only run once. + if args.input.is_some() { + break; + } + } + Ok(()) +}