From 825327bbc2ea18f2b876a3c130998dfb1392260a Mon Sep 17 00:00:00 2001 From: xodn348 Date: Sun, 17 May 2026 11:20:11 +0000 Subject: [PATCH] fix(bpe): widen pair_counts from i32 to i64 to prevent overflow on large corpora MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On corpora where any token pair appears more than i32::MAX (~2.1B) times, the pair-count map wraps to a negative value and that pair is silently dropped from the merge queue even though it is the most frequent pair. Code corpora are especially susceptible because a run of spaces can push the (space, space) pair count well past the threshold. Widen the accumulator type from i32 to i64 (max ~9.2e18) throughout count_pairs and the per-word update in do_train: - AHashMap<Pair, i32> → AHashMap<Pair, i64> - counts[i] as i32 → counts[i] as i64 - change * counts[iw] as i32 → (change as i64) * counts[iw] as i64 No other behaviour changes; all 200 lib unit tests pass, rustfmt clean, clippy -D warnings clean. Fixes #2058 --- tokenizers/src/models/bpe/trainer.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tokenizers/src/models/bpe/trainer.rs b/tokenizers/src/models/bpe/trainer.rs index df68c655e..1714bfa74 100644 --- a/tokenizers/src/models/bpe/trainer.rs +++ b/tokenizers/src/models/bpe/trainer.rs @@ -416,7 +416,7 @@ impl BpeTrainer { words: &[Word], counts: &[u64], p: &Option, - ) -> (AHashMap, AHashMap>) { + ) -> (AHashMap, AHashMap>) { words .maybe_par_iter() .enumerate() @@ -429,7 +429,7 @@ impl BpeTrainer { // Initialize pair_counts and where_to_update for this pair if we just saw it // Then update counts - *pair_counts.entry(cur_pair).or_default() += counts[i] as i32; + *pair_counts.entry(cur_pair).or_default() += counts[i] as i64; where_to_update.entry(cur_pair).or_default().insert(i); } @@ -581,7 +581,7 @@ impl BpeTrainer { // Introduce new formed pairs for ((pair, change), iw) in changes { - let count = change * counts[iw] as i32; + let count = (change as 
i64) * counts[iw] as i64; *pair_counts.entry(pair).or_default() += count; if change > 0 { where_to_update.entry(pair).or_default().insert(iw);