Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion bindings/node/lib/bindings/post-processors.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ describe('bertProcessing', () => {
})

it('throws if only one argument is provided', () => {
expect(() => (bertProcessing as any)(['sep', 1])).toThrow('Given napi value is not an array')
expect(() => (bertProcessing as any)(['sep', 1])).toThrow(
/Given napi value is not an array|Failed to get Array length/,
)
})

it('throws if arguments are malformed', () => {
Expand Down
2 changes: 1 addition & 1 deletion bindings/python/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ crate-type = ["cdylib"]
[dependencies]
rayon = "1.10"
serde = { version = "1.0", features = ["rc", "derive"] }
serde_json = "1.0"
serde_json = { version = "1.0", features = ["preserve_order"] }
libc = "0.2"
env_logger = "0.11"
pyo3 = { version = "=0.28.2", default-features = false, features = ["py-clone", "experimental-inspect"] }
Expand Down
77 changes: 77 additions & 0 deletions bindings/python/src/utils/serde_pyo3.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@ use serde::de::value::Error;
use serde::{ser, Serialize};
type Result<T> = ::std::result::Result<T, Error>;

/// Sentinel identifier of serde_json's `RawValue` serialization protocol.
///
/// A `RawValue` presents itself to a `Serializer` as a struct carrying this name, with
/// the pre-serialized JSON text as its payload. The core tokenizers crate relies on
/// `RawValue` to keep vocab/merges/added_tokens on a single line in tokenizer.json, so
/// the Python-repr serializer in this module has to recognise the sentinel, unwrap the
/// inner JSON and render it like any other value. Without that, the repr would leak
/// `$serde_json::private::RawValue(...)` markers.
const RAW_VALUE_TOKEN: &str = "$serde_json::private::RawValue";

pub struct Serializer {
// This string starts empty and JSON is appended as values are serialized.
output: String,
Expand All @@ -13,6 +20,13 @@ pub struct Serializer {
/// Maximum string representation
/// Useful to ellipsis precompiled_charmap
max_string: usize,
/// Set while we are inside a `serde_json::value::RawValue` struct: the next
/// `SerializeStruct::serialize_field` should treat its payload as raw JSON, parse it,
/// and recurse so the inner shape renders as Python repr instead of the raw token.
in_raw_value: bool,
/// Number of `SerializeStruct::end` calls that should suppress their closing paren
/// because they correspond to RawValue structs we already consumed.
raw_value_pending_ends: usize,
}

// By convention, the public API of a Serde serializer is one or more `to_abc`
Expand All @@ -34,6 +48,8 @@ where
max_elements,
num_elements: vec![0; max_depth],
max_string,
in_raw_value: false,
raw_value_pending_ends: 0,
};
value.serialize(&mut serializer)?;
Ok(serializer.output)
Expand All @@ -52,6 +68,8 @@ where
max_elements: 100,
num_elements: vec![0; max_depth],
max_string,
in_raw_value: false,
raw_value_pending_ends: 0,
};
value.serialize(&mut serializer)?;
Ok(serializer.output)
Expand Down Expand Up @@ -317,6 +335,14 @@ impl ser::Serializer for &mut Serializer {
// Deserialize implementation is required to know what the keys are without
// looking at the serialized data.
fn serialize_struct(self, name: &'static str, _len: usize) -> Result<Self::SerializeStruct> {
// serde_json's `RawValue` smuggles pre-serialized JSON through a fake struct with
// this magic name. Don't emit the marker — set a flag so the upcoming field is
// parsed back into a `serde_json::Value` and rendered through Self normally.
if name == RAW_VALUE_TOKEN {
self.in_raw_value = true;
self.raw_value_pending_ends += 1;
return Ok(self);
}
// self.serialize_map(Some(len))
// name.serialize(&mut *self)?;
if let Some(stripped) = name.strip_suffix("Helper") {
Expand Down Expand Up @@ -567,6 +593,19 @@ impl ser::SerializeStruct for &mut Serializer {
where
T: ?Sized + Serialize,
{
if self.in_raw_value && key == RAW_VALUE_TOKEN {
// Inside a serde_json::value::RawValue: the payload is a `&str` carrying raw
// JSON. Pull the str out via serde_json::to_value, parse it back into a Value,
// then recurse so the inner shape renders through Self in the normal way.
self.in_raw_value = false;
let v = serde_json::to_value(value).map_err(|e| ser::Error::custom(e.to_string()))?;
let raw_json = v
.as_str()
.ok_or_else(|| ser::Error::custom("RawValue payload was not a string"))?;
let parsed: serde_json::Value =
serde_json::from_str(raw_json).map_err(|e| ser::Error::custom(e.to_string()))?;
return parsed.serialize(&mut **self);
}
if !self.output.ends_with('(') {
self.output += ", ";
}
Expand All @@ -581,6 +620,12 @@ impl ser::SerializeStruct for &mut Serializer {
}

fn end(self) -> Result<()> {
if self.raw_value_pending_ends > 0 {
// This `end()` closes a RawValue struct whose body we already rendered above;
// skip the closing paren and the level decrement we never applied.
self.raw_value_pending_ends -= 1;
return Ok(());
}
self.num_elements[self.level] = 0;
self.level = self.level.saturating_sub(1);
self.output += ")";
Expand Down Expand Up @@ -624,6 +669,38 @@ fn test_basic() {
assert_eq!(to_string(&None::<usize>).unwrap(), "None");
}

#[test]
fn test_raw_value_unwraps_to_python_repr() {
    use serde_json::value::RawValue;

    #[derive(Serialize)]
    struct Outer<'a> {
        inline: &'a RawValue,
    }

    // Each case pairs a raw JSON payload with the Python-style repr it must produce.
    // The payload has to be rendered through Self, never as the
    // `$serde_json::private::RawValue(...)` marker that serde_json's protocol leaks.
    // NOTE(review): the last case expects object keys in input order, which presumably
    // relies on serde_json's `preserve_order` feature being enabled for this crate.
    let cases = [
        ("[]", "Outer(inline=[])"),
        ("{}", "Outer(inline={})"),
        (
            r#"[{"id":0,"content":"x","flag":true,"score":1.5,"none":null}]"#,
            r#"Outer(inline=[{"id":0, "content":"x", "flag":True, "score":1.5, "none":None}])"#,
        ),
    ];

    for (json, expected) in cases {
        let raw = RawValue::from_string(json.to_string()).unwrap();
        assert_eq!(to_string(&Outer { inline: &raw }).unwrap(), expected);
    }
}

#[test]
fn test_struct() {
#[derive(Serialize)]
Expand Down
2 changes: 1 addition & 1 deletion tokenizers/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ regex-syntax = "0.8"
rayon = "1.10"
rayon-cond = "0.4"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
serde_json = { version = "1.0", features = ["raw_value"] }
unicode-normalization-alignments = "0.1"
unicode_categories = "0.1"
unicode-segmentation = "1.11"
Expand Down
8 changes: 7 additions & 1 deletion tokenizers/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ dir_guard=@mkdir -p $(@D)

SHARED_RESOURCES = $(DATA_DIR)/gpt2-vocab.json $(DATA_DIR)/gpt2-merges.txt $(DATA_DIR)/bert-base-uncased-vocab.txt $(DATA_DIR)/big.txt $(DATA_DIR)/small.txt $(DATA_DIR)/albert-base-v1-tokenizer.json $(DATA_DIR)/llama-3-tokenizer.json
BENCHMARK_RESOURCES = $(SHARED_RESOURCES)
TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json
TESTS_RESOURCES = $(SHARED_RESOURCES) $(DATA_DIR)/unigram.json $(DATA_DIR)/unigram_wagahaiwa_nekodearu.txt $(DATA_DIR)/roberta.json $(DATA_DIR)/tokenizer-wiki.json $(DATA_DIR)/bert-wiki.json $(DATA_DIR)/deepseek-v4-flash-base-tokenizer.json

.PHONY : build
build :
Expand Down Expand Up @@ -94,3 +94,9 @@ $(DATA_DIR)/bert-wiki.json :
$(DATA_DIR)/llama-3-tokenizer.json :
$(dir_guard)
wget $(HF_TEST_DATA)/llama-3-tokenizer.json -O $@

# Old-format (pretty) tokenizer.json used to assert backward compatibility with
# files produced before vocab/merges/added_tokens were compacted on save.
$(DATA_DIR)/deepseek-v4-flash-base-tokenizer.json :
$(dir_guard)
wget https://huggingface.co/deepseek-ai/DeepSeek-V4-Flash-Base/resolve/main/tokenizer.json -O $@
22 changes: 21 additions & 1 deletion tokenizers/src/models/bpe/serialization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,25 @@ use serde::{
Deserialize, Deserializer, Serialize, Serializer,
};

/// Wraps the BPE merges so the entire array is emitted as a single compact line even when
/// the outer serializer is pretty. Avoids the multi-line per-pair indentation that bloats
/// tokenizer.json files.
struct CompactMerges<'a> {
merges: &'a [(String, String)],
}

impl Serialize for CompactMerges<'_> {
fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
where
S: Serializer,
{
let compact = serde_json::to_string(self.merges).map_err(serde::ser::Error::custom)?;
let raw =
serde_json::value::RawValue::from_string(compact).map_err(serde::ser::Error::custom)?;
raw.serialize(serializer)
}
}

impl Serialize for BPE {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
Expand Down Expand Up @@ -35,9 +54,10 @@ impl Serialize for BPE {
.map(|(pair, _)| (self.vocab_r[&pair.0].clone(), self.vocab_r[&pair.1].clone()))
.collect::<Vec<_>>();
let ordered_vocab = OrderedVocabIter::new(&self.vocab_r);
let compact_merges = CompactMerges { merges: &merges };

model.serialize_field("vocab", &ordered_vocab)?;
model.serialize_field("merges", &merges)?;
model.serialize_field("merges", &compact_merges)?;

model.end()
}
Expand Down
31 changes: 21 additions & 10 deletions tokenizers/src/models/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,24 +36,35 @@ impl Serialize for OrderedVocabIter<'_> {
{
// There could be holes so max + 1 is more correct than vocab_r.len()
let mut holes = vec![];
let result = if let Some(max) = self.vocab_r.keys().max() {
let iter = (0..*max + 1).filter_map(|i| {
let mut buf = String::from("{");
if let Some(max) = self.vocab_r.keys().max() {
let mut first = true;
for i in 0..*max + 1 {
if let Some(token) = self.vocab_r.get(&i) {
Some((token, i))
if !first {
buf.push(',');
}
first = false;
let key = serde_json::to_string(token).map_err(serde::ser::Error::custom)?;
buf.push_str(&key);
buf.push(':');
buf.push_str(&i.to_string());
} else {
holes.push(i);
None
}
});
serializer.collect_map(iter)
} else {
serializer.collect_map(std::iter::empty::<(&str, u32)>())
};
}
}
buf.push('}');

if !holes.is_empty() {
warn!("The OrderedVocab you are attempting to serialize contains holes for indices {holes:?}, your vocabulary could be corrupted!");
}
result

// Emit the vocab as a pre-serialized compact JSON value so that pretty-printers
// do not expand it across thousands of lines.
let raw =
serde_json::value::RawValue::from_string(buf).map_err(serde::ser::Error::custom)?;
raw.serialize(serializer)
}
}

Expand Down
13 changes: 6 additions & 7 deletions tokenizers/src/tokenizer/added_vocabulary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use super::{
use ahash::{AHashMap, AHashSet};
use daachorse::{DoubleArrayAhoCorasick, DoubleArrayAhoCorasickBuilder, MatchKind};
use regex::Regex;
use serde::{ser::SerializeSeq, Deserialize, Serialize, Serializer};
use serde::{Deserialize, Serialize, Serializer};
use std::sync::LazyLock;

/// Represent a token added by the user on top of the existing Model vocabulary.
Expand Down Expand Up @@ -595,12 +595,11 @@ impl Serialize for AddedVocabulary {
// We need to have these added tokens ordered by ascending ID
added_tokens.sort_unstable_by_key(|o| o.id);

let mut vocabulary = serializer.serialize_seq(Some(added_tokens.len()))?;
for token in added_tokens {
vocabulary.serialize_element(&token)?;
}

vocabulary.end()
// Serialize the whole array compactly so pretty output keeps it on a single line.
let compact = serde_json::to_string(&added_tokens).map_err(serde::ser::Error::custom)?;
let raw =
serde_json::value::RawValue::from_string(compact).map_err(serde::ser::Error::custom)?;
raw.serialize(serializer)
}
}

Expand Down
Loading
Loading