Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 55 additions & 2 deletions engine/baml-lib/jsonish/src/jsonish/parser/entry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use baml_types::CompletionState;
use super::ParseOptions;
use crate::jsonish::{
parser::{
fixing_parser,
fixing_parser::{self, contains_unicode_quote_char, QuoteParityMode},
markdown_parser::{self, MarkdownResult},
multi_json_parser,
},
Expand Down Expand Up @@ -174,7 +174,60 @@ pub(super) fn parse_func(str: &str, mut options: ParseOptions, is_done: bool) ->
}

if options.allow_fixes {
match fixing_parser::parse(str, &options) {
// Strict pass: today's behaviour. Every ASCII-only input returns
// identical results to before this change.
let strict = fixing_parser::parse(str, &options, QuoteParityMode::AsciiOnly);

// Unicode-parity pass: only meaningful when the input actually
// contains a unicode quote. Skipping it on pure-ASCII input keeps
// the common case bit-identical and avoids doubling `AnyOf` depth
// gratuitously.
let unicode = if contains_unicode_quote_char(str) {
log::debug!(
"jsonish: running AllUnicode parity pass for input with non-ASCII quote char"
);
fixing_parser::parse(str, &options, QuoteParityMode::AllUnicode).ok()
} else {
None
};

// Merge: Unicode-parity items first so that when strict and Unicode
// passes produce structurally equivalent candidates (identical score),
// the Unicode candidate wins the index tiebreaker. Strict items that
// duplicate a Unicode item (by structural `Value` equality) are
// dropped; unique strict items are appended. On pure-ASCII inputs the
// Unicode pass is skipped entirely and `merged` == `strict`, preserving
// today's behaviour exactly. Fix tags are whatever the fixing parser
// itself produced — no new variant.
let merged: Result<Vec<(Value, Vec<Fixes>)>> = match strict {
Ok(strict_items) => {
if let Some(unicode_items) = unicode {
// Unicode candidates come first; append strict items that
// are not already represented.
let mut merged = unicode_items;
for (v, fixes) in strict_items {
if merged.iter().any(|(existing, _)| existing == &v) {
continue;
}
merged.push((v, fixes));
}
Ok(merged)
} else {
// No Unicode pass (pure-ASCII input): identical to today.
Ok(strict_items)
}
}
Err(e) => {
// The strict pass errored; fall back to the Unicode pass
// alone if it succeeded.
match unicode {
Some(items) => Ok(items),
None => Err(e),
}
}
};

match merged {
Ok(items) => {
match items.len() {
0 => {}
Expand Down
93 changes: 86 additions & 7 deletions engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,81 @@ use self::json_parse_state::JsonParseState;
use super::ParseOptions;
use crate::jsonish::{value::Fixes, Value};

pub fn parse(str: &str, _options: &ParseOptions) -> Result<Vec<(Value, Vec<Fixes>)>> {
/// Selects which quote codepoints feed the `unescaped_quote_count` parity
/// check that decides whether a `,` may close an ASCII-quoted string.
///
/// * [`QuoteParityMode::AsciiOnly`] — historical behaviour: only the ASCII
///   `"` bumps the counter.
/// * [`QuoteParityMode::AllUnicode`] — additionally counts the Unicode
///   quotes listed in `UNICODE_QUOTE_CHARS`.
///
/// Only the parity counter consults this mode; opener selection,
/// structural-delimiter handling, and escape processing are unaffected.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QuoteParityMode {
    AsciiOnly,
    AllUnicode,
}

/// Unicode quotes that count toward `unescaped_quote_count` under
/// `QuoteParityMode::AllUnicode`.
///
/// Only marks that function as *primary* (double-quote-level) delimiters
/// across languages are included. Single-quote-role marks — curly singles
/// (U+2018 / U+2019), the single low-9 and angle variants (U+201A, U+2039,
/// U+203A), and the CJK white corner brackets (U+300E / U+300F) — are
/// deliberately excluded:
///
/// 1. Parity counting exists to detect an unbalanced opener at the *outer*
///    delimiter level that would make a stray ASCII `"` look like a
///    closer. Single-role marks appear only *nested* inside a primary
///    quote, so they don't function at that level and counting them does
///    not disambiguate.
/// 2. U+2019 RIGHT SINGLE QUOTATION MARK is the standard typographic
///    apostrophe ("It's"). Counting it inside an ASCII-quoted string
///    makes common text like `"It's fine", …` look unbalanced and
///    prevents the real ASCII `"` from closing. CJK has the same
///    double/single distinction (「」 primary vs 『』 nested); 300E/300F
///    are excluded for the same reason.
///
/// | Language                          | Delimiters      | Codes            | Example                      |
/// |-----------------------------------|-----------------|------------------|------------------------------|
/// | English (US/UK)                   | `“ ”`           | 201C / 201D      | He said: “hello.”            |
/// | German — Gänsefüßchen             | `„ “`           | 201E / 201C      | Er sagte: „hallo.“           |
/// | German — Chevrons                 | `» «`           | 00BB / 00AB      | Er sagte: »hallo«.           |
/// | Polish                            | `„ ”`           | 201E / 201D      | Powiedział: „cześć”.         |
/// | Czech / Slovak                    | `„ “`           | 201E / 201C      | Řekl: „ahoj.“                |
/// | Hungarian                         | `„ ”`           | 201E / 201D      | Azt mondta: „szia”.          |
/// | French                            | `« »`           | 00AB / 00BB      | Il a dit : « bonjour ».      |
/// | Russian                           | `« »`           | 00AB / 00BB      | Он сказал: «привет».         |
/// | Spanish / Italian / Swiss / Greek | `« »`           | 00AB / 00BB      | Dijo: «hola».                |
/// | Swedish / Finnish                 | `” ”`           | 201D / 201D      | Han sade: ”hej.”             |
/// | Danish / Norwegian / Dutch        | `“ ”`           | 201C / 201D      | Han sagde: “hej.”            |
/// | Chinese (CN)                      | `“ ”`           | 201C / 201D      | 他说:“你好。”               |
/// | Japanese / Chinese (TW/HK)        | `「 」`         | 300C / 300D      | 彼は「こんにちは」と言った。 |
/// | Korean                            | `“ ”` or `「 」` | 201C/D or 300C/D | 그는 “안녕”이라고 말했다.    |
/// | Hebrew                            | `״`             | 05F4             | הוא אמר: ״שלום״.             |
/// | Arabic                            | `« »`           | 00AB / 00BB      | قال: «مرحبا».                |
pub(crate) const UNICODE_QUOTE_CHARS: &[char] = &[
    '\u{00AB}', // « LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    '\u{00BB}', // » RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    '\u{201C}', // “ LEFT DOUBLE QUOTATION MARK
    '\u{201D}', // ” RIGHT DOUBLE QUOTATION MARK
    '\u{201E}', // „ DOUBLE LOW-9 QUOTATION MARK
    '\u{300C}', // 「 LEFT CORNER BRACKET
    '\u{300D}', // 」 RIGHT CORNER BRACKET
    '\u{05F4}', // ״ HEBREW PUNCTUATION GERSHAYIM
];

/// Returns `true` if `s` contains at least one codepoint listed in
/// `UNICODE_QUOTE_CHARS`.
///
/// The entry cascade uses this to skip the `AllUnicode` parity pass when it
/// would be a no-op (pure-ASCII input).
pub fn contains_unicode_quote_char(s: &str) -> bool {
    // Scan the (small, fixed) quote table and probe the haystack for each
    // candidate, rather than walking `s` char-by-char against the table.
    UNICODE_QUOTE_CHARS.iter().any(|&quote| s.contains(quote))
}

pub fn parse(
str: &str,
_options: &ParseOptions,
quote_parity: QuoteParityMode,
) -> Result<Vec<(Value, Vec<Fixes>)>> {
// Try to fix some common JSON issues
// - Unquoted single word strings
// - Single quoted strings
Expand All @@ -32,7 +106,7 @@ pub fn parse(str: &str, _options: &ParseOptions) -> Result<Vec<(Value, Vec<Fixes
let mut chars = str.char_indices().peekable();
while let Some((count, c)) = chars.next() {
let peekable = str[count + c.len_utf8()..].char_indices().peekable();
match state.process_token(c, peekable) {
match state.process_token(c, peekable, quote_parity) {
Ok(increments) => {
for _ in 0..increments {
chars.next();
Expand Down Expand Up @@ -105,7 +179,7 @@ mod tests {
#[test]
fn test_partial_array() {
let opts = ParseOptions::default();
let vals = parse("[12", &opts).unwrap();
let vals = parse("[12", &opts, QuoteParityMode::AsciiOnly).unwrap();

match vals[0].0.clone() {
Value::Array(xs, array_cmplt) => {
Expand All @@ -126,7 +200,7 @@ mod tests {
#[test]
fn test_partial_object() {
let opts = ParseOptions::default();
let vals = parse(r#"{"a": 11, "b": 22"#, &opts).unwrap();
let vals = parse(r#"{"a": 11, "b": 22"#, &opts, QuoteParityMode::AsciiOnly).unwrap();
match &vals[0].0 {
Value::Object(fields, obj_cmplt) => {
assert_eq!(fields.len(), 2);
Expand All @@ -150,7 +224,12 @@ mod tests {
#[test]
fn test_partial_object_newlines() {
let opts = ParseOptions::default();
let vals = parse("{\n \"a\": 11, \n \"b\": 22", &opts).unwrap();
let vals = parse(
"{\n \"a\": 11, \n \"b\": 22",
&opts,
QuoteParityMode::AsciiOnly,
)
.unwrap();
match &vals[0].0 {
Value::Object(fields, obj_cmplt) => {
assert_eq!(fields.len(), 2);
Expand Down Expand Up @@ -186,7 +265,7 @@ mod tests {
// Without the fix, the off-by-one causes the last char to be
// re-processed, creating a spurious key-value pair.
let opts = ParseOptions::default();
let vals = parse(r#"{mykey"#, &opts).unwrap();
let vals = parse(r#"{mykey"#, &opts, QuoteParityMode::AsciiOnly).unwrap();
match &vals[0].0 {
Value::Object(fields, _) => {
assert_eq!(fields.len(), 0, "No complete key-value pair yet");
Expand All @@ -200,7 +279,7 @@ mod tests {
// InNothing: unquoted string at top level, stream ends without '{' or '['.
// Without the fix, the off-by-one corrupts parsing so no value is produced.
let opts = ParseOptions::default();
let vals = parse("foobar", &opts).unwrap();
let vals = parse("foobar", &opts, QuoteParityMode::AsciiOnly).unwrap();
match &vals[0].0 {
Value::String(s, cmplt) => {
assert_eq!(
Expand Down
Loading
Loading