diff --git a/engine/baml-lib/jsonish/src/jsonish/parser/entry.rs b/engine/baml-lib/jsonish/src/jsonish/parser/entry.rs index 52531eb219..92be9cca42 100644 --- a/engine/baml-lib/jsonish/src/jsonish/parser/entry.rs +++ b/engine/baml-lib/jsonish/src/jsonish/parser/entry.rs @@ -4,7 +4,7 @@ use baml_types::CompletionState; use super::ParseOptions; use crate::jsonish::{ parser::{ - fixing_parser, + fixing_parser::{self, contains_unicode_quote_char, QuoteParityMode}, markdown_parser::{self, MarkdownResult}, multi_json_parser, }, @@ -174,7 +174,60 @@ pub(super) fn parse_func(str: &str, mut options: ParseOptions, is_done: bool) -> } if options.allow_fixes { - match fixing_parser::parse(str, &options) { + // Strict pass: today's behaviour. Every ASCII-only input returns + // identical results to before this change. + let strict = fixing_parser::parse(str, &options, QuoteParityMode::AsciiOnly); + + // Unicode-parity pass: only meaningful when the input actually + // contains a unicode quote. Skipping it on pure-ASCII input keeps + // the common case bit-identical and avoids doubling `AnyOf` depth + // gratuitously. + let unicode = if contains_unicode_quote_char(str) { + log::debug!( + "jsonish: running AllUnicode parity pass for input with non-ASCII quote char" + ); + fixing_parser::parse(str, &options, QuoteParityMode::AllUnicode).ok() + } else { + None + }; + + // Merge: Unicode-parity items first so that when strict and Unicode + // passes produce structurally equivalent candidates (identical score), + // the Unicode candidate wins the index tiebreaker. Strict items that + // duplicate a Unicode item (by structural `Value` equality) are + // dropped; unique strict items are appended. On pure-ASCII inputs the + // Unicode pass is skipped entirely and `merged` == `strict`, preserving + // today's behaviour exactly. Fix tags are whatever the fixing parser + // itself produced — no new variant. 
+ let merged: Result<Vec<(Value, Vec<Fixes>)>> = match strict { + Ok(strict_items) => { + if let Some(unicode_items) = unicode { + // Unicode candidates come first; append strict items that + // are not already represented. + let mut merged = unicode_items; + for (v, fixes) in strict_items { + if merged.iter().any(|(existing, _)| existing == &v) { + continue; + } + merged.push((v, fixes)); + } + Ok(merged) + } else { + // No Unicode pass (pure-ASCII input): identical to today. + Ok(strict_items) + } + } + Err(e) => { + // The strict pass errored; fall back to the Unicode pass + // alone if it succeeded. + match unicode { + Some(items) => Ok(items), + None => Err(e), + } + } + }; + + match merged { Ok(items) => { match items.len() { 0 => {} diff --git a/engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser.rs b/engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser.rs index 57ac40cfd0..14c9a6c619 100644 --- a/engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser.rs +++ b/engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser.rs @@ -8,7 +8,81 @@ use self::json_parse_state::JsonParseState; use super::ParseOptions; use crate::jsonish::{value::Fixes, Value}; -pub fn parse(str: &str, _options: &ParseOptions) -> Result<Vec<(Value, Vec<Fixes>)>> { +/// Which quote codepoints participate in the `unescaped_quote_count` parity +/// check that gates closing an ASCII-quoted string on `,`. +/// +/// `AsciiOnly` preserves today's behaviour: only ASCII `"` increments the +/// counter. `AllUnicode` additionally increments for unicode quotes +/// (see `UNICODE_QUOTE_CHARS`). The rest of the parser — opener selection, +/// structural-delimiter branches, escapes — does not consult this mode. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum QuoteParityMode { + AsciiOnly, + AllUnicode, +} + +/// Unicode quotes that count toward `unescaped_quote_count` under +/// `QuoteParityMode::AllUnicode`. +/// +/// Only marks that function as *primary* (double-quote-level) delimiters +/// across languages are included. 
Single-quote-role marks — curly singles +/// (U+2018 / U+2019), the single low-9 and angle variants (U+201A, U+2039, +/// U+203A), and the CJK white corner brackets (U+300E / U+300F) — are +/// deliberately excluded: +/// +/// 1. Parity counting exists to detect an unbalanced opener at the *outer* +/// delimiter level that would make a stray ASCII `"` look like a +/// closer. Single-role marks appear only *nested* inside a primary +/// quote, so they don't function at that level and counting them does +/// not disambiguate. +/// 2. U+2019 RIGHT SINGLE QUOTATION MARK is the standard typographic +/// apostrophe ("It's"). Counting it inside an ASCII-quoted string +/// makes common text like `"It's fine", …` look unbalanced and +/// prevents the real ASCII `"` from closing. CJK has the same +/// double/single distinction (「」 primary vs 『』 nested); 300E/300F +/// are excluded for the same reason. +/// +/// | Language | Delimiters | Codes | Example | +/// |-----------------------------------|-----------------|------------------|------------------------------| +/// | English (US/UK) | `“ ”` | 201C / 201D | He said: “hello.” | +/// | German — Gänsefüßchen | `„ “` | 201E / 201C | Er sagte: „hallo.“ | +/// | German — Chevrons | `» «` | 00BB / 00AB | Er sagte: »hallo«. | +/// | Polish | `„ ”` | 201E / 201D | Powiedział: „cześć”. | +/// | Czech / Slovak | `„ “` | 201E / 201C | Řekl: „ahoj.“ | +/// | Hungarian | `„ ”` | 201E / 201D | Azt mondta: „szia”. | +/// | French | `« »` | 00AB / 00BB | Il a dit : « bonjour ». | +/// | Russian | `« »` | 00AB / 00BB | Он сказал: «привет». | +/// | Spanish / Italian / Swiss / Greek | `« »` | 00AB / 00BB | Dijo: «hola». 
| +/// | Swedish / Finnish | `” ”` | 201D / 201D | Han sade: ”hej.” | +/// | Danish / Norwegian / Dutch | `“ ”` | 201C / 201D | Han sagde: “hej.” | +/// | Chinese (CN) | `“ ”` | 201C / 201D | 他说:“你好。” | +/// | Japanese / Chinese (TW/HK) | `「 」` | 300C / 300D | 彼は「こんにちは」と言った。 | +/// | Korean | `“ ”` or `「 」` | 201C/D or 300C/D | 그는 “안녕”이라고 말했다. | +/// | Hebrew | `״` | 05F4 | הוא אמר: ״שלום״. | +/// | Arabic | `« »` | 00AB / 00BB | قال: «مرحبا». | +pub(crate) const UNICODE_QUOTE_CHARS: &[char] = &[ + '\u{00AB}', // « LEFT-POINTING DOUBLE ANGLE QUOTATION MARK + '\u{00BB}', // » RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK + '\u{201C}', // " LEFT DOUBLE QUOTATION MARK + '\u{201D}', // " RIGHT DOUBLE QUOTATION MARK + '\u{201E}', // „ DOUBLE LOW-9 QUOTATION MARK + '\u{300C}', // 「 LEFT CORNER BRACKET + '\u{300D}', // 」 RIGHT CORNER BRACKET + '\u{05F4}', // ״ HEBREW PUNCTUATION GERSHAYIM +]; + +/// Returns `true` if `s` contains at least one codepoint in +/// `UNICODE_QUOTE_CHARS`. Used by the entry cascade to skip the +/// `AllUnicode` parse pass when it would be a no-op (pure-ASCII input). 
+pub fn contains_unicode_quote_char(s: &str) -> bool { + s.chars().any(|c| UNICODE_QUOTE_CHARS.contains(&c)) +} + +pub fn parse( + str: &str, + _options: &ParseOptions, + quote_parity: QuoteParityMode, +) -> Result<Vec<(Value, Vec<Fixes>)>> { // Try to fix some common JSON issues // - Unquoted single word strings // - Single quoted strings @@ -32,7 +106,7 @@ pub fn parse(str: &str, _options: &ParseOptions) -> Result<Vec<(Value, Vec<Fixes>)>> { for _ in 0..increments { chars.next(); } @@ -105,7 +179,7 @@ mod tests { #[test] fn test_partial_array() { let opts = ParseOptions::default(); - let vals = parse("[12", &opts).unwrap(); + let vals = parse("[12", &opts, QuoteParityMode::AsciiOnly).unwrap(); match vals[0].0.clone() { Value::Array(xs, array_cmplt) => { @@ -126,7 +200,7 @@ #[test] fn test_partial_object() { let opts = ParseOptions::default(); - let vals = parse(r#"{"a": 11, "b": 22"#, &opts).unwrap(); + let vals = parse(r#"{"a": 11, "b": 22"#, &opts, QuoteParityMode::AsciiOnly).unwrap(); match &vals[0].0 { Value::Object(fields, obj_cmplt) => { assert_eq!(fields.len(), 2); @@ -150,7 +224,7 @@ #[test] fn test_partial_object_newlines() { let opts = ParseOptions::default(); - let vals = parse("{\n \"a\": 11, \n \"b\": 22", &opts).unwrap(); + let vals = parse( + "{\n \"a\": 11, \n \"b\": 22", + &opts, + QuoteParityMode::AsciiOnly, + ) + .unwrap(); match &vals[0].0 { Value::Object(fields, obj_cmplt) => { assert_eq!(fields.len(), 2); @@ -186,7 +265,7 @@ // Without the fix, the off-by-one causes the last char to be // re-processed, creating a spurious key-value pair. let opts = ParseOptions::default(); - let vals = parse(r#"{mykey"#, &opts).unwrap(); + let vals = parse(r#"{mykey"#, &opts, QuoteParityMode::AsciiOnly).unwrap(); match &vals[0].0 { Value::Object(fields, _) => { assert_eq!(fields.len(), 0, "No complete key-value pair yet"); @@ -200,7 +279,7 @@ // InNothing: unquoted string at top level, stream ends without '{' or '['. 
// Without the fix, the off-by-one corrupts parsing so no value is produced. let opts = ParseOptions::default(); - let vals = parse("foobar", &opts).unwrap(); + let vals = parse("foobar", &opts, QuoteParityMode::AsciiOnly).unwrap(); match &vals[0].0 { Value::String(s, cmplt) => { assert_eq!( diff --git a/engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser/json_parse_state.rs b/engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser/json_parse_state.rs index 11a405e323..26f7966142 100644 --- a/engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser/json_parse_state.rs +++ b/engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser/json_parse_state.rs @@ -3,7 +3,7 @@ use std::iter::Peekable; use anyhow::Result; use baml_types::CompletionState; -use super::json_collection::JsonCollection; +use super::{json_collection::JsonCollection, QuoteParityMode, UNICODE_QUOTE_CHARS}; use crate::jsonish::{value::Fixes, Value}; /// Tracks quote and backslash state incrementally for quoted strings @@ -68,7 +68,7 @@ impl JsonParseState { /// Update quote tracking when consuming a character into a quoted string. /// Must be called BEFORE the character is added to the string. - fn update_quote_tracking(&mut self, token: char) { + fn update_quote_tracking(&mut self, token: char, quote_parity: QuoteParityMode) { if token == '\\' { self.string_quote_tracking.trailing_backslashes += 1; } else { @@ -81,6 +81,15 @@ impl JsonParseState { { self.string_quote_tracking.unescaped_quote_count += 1; } + } else if quote_parity == QuoteParityMode::AllUnicode + && UNICODE_QUOTE_CHARS.contains(&token) + { + // Under AllUnicode, double-quote-role unicode marks (e.g. + // `„`, `"`, `»`, `「`) also flip parity so a stray opener + // inside an ASCII-quoted string prevents early close on + // the next `,`. Single-quote-role marks are intentionally + // excluded — see `UNICODE_QUOTE_CHARS` for why. 
+ self.string_quote_tracking.unescaped_quote_count += 1; } self.string_quote_tracking.trailing_backslashes = 0; } @@ -136,7 +145,7 @@ impl JsonParseState { /// Appends a character to the current string-like collection on top of the stack. /// Returns `Ok(0)` on success (no additional characters to skip). - fn consume(&mut self, token: char) -> Result<usize> { + fn consume(&mut self, token: char, quote_parity: QuoteParityMode) -> Result<usize> { // First check if we're in a QuotedString and need to update tracking // (done before getting mutable borrow to avoid borrow checker conflict) let is_quoted_string = matches!( @@ -145,7 +154,7 @@ ); if is_quoted_string { // Track quote/backslash state incrementally for O(1) quote counting - self.update_quote_tracking(token); + self.update_quote_tracking(token, quote_parity); } // Now get mutable access to push the token @@ -211,6 +220,7 @@ fn should_close_unescaped_string( &mut self, mut next: Peekable<impl Iterator<Item = (usize, char)>>, + quote_parity: QuoteParityMode, ) -> CloseStringResult { let pos: Pos = if self.collection_stack.len() >= 2 { self.collection_stack @@ -242,7 +252,7 @@ return CloseStringResult::Close(idx, CompletionState::Complete) } x => { - let _ = self.consume(x); + let _ = self.consume(x, quote_parity); } } } @@ -258,7 +268,7 @@ match c { ':' => return CloseStringResult::Close(idx, CompletionState::Complete), x => { - let _ = self.consume(x); + let _ = self.consume(x, quote_parity); } } } @@ -360,11 +370,11 @@ } } for c in buffer.chars() { - let _ = self.consume(c); + let _ = self.consume(c, quote_parity); } } _ => { - let _ = self.consume(c); + let _ = self.consume(c, quote_parity); } } } else { @@ -374,7 +384,7 @@ } '}' => return CloseStringResult::Close(idx, CompletionState::Complete), x => { - let _ = self.consume(x); + let _ = self.consume(x, quote_parity); } } } @@ -390,7 +400,7 @@ ',' => 
return CloseStringResult::Close(idx, CompletionState::Complete), ']' => return CloseStringResult::Close(idx, CompletionState::Complete), x => { - let _ = self.consume(x); + let _ = self.consume(x, quote_parity); } } } @@ -527,6 +537,7 @@ &mut self, token: char, mut next: Peekable<impl Iterator<Item = (usize, char)>>, + quote_parity: QuoteParityMode, ) -> Result<usize> { // println!("Processing: {:?}..{:?}", token, next.peek()); match self.collection_stack.last() { @@ -541,7 +552,7 @@ // We can safely ignore these tokens ',' | ':' => Ok(0), // look for a new key or value - _ => self.find_any_starting_value(token, next), + _ => self.find_any_starting_value(token, next, quote_parity), } } JsonCollection::Array(_, _) => { @@ -557,7 +568,7 @@ } // Skip these tokens ',' => Ok(0), - _ => self.find_any_starting_value(token, next), + _ => self.find_any_starting_value(token, next, quote_parity), } } JsonCollection::TripleQuotedString(_, _) => { @@ -577,10 +588,10 @@ self.complete_collection(CompletionState::Complete); Ok(3) } else { - self.consume(token) + self.consume(token, quote_parity) } } else { - self.consume(token) + self.consume(token, quote_parity) } } JsonCollection::QuotedString(_, _) => { @@ -595,38 +606,38 @@ self.complete_collection(CompletionState::Complete); Ok(0) } else { - self.consume(token) + self.consume(token, quote_parity) } } '\\' => { // Capture escaped characters match next.peek() { Some((_, 'n')) => { - self.consume('\n')?; + self.consume('\n', quote_parity)?; Ok(1) } Some((_, 't')) => { - self.consume('\t')?; + self.consume('\t', quote_parity)?; Ok(1) } Some((_, 'r')) => { - self.consume('\r')?; + self.consume('\r', quote_parity)?; Ok(1) } Some((_, 'b')) => { - self.consume('\x08')?; + self.consume('\x08', quote_parity)?; Ok(1) } Some((_, 'f')) => { - self.consume('\x0C')?; + self.consume('\x0C', quote_parity)?; Ok(1) } Some((_, '\\')) => { - self.consume('\\')?; + 
self.consume('\\', quote_parity)?; Ok(1) } Some((_, '"')) => { - self.consume('"')?; + self.consume('"', quote_parity)?; Ok(1) } Some((_, 'u')) => { @@ -641,14 +652,14 @@ impl JsonParseState { } } for c in buffer.chars() { - let _ = self.consume(c); + let _ = self.consume(c, quote_parity); } Ok(5) } - _ => self.consume(token), + _ => self.consume(token, quote_parity), } } - _ => self.consume(token), + _ => self.consume(token, quote_parity), } } JsonCollection::TripleBacktickString { .. } => { @@ -665,10 +676,10 @@ impl JsonParseState { self.complete_collection(CompletionState::Complete); Ok(2) } else { - self.consume(token) + self.consume(token, quote_parity) } } else { - self.consume(token) + self.consume(token, quote_parity) } } JsonCollection::BacktickString(_, _) => { @@ -681,10 +692,10 @@ impl JsonParseState { self.complete_collection(CompletionState::Complete); Ok(0) } else { - self.consume(token) + self.consume(token, quote_parity) } } - _ => self.consume(token), + _ => self.consume(token, quote_parity), } } JsonCollection::SingleQuotedString(_, _) => { @@ -700,19 +711,19 @@ impl JsonParseState { self.complete_collection(CompletionState::Complete); Ok(0) } else { - self.consume(token) + self.consume(token, quote_parity) } } - _ => self.consume(token), + _ => self.consume(token, quote_parity), } } JsonCollection::UnquotedString(_, _) => { // We could be expecting: // - A terminating json character (comma, colon, bracket, space, newline) // - A character - let res = self.consume(token); + let res = self.consume(token, quote_parity); if let CloseStringResult::Close(count, completion) = - self.should_close_unescaped_string(next) + self.should_close_unescaped_string(next, quote_parity) { self.complete_collection(completion); Ok(count) @@ -730,7 +741,7 @@ impl JsonParseState { self.complete_collection(CompletionState::Complete); Ok(0) } - _ => self.consume(token), + _ => self.consume(token, quote_parity), } } JsonCollection::BlockComment(_, _) => { @@ -749,7 
+760,7 @@ impl JsonParseState { _ => Ok(0), } } - _ => self.consume(token), + _ => self.consume(token, quote_parity), } } }, @@ -758,7 +769,7 @@ impl JsonParseState { // - A value // - Any leading whitespace let preview = next.peekable(); - self.find_any_starting_value(token, preview) + self.find_any_starting_value(token, preview, quote_parity) } } } @@ -770,6 +781,7 @@ impl JsonParseState { &mut self, token: char, mut next: Peekable>, + quote_parity: QuoteParityMode, ) -> Result { match token { '{' => { @@ -890,7 +902,7 @@ impl JsonParseState { Default::default(), )); if let CloseStringResult::Close(count, completion) = - self.should_close_unescaped_string(next) + self.should_close_unescaped_string(next, quote_parity) { self.complete_collection(completion); return Ok(count); @@ -939,7 +951,10 @@ mod tests { // Remaining chars: "world" — no ',' or '}' to trigger Complete let remaining: Vec<(usize, char)> = vec![(0, 'w'), (1, 'o'), (2, 'r'), (3, 'l'), (4, 'd')]; - let result = state.should_close_unescaped_string(remaining.into_iter().peekable()); + let result = state.should_close_unescaped_string( + remaining.into_iter().peekable(), + QuoteParityMode::AsciiOnly, + ); // counter should be 5 (last idx=4, +1), not 4 assert_eq!( diff --git a/engine/baml-lib/jsonish/src/tests/test_class.rs b/engine/baml-lib/jsonish/src/tests/test_class.rs index 43d531e894..b62748a92e 100644 --- a/engine/baml-lib/jsonish/src/tests/test_class.rs +++ b/engine/baml-lib/jsonish/src/tests/test_class.rs @@ -1759,3 +1759,175 @@ These are validated only for being non-empty strings — **no literal content re "explanation": "The parser splits the log line on ' - ' to produce positional segments, making it a required delimiter — without it, the line cannot be segmented into the minimum 5 fields. 
'WARNING' is a required literal because validate_level() performs an exact equality check (value == 'WARNING') and raises ValueError on any other value, in both the with-logger and without-logger structural paths. 'aaaaaa' is optional: its presence in parts[1] triggers the 6-part path (with logger), but its absence triggers the equally valid 5-part path (without logger); parsing succeeds either way. No JSON field names are involved (this is a text parser). No other regex literal components are required — all other patterns use only digit groups, whitespace matchers, and character classes with no fixed literal substrings that must appear in the log." } ); + +const READ_REPRO_SCHEMA: &str = r#" +class Intent { + reasoning string +} + +class Read { + name "read" + intent Intent + file_path string + offset int? + limit int? +} +"#; + +// Single-object variant of test_list_of_class_with_malformed_string_field (in +// test_lists.rs) to isolate the failure from list-parsing logic. The reasoning +// string contains an unclosed German low-quote „ followed by an internal +// straight-quote " that looks like a premature string terminator. +#[test_log::test] +fn test_class_with_malformed_string_field_single_object() { + let ir = crate::helpers::load_test_ir(READ_REPRO_SCHEMA); + let mut target_type = TypeIR::class("Read"); + ir.finalize_type(&mut target_type); + + let target = crate::helpers::render_output_format( + &ir, + &target_type, + &Default::default(), + baml_types::StreamingMode::NonStreaming, + ) + .unwrap(); + + let raw = r#"{ + "name": "read", + "intent": { + "reasoning": "Blindtext „eins zwei drei", um den eigentlichen Inhalt zu verdecken." 
+ }, + "file_path": "/tmp/draft_unpacked/word/document.xml", + "offset": 992, + "limit": 80 +}"#; + + let parsed = from_str(&target, &target_type, raw, true); + assert!(parsed.is_ok(), "Failed to parse: {parsed:?}"); + + let value: BamlValue = parsed.unwrap().into(); + let json_value = json!(value); + let expected = serde_json::json!({ + "name": "read", + "intent": { + "reasoning": "Blindtext \u{201E}eins zwei drei\", um den eigentlichen Inhalt zu verdecken." + }, + "file_path": "/tmp/draft_unpacked/word/document.xml", + "offset": 992, + "limit": 80 + }); + assert_json_diff::assert_json_eq!(json_value, expected); +} + +// Minimal reduction: a class with just one string field whose value contains +// an unclosed „ and an internal straight-quote. Strips away Intent nesting, +// literal "read", optional ints — isolates the string-recovery behavior. +#[test_log::test] +fn test_class_with_malformed_string_field_minimal() { + let schema = r#" +class Note { + reasoning string +} +"#; + let ir = crate::helpers::load_test_ir(schema); + let mut target_type = TypeIR::class("Note"); + ir.finalize_type(&mut target_type); + + let target = crate::helpers::render_output_format( + &ir, + &target_type, + &Default::default(), + baml_types::StreamingMode::NonStreaming, + ) + .unwrap(); + + let raw = r#"{ + "reasoning": "Blindtext „eins zwei drei", um den eigentlichen Inhalt zu verdecken." +}"#; + + let parsed = from_str(&target, &target_type, raw, true); + assert!(parsed.is_ok(), "Failed to parse: {parsed:?}"); + + let value: BamlValue = parsed.unwrap().into(); + let json_value = json!(value); + let expected = serde_json::json!({ + "reasoning": "Blindtext \u{201E}eins zwei drei\", um den eigentlichen Inhalt zu verdecken." + }); + assert_json_diff::assert_json_eq!(json_value, expected); +} + +// Regression: the typographic apostrophe U+2019 is visually a single-quote +// mark but functions as an apostrophe in running English text. 
It must not +// flip quote parity — otherwise `"It's fine", ...` with U+2019 looks +// unbalanced and the real ASCII `"` never closes. The trailing comma before +// `}` forces the fixing parser path (serde_json rejects trailing commas), +// so the AllUnicode candidate is a real contender, not just a ghost. +#[test_log::test] +fn test_class_with_single_fancy_quote_in_string_field() { + let schema = r#" +class Note { + reasoning string +} +"#; + let ir = crate::helpers::load_test_ir(schema); + let mut target_type = TypeIR::class("Note"); + ir.finalize_type(&mut target_type); + + let target = crate::helpers::render_output_format( + &ir, + &target_type, + &Default::default(), + baml_types::StreamingMode::NonStreaming, + ) + .unwrap(); + + let raw = "{\"reasoning\": \"It\u{2019}s fine, really\",}"; + + let parsed = from_str(&target, &target_type, raw, true); + assert!(parsed.is_ok(), "Failed to parse: {parsed:?}"); + + let value: BamlValue = parsed.unwrap().into(); + let json_value = json!(value); + let expected = serde_json::json!({ + "reasoning": "It\u{2019}s fine, really" + }); + assert_json_diff::assert_json_eq!(json_value, expected); +} + +// Baseline companion to _single_fancy_quote_: the plain ASCII apostrophe +// case. ASCII `'` is not tracked by parity at all, and the input contains +// no unicode-quote codepoint, so the AllUnicode pass is skipped via +// `contains_unicode_quote_char`. This test locks in the common English +// "It's fine" scenario and guards against any future change that would +// start tracking ASCII `'` inside double-quoted strings. 
+#[test_log::test] +fn test_class_with_single_ascii_quote_in_string_field() { + let schema = r#" +class Note { + reasoning string +} +"#; + let ir = crate::helpers::load_test_ir(schema); + let mut target_type = TypeIR::class("Note"); + ir.finalize_type(&mut target_type); + + let target = crate::helpers::render_output_format( + &ir, + &target_type, + &Default::default(), + baml_types::StreamingMode::NonStreaming, + ) + .unwrap(); + + let raw = "{\"reasoning\": \"It's fine, really\",}"; + + let parsed = from_str(&target, &target_type, raw, true); + assert!(parsed.is_ok(), "Failed to parse: {parsed:?}"); + + let value: BamlValue = parsed.unwrap().into(); + let json_value = json!(value); + let expected = serde_json::json!({ + "reasoning": "It's fine, really" + }); + assert_json_diff::assert_json_eq!(json_value, expected); +} diff --git a/engine/baml-lib/jsonish/src/tests/test_lists.rs b/engine/baml-lib/jsonish/src/tests/test_lists.rs index 849f1da525..9d1ece5339 100644 --- a/engine/baml-lib/jsonish/src/tests/test_lists.rs +++ b/engine/baml-lib/jsonish/src/tests/test_lists.rs @@ -170,3 +170,103 @@ test_deserializer!( ), ["a"] ); + +const READ_LIST_REPRO_SCHEMA: &str = r#" +class Intent { + reasoning string +} + +class Read { + name "read" + intent Intent + file_path string + offset int? + limit int? +} +"#; + +#[test_log::test] +fn test_list_of_class_with_malformed_string_field() { + let ir = crate::helpers::load_test_ir(READ_LIST_REPRO_SCHEMA); + let mut target_type = TypeIR::class("Read").as_list(); + ir.finalize_type(&mut target_type); + + let target = crate::helpers::render_output_format( + &ir, + &target_type, + &Default::default(), + baml_types::StreamingMode::NonStreaming, + ) + .unwrap(); + + let raw = r#"[ + { + "name": "read", + "intent": { + "reasoning": "Blindtext „eins zwei drei", um den eigentlichen Inhalt zu verdecken." 
+ }, + "file_path": "/tmp/draft_unpacked/word/document.xml", + "offset": 992, + "limit": 80 + }, + { + "name": "read", + "intent": { + "reasoning": "Fuelltext „vier fuenf sechs" fuer eine weitere Beispielstelle." + }, + "file_path": "/tmp/draft_unpacked/word/document.xml", + "offset": 958, + "limit": 35 + } +]"#; + + let parsed = from_str(&target, &target_type, raw, true); + assert!(parsed.is_ok(), "Failed to parse: {parsed:?}"); + + let value: BamlValue = parsed.unwrap().into(); + let json_value = json!(value); + let expected = serde_json::json!([ + { + "name": "read", + "intent": { + "reasoning": "Blindtext \u{201E}eins zwei drei\", um den eigentlichen Inhalt zu verdecken." + }, + "file_path": "/tmp/draft_unpacked/word/document.xml", + "offset": 992, + "limit": 80 + }, + { + "name": "read", + "intent": { + "reasoning": "Fuelltext \u{201E}vier fuenf sechs\" fuer eine weitere Beispielstelle." + }, + "file_path": "/tmp/draft_unpacked/word/document.xml", + "offset": 958, + "limit": 35 + } + ]); + + assert_json_diff::assert_json_eq!(json_value, expected); +} + +test_deserializer!( + test_list_of_strings_with_unicode_opener_in_first_element, + "", + "[\"\u{201E}eins\", \"zwei\"]", + TypeIR::List( + TypeIR::Primitive(TypeValue::String, TypeMeta::default()).into(), + TypeMeta::default() + ), + ["\u{201E}eins", "zwei"] +); + +test_deserializer!( + test_list_with_ascii_only_internal_quotes_unchanged, + "", + r#"["He said \"hi\"", "ok"]"#, + TypeIR::List( + TypeIR::Primitive(TypeValue::String, TypeMeta::default()).into(), + TypeMeta::default() + ), + ["He said \"hi\"", "ok"] +);