Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 55 additions & 2 deletions engine/baml-lib/jsonish/src/jsonish/parser/entry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use baml_types::CompletionState;
use super::ParseOptions;
use crate::jsonish::{
parser::{
fixing_parser,
fixing_parser::{self, contains_unicode_quote_char, QuoteParityMode},
markdown_parser::{self, MarkdownResult},
multi_json_parser,
},
Expand Down Expand Up @@ -174,7 +174,60 @@ pub(super) fn parse_func(str: &str, mut options: ParseOptions, is_done: bool) ->
}

if options.allow_fixes {
match fixing_parser::parse(str, &options) {
// Strict pass: today's behaviour. Every ASCII-only input returns
// identical results to before this change.
let strict = fixing_parser::parse(str, &options, QuoteParityMode::AsciiOnly);

// Unicode-parity pass: only meaningful when the input actually
// contains a unicode quote. Skipping it on pure-ASCII input keeps
// the common case bit-identical and avoids doubling `AnyOf` depth
// gratuitously.
let unicode = if contains_unicode_quote_char(str) {
log::debug!(
"jsonish: running AllUnicode parity pass for input with non-ASCII quote char"
);
fixing_parser::parse(str, &options, QuoteParityMode::AllUnicode).ok()
} else {
None
};

// Merge: Unicode-parity items first so that when strict and Unicode
// passes produce structurally equivalent candidates (identical score),
// the Unicode candidate wins the index tiebreaker. Strict items that
// duplicate a Unicode item (by structural `Value` equality) are
// dropped; unique strict items are appended. On pure-ASCII inputs the
// Unicode pass is skipped entirely and `merged` == `strict`, preserving
// today's behaviour exactly. Fix tags are whatever the fixing parser
// itself produced — no new variant.
let merged: Result<Vec<(Value, Vec<Fixes>)>> = match strict {
Ok(strict_items) => {
if let Some(unicode_items) = unicode {
// Unicode candidates come first; append strict items that
// are not already represented.
let mut merged = unicode_items;
for (v, fixes) in strict_items {
if merged.iter().any(|(existing, _)| existing == &v) {
continue;
}
merged.push((v, fixes));
}
Ok(merged)
} else {
// No Unicode pass (pure-ASCII input): identical to today.
Ok(strict_items)
}
}
Err(e) => {
// The strict pass errored; fall back to the Unicode pass
// alone if it succeeded.
match unicode {
Some(items) => Ok(items),
None => Err(e),
}
}
};

match merged {
Ok(items) => {
match items.len() {
0 => {}
Expand Down
93 changes: 86 additions & 7 deletions engine/baml-lib/jsonish/src/jsonish/parser/fixing_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,81 @@ use self::json_parse_state::JsonParseState;
use super::ParseOptions;
use crate::jsonish::{value::Fixes, Value};

pub fn parse(str: &str, _options: &ParseOptions) -> Result<Vec<(Value, Vec<Fixes>)>> {
/// Selects which quote codepoints feed the `unescaped_quote_count` parity
/// check that decides whether a `,` may close an ASCII-quoted string.
///
/// * [`QuoteParityMode::AsciiOnly`] — historical behaviour: only the ASCII
///   `"` bumps the counter.
/// * [`QuoteParityMode::AllUnicode`] — additionally counts the Unicode
///   quotes listed in `UNICODE_QUOTE_CHARS`.
///
/// Only the parity counter consults this mode; opener selection,
/// structural-delimiter handling, and escape processing are unaffected.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QuoteParityMode {
    AsciiOnly,
    AllUnicode,
}

/// Unicode quotes that count toward `unescaped_quote_count` under
/// `QuoteParityMode::AllUnicode`.
///
/// Only marks that function as *primary* (double-quote-level) delimiters
/// across languages are included. Single-quote-role marks — curly singles
/// (U+2018 / U+2019), the single low-9 and angle variants (U+201A, U+2039,
/// U+203A), and the CJK white corner brackets (U+300E / U+300F) — are
/// deliberately excluded:
///
/// 1. Parity counting exists to detect an unbalanced opener at the *outer*
///    delimiter level that would make a stray ASCII `"` look like a
///    closer. Single-role marks appear only *nested* inside a primary
///    quote, so they don't function at that level and counting them does
///    not disambiguate.
/// 2. U+2019 RIGHT SINGLE QUOTATION MARK is the standard typographic
///    apostrophe ("It's"). Counting it inside an ASCII-quoted string
///    makes common text like `"It's fine", …` look unbalanced and
///    prevents the real ASCII `"` from closing. CJK has the same
///    double/single distinction (「」 primary vs 『』 nested); 300E/300F
///    are excluded for the same reason.
///
/// | Language                          | Delimiters      | Codes            | Example                      |
/// |-----------------------------------|-----------------|------------------|------------------------------|
/// | English (US/UK)                   | `“ ”`           | 201C / 201D      | He said: “hello.”            |
/// | German — Gänsefüßchen             | `„ “`           | 201E / 201C      | Er sagte: „hallo.“           |
/// | German — Chevrons                 | `» «`           | 00BB / 00AB      | Er sagte: »hallo«.           |
/// | Polish                            | `„ ”`           | 201E / 201D      | Powiedział: „cześć”.         |
/// | Czech / Slovak                    | `„ “`           | 201E / 201C      | Řekl: „ahoj.“                |
/// | Hungarian                         | `„ ”`           | 201E / 201D      | Azt mondta: „szia”.          |
/// | French                            | `« »`           | 00AB / 00BB      | Il a dit : « bonjour ».      |
/// | Russian                           | `« »`           | 00AB / 00BB      | Он сказал: «привет».         |
/// | Spanish / Italian / Swiss / Greek | `« »`           | 00AB / 00BB      | Dijo: «hola».                |
/// | Swedish / Finnish                 | `” ”`           | 201D / 201D      | Han sade: ”hej.”             |
/// | Danish / Norwegian / Dutch        | `“ ”`           | 201C / 201D      | Han sagde: “hej.”            |
/// | Chinese (CN)                      | `“ ”`           | 201C / 201D      | 他说:“你好。”               |
/// | Japanese / Chinese (TW/HK)        | `「 」`         | 300C / 300D      | 彼は「こんにちは」と言った。 |
/// | Korean                            | `“ ”` or `「 」` | 201C/D or 300C/D | 그는 “안녕”이라고 말했다.    |
/// | Hebrew                            | `״`             | 05F4             | הוא אמר: ״שלום״.             |
/// | Arabic                            | `« »`           | 00AB / 00BB      | قال: «مرحبا».                |
pub(crate) const UNICODE_QUOTE_CHARS: &[char] = &[
    '\u{00AB}', // « LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
    '\u{00BB}', // » RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
    '\u{201C}', // “ LEFT DOUBLE QUOTATION MARK
    '\u{201D}', // ” RIGHT DOUBLE QUOTATION MARK
    '\u{201E}', // „ DOUBLE LOW-9 QUOTATION MARK
    '\u{300C}', // 「 LEFT CORNER BRACKET
    '\u{300D}', // 」 RIGHT CORNER BRACKET
    '\u{05F4}', // ״ HEBREW PUNCTUATION GERSHAYIM
];

/// Returns `true` if `s` contains at least one codepoint listed in
/// `UNICODE_QUOTE_CHARS`.
///
/// The entry cascade uses this to skip the `AllUnicode` parity pass when it
/// would be a no-op (pure-ASCII input).
pub fn contains_unicode_quote_char(s: &str) -> bool {
    // Scan the (small, fixed) quote table and probe the haystack for each
    // candidate, rather than walking `s` char-by-char against the table.
    UNICODE_QUOTE_CHARS.iter().any(|&quote| s.contains(quote))
}

pub fn parse(
str: &str,
_options: &ParseOptions,
quote_parity: QuoteParityMode,
) -> Result<Vec<(Value, Vec<Fixes>)>> {
// Try to fix some common JSON issues
// - Unquoted single word strings
// - Single quoted strings
Expand All @@ -32,7 +106,7 @@ pub fn parse(str: &str, _options: &ParseOptions) -> Result<Vec<(Value, Vec<Fixes
let mut chars = str.char_indices().peekable();
while let Some((count, c)) = chars.next() {
let peekable = str[count + c.len_utf8()..].char_indices().peekable();
match state.process_token(c, peekable) {
match state.process_token(c, peekable, quote_parity) {
Ok(increments) => {
for _ in 0..increments {
chars.next();
Expand Down Expand Up @@ -105,7 +179,7 @@ mod tests {
#[test]
fn test_partial_array() {
let opts = ParseOptions::default();
let vals = parse("[12", &opts).unwrap();
let vals = parse("[12", &opts, QuoteParityMode::AsciiOnly).unwrap();

match vals[0].0.clone() {
Value::Array(xs, array_cmplt) => {
Expand All @@ -126,7 +200,7 @@ mod tests {
#[test]
fn test_partial_object() {
let opts = ParseOptions::default();
let vals = parse(r#"{"a": 11, "b": 22"#, &opts).unwrap();
let vals = parse(r#"{"a": 11, "b": 22"#, &opts, QuoteParityMode::AsciiOnly).unwrap();
match &vals[0].0 {
Value::Object(fields, obj_cmplt) => {
assert_eq!(fields.len(), 2);
Expand All @@ -150,7 +224,12 @@ mod tests {
#[test]
fn test_partial_object_newlines() {
let opts = ParseOptions::default();
let vals = parse("{\n \"a\": 11, \n \"b\": 22", &opts).unwrap();
let vals = parse(
"{\n \"a\": 11, \n \"b\": 22",
&opts,
QuoteParityMode::AsciiOnly,
)
.unwrap();
match &vals[0].0 {
Value::Object(fields, obj_cmplt) => {
assert_eq!(fields.len(), 2);
Expand Down Expand Up @@ -186,7 +265,7 @@ mod tests {
// Without the fix, the off-by-one causes the last char to be
// re-processed, creating a spurious key-value pair.
let opts = ParseOptions::default();
let vals = parse(r#"{mykey"#, &opts).unwrap();
let vals = parse(r#"{mykey"#, &opts, QuoteParityMode::AsciiOnly).unwrap();
match &vals[0].0 {
Value::Object(fields, _) => {
assert_eq!(fields.len(), 0, "No complete key-value pair yet");
Expand All @@ -200,7 +279,7 @@ mod tests {
// InNothing: unquoted string at top level, stream ends without '{' or '['.
// Without the fix, the off-by-one corrupts parsing so no value is produced.
let opts = ParseOptions::default();
let vals = parse("foobar", &opts).unwrap();
let vals = parse("foobar", &opts, QuoteParityMode::AsciiOnly).unwrap();
match &vals[0].0 {
Value::String(s, cmplt) => {
assert_eq!(
Expand Down
Loading
Loading