Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/microsoft-fast-build/DESIGN.md
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,7 @@ A hand-rolled recursive-descent parser. No external crates.
| `n` | `null` |
| `-` / digit | `parse_number` |

`parse_string` handles `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`, and `\uXXXX` Unicode escapes.
`parse_string` handles `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`, and `\uXXXX` Unicode escapes. Non-ASCII literal characters (e.g. emoji, accented letters) are decoded as multi-byte UTF-8 sequences — the full byte sequence for each code point is consumed before advancing, avoiding the corruption that would result from casting individual bytes to `char`.

`parse_number` handles integer and decimal forms.

Expand Down
54 changes: 52 additions & 2 deletions crates/microsoft-fast-build/src/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,24 @@ fn parse_string(input: &str) -> Result<(String, &str), JsonError> {
i += 1;
}
b => {
s.push(b as char);
i += 1;
// For ASCII bytes, cast directly to char.
// For multi-byte UTF-8 sequences (first byte >= 0x80), decode the
// next character from the remaining bytes and advance by its actual
// UTF-8 length. This avoids inferring sequence width from lead-byte
// ranges that also include continuation or otherwise invalid bytes.
if b < 0x80 {
s.push(b as char);
i += 1;
} else {
let remaining = std::str::from_utf8(&bytes[i..]).map_err(|_| JsonError {
message: "Invalid UTF-8 sequence in string".to_string(),
})?;
let ch = remaining.chars().next().ok_or_else(|| JsonError {
message: "Invalid UTF-8 sequence in string".to_string(),
})?;
s.push(ch);
i += ch.len_utf8();
}
}
}
}
Expand Down Expand Up @@ -278,6 +294,40 @@ mod tests {
}
}

#[test]
fn test_parse_string_with_emoji() {
    // The star emoji ⭐ (U+2B50) occupies three UTF-8 bytes: 0xE2 0xAD 0x90.
    // A parser that pushes each byte as its own `char` would emit `â`
    // followed by stray non-printing characters instead of the emoji.
    match parse(r#""⭐ SELECTED""#).unwrap() {
        JsonValue::String(text) => assert_eq!(text, "⭐ SELECTED"),
        _ => panic!("Expected string"),
    }
}

#[test]
fn test_parse_string_with_multibyte_chars() {
    // Covers every multi-byte UTF-8 width in a single string literal:
    //   2-byte: é (U+00E9), 3-byte: ✓ (U+2713), 4-byte: 🎉 (U+1F389).
    // The literals must contain the real characters; a mis-encoded copy
    // would silently drop the 4-byte case from this test.
    let v = parse(r#""café ✓ 🎉""#).unwrap();
    if let JsonValue::String(s) = v {
        assert_eq!(s, "café ✓ 🎉");
    } else {
        panic!("Expected string");
    }
}

#[test]
fn test_parse_string_emoji_in_object() {
    // Multi-byte characters must survive when they appear inside object
    // values: ⭐ is a 3-byte sequence and 🎉 (U+1F389) a 4-byte one.
    let v = parse(r#"{"label": "⭐ star", "emoji": "🎉"}"#).unwrap();
    if let JsonValue::Object(map) = v {
        assert_eq!(map["label"].to_display_string(), "⭐ star");
        assert_eq!(map["emoji"].to_display_string(), "🎉");
    } else {
        panic!("Expected object");
    }
}

#[test]
fn test_parse_number() {
let v = parse("3.14").unwrap();
Expand Down
21 changes: 21 additions & 0 deletions crates/microsoft-fast-build/tests/bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,24 @@ fn test_array_index_in_f_repeat_item() {
"<span>a</span><span>c</span>",
);
}

// ── non-ASCII / multi-byte UTF-8 content bindings ─────────────────────────────

#[test]
fn test_binding_emoji_from_state() {
    // An emoji supplied through the JSON state must reach the rendered
    // output unchanged when substituted into a `{{...}}` binding.
    let rendered = ok("{{label}}", r#"{"label": "⭐ star"}"#);
    assert_eq!(rendered, "⭐ star");
}

#[test]
fn test_binding_emoji_literal_in_template() {
    // The emoji here lives in the template's literal text rather than in a
    // binding; it has to be copied through to the output verbatim.
    let template = "<span>⭐ {{name}}</span>";
    let state = r#"{"name": "Alice"}"#;
    assert_eq!(ok(template, state), "<span>⭐ Alice</span>");
}

#[test]
fn test_binding_multibyte_chars() {
    // State value mixes 2-byte (é), 3-byte (✓) and 4-byte (🎉) UTF-8
    // characters; the binding must render all of them intact.
    assert_eq!(ok("{{v}}", r#"{"v": "café ✓ 🎉"}"#), "café ✓ 🎉");
}
Loading