Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/microsoft-fast-build/DESIGN.md
Original file line number Diff line number Diff line change
Expand Up @@ -421,7 +421,7 @@ A hand-rolled recursive-descent parser. No external crates.
| `n` | `null` |
| `-` / digit | `parse_number` |

`parse_string` handles `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`, and `\uXXXX` Unicode escapes.
`parse_string` handles `\"`, `\\`, `\/`, `\b`, `\f`, `\n`, `\r`, `\t`, and `\uXXXX` Unicode escapes. Non-ASCII literal characters (e.g. emoji, accented letters) are decoded as multi-byte UTF-8 sequences — the full byte sequence for each code point is consumed before advancing, avoiding the corruption that would result from casting individual bytes to `char`.

`parse_number` handles integer and decimal forms.

Expand Down
53 changes: 51 additions & 2 deletions crates/microsoft-fast-build/src/json.rs
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,23 @@ fn parse_string(input: &str) -> Result<(String, &str), JsonError> {
i += 1;
}
b => {
s.push(b as char);
i += 1;
// For ASCII bytes, cast directly to char.
// For multi-byte UTF-8 sequences (first byte >= 0x80), decode the
// full sequence — casting each byte independently corrupts non-ASCII
// characters such as emoji (e.g. ⭐ U+2B50 would become â + garbage).
if b < 0x80 {
s.push(b as char);
i += 1;
} else {
let seq_len = if b < 0xE0 { 2 } else if b < 0xF0 { 3 } else { 4 };
if i + seq_len > bytes.len() {
return Err(JsonError { message: "Invalid UTF-8 sequence in string".to_string() });
}
let ch_str = std::str::from_utf8(&bytes[i..i + seq_len])
.map_err(|_| JsonError { message: "Invalid UTF-8 sequence in string".to_string() })?;
s.push_str(ch_str);
i += seq_len;
}
}
}
}
Expand Down Expand Up @@ -278,6 +293,40 @@ mod tests {
}
}

#[test]
fn test_parse_string_with_emoji() {
    // ⭐ is U+2B50, which UTF-8 encodes as three bytes (0xE2 0xAD 0x90).
    // A parser that pushes each byte as its own `char` would emit â followed
    // by control characters instead of the star.
    let parsed = parse(r#""⭐ SELECTED""#).unwrap();
    match parsed {
        JsonValue::String(text) => assert_eq!(text, "⭐ SELECTED"),
        _ => panic!("Expected string"),
    }
}

#[test]
fn test_parse_string_with_multibyte_chars() {
    // Exercises every multi-byte UTF-8 width as literal (unescaped) characters:
    // 2-byte: é (U+00E9), 3-byte: ✓ (U+2713), 4-byte: 🎉 (U+1F389).
    //
    // The input must contain the real characters: a JSON `\uXXXX` escape would
    // go through the escape branch instead of the literal-byte branch.
    // The expected value is spelled with `\u{...}` escapes so that it stays
    // correct even if this file is ever re-saved with the wrong encoding; in
    // that case the test fails loudly instead of silently comparing mojibake
    // against mojibake.
    let v = parse(r#""café ✓ 🎉""#).unwrap();
    if let JsonValue::String(s) = v {
        assert_eq!(s, "caf\u{e9} \u{2713} \u{1F389}");
    } else {
        panic!("Expected string");
    }
}

#[test]
fn test_parse_string_emoji_in_object() {
    // Literal non-ASCII characters inside object values: a 3-byte star
    // (U+2B50) and a 4-byte party popper (U+1F389).
    // Expected values use `\u{...}` escapes so they do not depend on how this
    // file's bytes are decoded by an editor or tool.
    let v = parse(r#"{"label": "⭐ star", "emoji": "🎉"}"#).unwrap();
    if let JsonValue::Object(map) = v {
        assert_eq!(map["label"].to_display_string(), "\u{2B50} star");
        assert_eq!(map["emoji"].to_display_string(), "\u{1F389}");
    } else {
        panic!("Expected object");
    }
}

#[test]
fn test_parse_number() {
let v = parse("3.14").unwrap();
Expand Down
21 changes: 21 additions & 0 deletions crates/microsoft-fast-build/tests/bindings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -126,3 +126,24 @@ fn test_array_index_in_f_repeat_item() {
"<span>a</span><span>c</span>",
);
}

// ── non-ASCII / multi-byte UTF-8 content bindings ─────────────────────────────

#[test]
fn test_binding_emoji_from_state() {
    // An emoji supplied through the JSON state must come out of the
    // `{{label}}` binding unchanged.
    let rendered = ok("{{label}}", r#"{"label": "⭐ star"}"#);
    assert_eq!(rendered, "⭐ star");
}

#[test]
fn test_binding_emoji_literal_in_template() {
    // The emoji here is part of the template's own text rather than a bound
    // value; it has to survive verbatim next to the substituted `{{name}}`.
    let template = "<span>⭐ {{name}}</span>";
    let state = r#"{"name": "Alice"}"#;
    let rendered = ok(template, state);
    assert_eq!(rendered, "<span>⭐ Alice</span>");
}

#[test]
fn test_binding_multibyte_chars() {
    // State value mixes 2-byte (é, U+00E9), 3-byte (✓, U+2713) and 4-byte
    // (🎉, U+1F389) UTF-8 characters as literal text.
    // The expected output is written with `\u{...}` escapes so the assertion
    // stays meaningful even if this file's encoding is mangled.
    assert_eq!(
        ok("{{v}}", r#"{"v": "café ✓ 🎉"}"#),
        "caf\u{e9} \u{2713} \u{1F389}",
    );
}
Loading