Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 43 additions & 38 deletions core/runtime/src/text/encodings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,61 +21,66 @@ pub(crate) mod utf8 {
}
}

/// Decodes an iterator of UTF-16 code units into a well-formed `JsString`,
/// replacing any unpaired surrogates with U+FFFD.
///
/// If `dangling_byte` is true and the last decoded code unit is not a high
/// surrogate (which would already have been replaced), an additional U+FFFD
/// is appended for the truncated trailing byte.
fn decode_utf16_units(
code_units: impl IntoIterator<Item = u16>,
dangling_byte: bool,
) -> boa_engine::JsString {
let mut string = String::new();
let mut last_code_unit = None;
string.extend(
std::char::decode_utf16(code_units.into_iter().inspect(|code_unit| {
last_code_unit = Some(*code_unit);
}))
.map(|result| result.unwrap_or('\u{FFFD}')),
);
let trailing_high_surrogate =
last_code_unit.is_some_and(|code_unit| (0xD800..=0xDBFF).contains(&code_unit));
if dangling_byte && !trailing_high_surrogate {
string.push('\u{FFFD}');
}
boa_engine::JsString::from(string)
}

pub(crate) mod utf16le {
use boa_engine::{JsString, js_string};
use boa_engine::JsString;

pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString {
if strip_bom {
input = input.strip_prefix(&[0xFF, 0xFE]).unwrap_or(input);
}

// After this point, input is of even length.
let dangling = if input.len().is_multiple_of(2) {
false
} else {
let dangling_byte = !input.len().is_multiple_of(2);
if dangling_byte {
input = &input[0..input.len() - 1];
true
};

let input: &[u16] = bytemuck::cast_slice(input);

if dangling {
JsString::from(&[JsString::from(input), js_string!("\u{FFFD}")])
} else {
JsString::from(input)
}

let code_units: &[u16] = bytemuck::cast_slice(input);
super::decode_utf16_units(code_units.iter().copied(), dangling_byte)
}
}

pub(crate) mod utf16be {
use boa_engine::{JsString, js_string};
use boa_engine::JsString;

pub(crate) fn decode(mut input: Vec<u8>, strip_bom: bool) -> JsString {
if strip_bom && input.starts_with(&[0xFE, 0xFF]) {
input.drain(..2);
pub(crate) fn decode(mut input: &[u8], strip_bom: bool) -> JsString {
if strip_bom && let Some(rest) = input.strip_prefix(&[0xFE, 0xFF]) {
input = rest;
}

let mut input = input.as_mut_slice();
// After this point, input is of even length.
let dangling = if input.len().is_multiple_of(2) {
false
} else {
let new_len = input.len() - 1;
input = &mut input[0..new_len];
true
};

let input: &mut [u16] = bytemuck::cast_slice_mut(input);

// Swap the bytes.
for b in &mut *input {
*b = b.swap_bytes();
let dangling_byte = !input.len().is_multiple_of(2);
if dangling_byte {
input = &input[0..input.len() - 1];
}

if dangling {
JsString::from(&[JsString::from(&*input), js_string!("\u{FFFD}")])
} else {
JsString::from(&*input)
}
let code_units = input
.chunks_exact(2)
.map(|pair| u16::from_be_bytes([pair[0], pair[1]]));
super::decode_utf16_units(code_units, dangling_byte)
}
}
5 changes: 1 addition & 4 deletions core/runtime/src/text/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,10 +200,7 @@ impl TextDecoder {
Ok(match self.encoding {
Encoding::Utf8 => encodings::utf8::decode(data, strip_bom),
Encoding::Utf16Le => encodings::utf16le::decode(data, strip_bom),
Encoding::Utf16Be => {
let owned = data.to_vec();
encodings::utf16be::decode(owned, strip_bom)
}
Encoding::Utf16Be => encodings::utf16be::decode(data, strip_bom),
})
}
}
Expand Down
66 changes: 66 additions & 0 deletions core/runtime/src/text/tests.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use super::encodings;
use crate::test::{TestAction, run_test_actions_with};
use crate::text;
use boa_engine::object::builtins::JsUint8Array;
Expand Down Expand Up @@ -476,3 +477,68 @@ fn decoder_handle_data_view_offset_and_length() {
context,
);
}

// Test cases from issue #4612: unpaired surrogates must be replaced with U+FFFD.
const INVALID_UTF16_CASES: &[(&[u16], &[u16])] = &[
// Lone high surrogate in the middle
(
&[0x0061, 0x0062, 0xD800, 0x0077, 0x0078],
&[0x0061, 0x0062, 0xFFFD, 0x0077, 0x0078],
),
// Lone high surrogate only
(&[0xD800], &[0xFFFD]),
// Two consecutive high surrogates
(&[0xD800, 0xD800], &[0xFFFD, 0xFFFD]),
// Lone low surrogate in the middle
(
&[0x0061, 0x0062, 0xDFFF, 0x0077, 0x0078],
&[0x0061, 0x0062, 0xFFFD, 0x0077, 0x0078],
),
// Low surrogate followed by high surrogate (wrong order)
(&[0xDFFF, 0xD800], &[0xFFFD, 0xFFFD]),
];

#[test]
fn decoder_utf16le_replaces_unpaired_surrogates() {
for (invalid, replaced) in INVALID_UTF16_CASES {
let mut input_bytes = Vec::with_capacity(invalid.len() * 2);
for &code_unit in *invalid {
input_bytes.extend_from_slice(&code_unit.to_le_bytes());
}

let result = encodings::utf16le::decode(&input_bytes, false);
let expected = JsString::from(*replaced);
assert_eq!(result, expected, "utf16le failed for input {invalid:?}");
}
}

#[test]
fn decoder_utf16be_replaces_unpaired_surrogates() {
for (invalid, replaced) in INVALID_UTF16_CASES {
let mut input_bytes = Vec::with_capacity(invalid.len() * 2);
for &code_unit in *invalid {
input_bytes.extend_from_slice(&code_unit.to_be_bytes());
}

let result = encodings::utf16be::decode(&input_bytes, false);
let expected = JsString::from(*replaced);
assert_eq!(result, expected, "utf16be failed for input {invalid:?}");
}
}

#[test]
fn decoder_utf16le_dangling_byte_produces_replacement() {
// Odd-length input: the last byte is truncated and replaced with U+FFFD
let input: &[u8] = &[0x41, 0x00, 0x42]; // 'A' (LE) + dangling byte
let result = encodings::utf16le::decode(input, false);
let expected = JsString::from(&[0x0041u16, 0xFFFD][..]);
assert_eq!(result, expected);
}

#[test]
fn decoder_utf16be_dangling_byte_produces_replacement() {
let input: &[u8] = &[0x00, 0x41, 0x42]; // 'A' (BE) + dangling byte
let result = encodings::utf16be::decode(input, false);
let expected = JsString::from(&[0x0041u16, 0xFFFD][..]);
assert_eq!(result, expected);
}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need these tests? Might be better to enable the utf16 decoding tests from the wpt suite instead, since those basically compile to the same kind of tests

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh yes, I've pushed a follow-up commit that enables the upstream WPT suite instead. Specifically:

->Enabled encoding/textdecoder-utf16-surrogates.any.js in tests/wpt/src/lib.rs, which covers both the replacement-mode cases and the fatal: true cases.

->Since the WPT test also exercises fatal, I implemented the fatal option on TextDecoder: added it to TextDecoderOptions, stored it on the struct, exposed the fatal

->getter, and threaded it through the decode helpers so decode() returns a TypeError on invalid input instead of inserting U+FFFD.

->Dropped the custom INVALID_UTF16_CASES Rust tests since the WPT file now covers the same ground.

Loading