From b49e137c93c52e71adf515511b44494cc174a993 Mon Sep 17 00:00:00 2001 From: Anthony Green Date: Fri, 21 Nov 2025 18:30:16 -0500 Subject: [PATCH] Fix UTF-16 surrogate pair validation The UTF-16 decoder was incorrectly accepting low surrogates (#xdc00-#xdfff) as the first word of a surrogate pair. According to the UTF-16 specification, only high surrogates (#xd800-#xdbff) are valid as the first word. This fix changes the validation in both UTF-16 LE and BE decoders to accept only the correct range of high surrogates, properly rejecting invalid sequences where low surrogates appear as the first word. It also adds test cases verifying that sequences beginning with a low surrogate are rejected with encoding errors. --- decode.lisp | 4 ++-- test/test.lisp | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/decode.lisp b/decode.lisp index 17be1bd..7565478 100644 --- a/decode.lisp +++ b/decode.lisp @@ -363,7 +363,7 @@ the form which is used to obtain the next octet." (declare (inline read-next-word)) (let ((word (read-next-word))) (declare (type (unsigned-byte 16) word)) - (cond ((<= #xd800 word #xdfff) + (cond ((<= #xd800 word #xdbff) (let ((next-word (read-next-word))) (declare (type (unsigned-byte 16) next-word)) (unless (<= #xdc00 next-word #xdfff) @@ -394,7 +394,7 @@ the form which is used to obtain the next octet." (declare (inline read-next-word)) (let ((word (read-next-word))) (declare (type (unsigned-byte 16) word)) - (cond ((<= #xd800 word #xdfff) + (cond ((<= #xd800 word #xdbff) (let ((next-word (read-next-word))) (declare (type (unsigned-byte 16) next-word)) (unless (<= #xdc00 next-word #xdfff) diff --git a/test/test.lisp b/test/test.lisp index 1d88641..06dca6f 100644 --- a/test/test.lisp +++ b/test/test.lisp @@ -558,7 +558,21 @@ the external format EXTERNAL-FORMAT." 
(format t "~&Illegal code points")) (want-encoding-error #(#x00 #x00 #x11 #x00) :utf-32le) (want-encoding-error #(#x00 #xd8) :utf-16le) - (want-encoding-error #(#xff #xdf) :utf-16le)) + (want-encoding-error #(#xff #xdf) :utf-16le) + (when verbose + (format t "~&Low surrogates cannot be the first word of a surrogate pair")) + ;; Regression tests: the UTF-16 decoders used to accept low surrogates + ;; (#xdc00-#xdfff) as the first word of a pair; only high surrogates + ;; (#xd800-#xdbff) are valid there. A leading low surrogate must signal an + ;; encoding error whether or not a low surrogate follows it. + (want-encoding-error #(#x00 #xdc #x00 #xdc) :utf-16le) + (want-encoding-error #(#x00 #xdc #xff #xdf) :utf-16le) + (want-encoding-error #(#xff #xdc #x00 #xdc) :utf-16le) + (want-encoding-error #(#x00 #xdd #x00 #xdd) :utf-16le) + (want-encoding-error #(#xff #xdf #xff #xdf) :utf-16le) + (want-encoding-error #(#xdc #x00 #xdc #x00) :utf-16be) + (want-encoding-error #(#xdf #xff #xdf #xff) :utf-16be) + (want-encoding-error #(#x00 #xdc #x41 #x00) :utf-16le)) (macrolet ((want-encoding-error (input format) `(with-expected-error (external-format-encoding-error) (read-flexi-line* ,input ,format))))