diff --git a/decode.lisp b/decode.lisp index 17be1bd..7565478 100644 --- a/decode.lisp +++ b/decode.lisp @@ -363,7 +363,7 @@ the form which is used to obtain the next octet." (declare (inline read-next-word)) (let ((word (read-next-word))) (declare (type (unsigned-byte 16) word)) - (cond ((<= #xd800 word #xdfff) + (cond ((<= #xd800 word #xdbff) (let ((next-word (read-next-word))) (declare (type (unsigned-byte 16) next-word)) (unless (<= #xdc00 next-word #xdfff) @@ -394,7 +394,7 @@ the form which is used to obtain the next octet." (declare (inline read-next-word)) (let ((word (read-next-word))) (declare (type (unsigned-byte 16) word)) - (cond ((<= #xd800 word #xdfff) + (cond ((<= #xd800 word #xdbff) (let ((next-word (read-next-word))) (declare (type (unsigned-byte 16) next-word)) (unless (<= #xdc00 next-word #xdfff) diff --git a/test/test.lisp b/test/test.lisp index 1d88641..06dca6f 100644 --- a/test/test.lisp +++ b/test/test.lisp @@ -558,7 +558,21 @@ the external format EXTERNAL-FORMAT." (format t "~&Illegal code points")) (want-encoding-error #(#x00 #x00 #x11 #x00) :utf-32le) (want-encoding-error #(#x00 #xd8) :utf-16le) - (want-encoding-error #(#xff #xdf) :utf-16le)) + (want-encoding-error #(#xff #xdf) :utf-16le) + (when verbose + (format t "~&Low surrogates cannot be the first word of a surrogate pair")) + ;; Regression tests: the UTF-16 decoders used to accept low surrogates + ;; (#xdc00-#xdfff) as the first word of a pair; only high surrogates + ;; (#xd800-#xdbff) may start one. Every input below begins with a low + ;; surrogate and must be rejected, not decoded as a surrogate pair. 
+ (want-encoding-error #(#x00 #xdc #x00 #xdc) :utf-16le) + (want-encoding-error #(#x00 #xdc #xff #xdf) :utf-16le) + (want-encoding-error #(#xff #xdc #x00 #xdc) :utf-16le) + (want-encoding-error #(#x00 #xdd #x00 #xdd) :utf-16le) + (want-encoding-error #(#xff #xdf #xff #xdf) :utf-16le) + (want-encoding-error #(#xdc #x00 #xdc #x00) :utf-16be) + (want-encoding-error #(#xdf #xff #xdf #xff) :utf-16be) + (want-encoding-error #(#x00 #xdc #x41 #x00) :utf-16le)) (macrolet ((want-encoding-error (input format) `(with-expected-error (external-format-encoding-error) (read-flexi-line* ,input ,format))))