From b49e137c93c52e71adf515511b44494cc174a993 Mon Sep 17 00:00:00 2001
From: Anthony Green <green@moxielogic.com>
Date: Fri, 21 Nov 2025 18:30:16 -0500
Subject: [PATCH] Fix UTF-16 surrogate pair validation

The UTF-16 decoder was incorrectly accepting low surrogates
(#xdc00-#xdfff) as the first word of a surrogate pair. According to
the UTF-16 specification, only high surrogates (#xd800-#xdbff) are
valid as the first word.

This fix changes the validation in both UTF-16 LE and BE decoders to
only accept the correct range of high surrogates, properly rejecting
invalid sequences where low surrogates appear as the first word.

Also add regression tests verifying that input whose first word is a
low surrogate is rejected with an encoding error.
---
 decode.lisp    |  4 ++--
 test/test.lisp | 16 +++++++++++++++-
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/decode.lisp b/decode.lisp
index 17be1bd..7565478 100644
--- a/decode.lisp
+++ b/decode.lisp
@@ -363,7 +363,7 @@ the form which is used to obtain the next octet."
         (declare (inline read-next-word))
         (let ((word (read-next-word)))
           (declare (type (unsigned-byte 16) word))
-          (cond ((<= #xd800 word #xdfff)
+          (cond ((<= #xd800 word #xdbff)
                  (let ((next-word (read-next-word)))
                    (declare (type (unsigned-byte 16) next-word))
                    (unless (<= #xdc00 next-word #xdfff)
@@ -394,7 +394,7 @@ the form which is used to obtain the next octet."
         (declare (inline read-next-word))
         (let ((word (read-next-word)))
           (declare (type (unsigned-byte 16) word))
-          (cond ((<= #xd800 word #xdfff)
+          (cond ((<= #xd800 word #xdbff)
                  (let ((next-word (read-next-word)))
                    (declare (type (unsigned-byte 16) next-word))
                    (unless (<= #xdc00 next-word #xdfff)
diff --git a/test/test.lisp b/test/test.lisp
index 1d88641..06dca6f 100644
--- a/test/test.lisp
+++ b/test/test.lisp
@@ -558,7 +558,21 @@ the external format EXTERNAL-FORMAT."
         (format t "~&Illegal code points"))
       (want-encoding-error #(#x00 #x00 #x11 #x00) :utf-32le)
       (want-encoding-error #(#x00 #xd8) :utf-16le)
-      (want-encoding-error #(#xff #xdf) :utf-16le))
+      (want-encoding-error #(#xff #xdf) :utf-16le)
+      (when verbose
+        (format t "~&Low surrogates cannot be the first word of a surrogate pair"))
+      ;; Regression tests: the UTF-16 decoders used to accept low surrogates
+      ;; (#xdc00-#xdfff) as the first word of a surrogate pair, although only
+      ;; high surrogates (#xd800-#xdbff) are valid there.  Input that starts
+      ;; with a low surrogate must be rejected, whatever word follows it.
+      (want-encoding-error #(#x00 #xdc #x00 #xdc) :utf-16le)
+      (want-encoding-error #(#x00 #xdc #xff #xdf) :utf-16le)
+      (want-encoding-error #(#xff #xdc #x00 #xdc) :utf-16le)
+      (want-encoding-error #(#x00 #xdd #x00 #xdd) :utf-16le)
+      (want-encoding-error #(#xff #xdf #xff #xdf) :utf-16le)
+      (want-encoding-error #(#xdc #x00 #xdc #x00) :utf-16be)
+      (want-encoding-error #(#xdf #xff #xdf #xff) :utf-16be)
+      (want-encoding-error #(#x00 #xdc #x41 #x00) :utf-16le))
     (macrolet ((want-encoding-error (input format)
                  `(with-expected-error (external-format-encoding-error)
                     (read-flexi-line* ,input ,format))))