Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/decode.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@ describe("Decode test", () => {
});

describe("EntityDecoder", () => {
let callback: ReturnType<typeof vi.fn<(cp: number, consumed: number) => void>>;
let callback: ReturnType<
typeof vi.fn<(cp: number, consumed: number) => void>
>;
let decoder: entities.EntityDecoder;

beforeEach(() => {
Expand Down
4 changes: 2 additions & 2 deletions src/encode.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { getCodePoint, XML_BITSET_VALUE } from "./escape.js";
import { XML_BITSET_VALUE } from "./escape.js";
import { htmlTrie } from "./generated/encode-html.js";

/**
Expand Down Expand Up @@ -79,7 +79,7 @@ function encodeHTMLTrieRe(bitset: Uint32Array, input: string): string {
}

if (node === undefined) {
const cp = getCodePoint(input, index);
const cp = input.codePointAt(index)!;
out += `&#x${cp.toString(16)};`;
if (cp !== char) index++;
last = index + 1;
Expand Down
112 changes: 43 additions & 69 deletions src/escape.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,6 @@ const xmlCodeMap = new Map([
[62, "&gt;"],
]);

// For compatibility with node < 4, we wrap `codePointAt`
/**
* Read a code point at a given index.
* @param input Input string to encode or decode.
* @param index Current read position in the input string.
*/
export const getCodePoint: (c: string, index: number) => number =
typeof String.prototype.codePointAt === "function"
? (input: string, index: number): number => input.codePointAt(index)!
: // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
(c: string, index: number): number =>
(c.charCodeAt(index) & 0xfc_00) === 0xd8_00
? (c.charCodeAt(index) - 0xd8_00) * 0x4_00 +
c.charCodeAt(index + 1) -
0xdc_00 +
0x1_00_00
: c.charCodeAt(index);

/**
* Bitset for ASCII characters that need to be escaped in XML.
*/
Expand Down Expand Up @@ -64,7 +46,7 @@ export function encodeXML(input: string): string {
}

// Non-ASCII: encode as numeric entity (handle surrogate pair)
const cp = getCodePoint(input, index);
const cp = input.codePointAt(index)!;
out += `&#x${cp.toString(16)};`;
if (cp !== char) index++; // Skip trailing surrogate
last = index + 1;
Expand All @@ -86,75 +68,67 @@ export function encodeXML(input: string): string {
export const escape: typeof encodeXML = encodeXML;

/**
* Creates a function that escapes all characters matched by the given regular
* expression using the given map of characters to escape to their entities.
* @param regex Regular expression to match characters to escape.
* @param map Map of characters to escape to their entities.
* @returns Function that escapes all characters matched by the given regular
* expression using the given map of characters to escape to their entities.
* Replacement callback used by {@link escapeUTF8}, {@link escapeAttribute},
* and {@link escapeText} to map specific characters to their XML/HTML
* entity representations.
*
* Converts `"`, `&`, `'`, `<`, `>`, and non-breaking space (`\u00A0`)
* to their corresponding entities; returns all other characters unchanged.
* @param c Single character match from the respective escape RegExp.
*/
function getEscaper(
regex: RegExp,
map: Map<number, string>,
): (data: string) => string {
return function escape(data: string): string {
let match: RegExpExecArray | null;
let lastIndex = 0;
let result = "";

while ((match = regex.exec(data))) {
if (lastIndex !== match.index) {
result += data.substring(lastIndex, match.index);
}

// We know that this character will be in the map.
result += map.get(match[0].charCodeAt(0))!;

// Every match will be of length 1
lastIndex = match.index + 1;
function escapeReplacer(c: string): string {
switch (c) {
case '"': {
return "&quot;";
}

return result + data.substring(lastIndex);
};
case "&": {
return "&amp;";
}
case "'": {
return "&apos;";
}
case "<": {
return "&lt;";
}
case ">": {
return "&gt;";
}
case "\u00A0": {
return "&nbsp;";
}
}
return c;
}

const xmlEscapeRegex = /["&'<>]/g;
/**
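* Escapes the characters `"`, `&`, `'`, `<`, and `>` using XML entities;
* every other character is returned unchanged.
*
* Note that the output will be character-set dependent, because non-ASCII
* characters are passed through rather than encoded as numeric entities.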
* @param data String to escape.
*/
export const escapeUTF8: (data: string) => string = /* #__PURE__ */ getEscaper(
/["&'<>]/g,
xmlCodeMap,
);
export function escapeUTF8(data: string): string {
    // Each character matched by the XML escape pattern is handed to the
    // shared replacer, which returns the entity to substitute for it.
    const escaped = data.replace(xmlEscapeRegex, escapeReplacer);
    return escaped;
}

const attributeEscapeRegex = /["&\u00A0]/g;

/**
* Encodes all characters that have to be escaped in HTML attributes,
* following {@link https://html.spec.whatwg.org/multipage/parsing.html#escapingString}.
* @param data String to escape.
*/
export const escapeAttribute: (data: string) => string =
/* #__PURE__ */ getEscaper(
/["&\u00A0]/g,
new Map([
[34, "&quot;"],
[38, "&amp;"],
[160, "&nbsp;"],
]),
);
export function escapeAttribute(data: string): string {
    // Each character matched by the attribute escape pattern is handed to
    // the shared replacer, which returns the entity to substitute for it.
    const escaped = data.replace(attributeEscapeRegex, escapeReplacer);
    return escaped;
}

const textEscapeRegex = /[&<>\u00A0]/g;

/**
* Encodes all characters that have to be escaped in HTML text,
* following {@link https://html.spec.whatwg.org/multipage/parsing.html#escapingString}.
* @param data String to escape.
*/
export const escapeText: (data: string) => string = /* #__PURE__ */ getEscaper(
/[&<>\u00A0]/g,
new Map([
[38, "&amp;"],
[60, "&lt;"],
[62, "&gt;"],
[160, "&nbsp;"],
]),
);
export function escapeText(data: string): string {
    // Each character matched by the text escape pattern is handed to the
    // shared replacer, which returns the entity to substitute for it.
    const escaped = data.replace(textEscapeRegex, escapeReplacer);
    return escaped;
}
Comment on lines +110 to +134
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Short-input performance regression in new replace-based hot paths

The PR benchmark shows a measurable throughput drop (~19% geometric mean). The new callback-based `replace` path is likely the cause of the regression for short strings passed to escapeUTF8, escapeAttribute, and escapeText.

⚡ Proposed mitigation (fast no-match guard)
 const xmlEscapeRegex = /["&'<>]/g;
+const xmlEscapeCheckRegex = /["&'<>]/;
@@
 export function escapeUTF8(data: string): string {
-    return data.replace(xmlEscapeRegex, escapeReplacer);
+    return xmlEscapeCheckRegex.test(data)
+        ? data.replace(xmlEscapeRegex, escapeReplacer)
+        : data;
 }
 
 const attributeEscapeRegex = /["&\u00A0]/g;
+const attributeEscapeCheckRegex = /["&\u00A0]/;
@@
 export function escapeAttribute(data: string): string {
-    return data.replace(attributeEscapeRegex, escapeReplacer);
+    return attributeEscapeCheckRegex.test(data)
+        ? data.replace(attributeEscapeRegex, escapeReplacer)
+        : data;
 }
 
 const textEscapeRegex = /[&<>\u00A0]/g;
+const textEscapeCheckRegex = /[&<>\u00A0]/;
@@
 export function escapeText(data: string): string {
-    return data.replace(textEscapeRegex, escapeReplacer);
+    return textEscapeCheckRegex.test(data)
+        ? data.replace(textEscapeRegex, escapeReplacer)
+        : data;
 }
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@src/escape.ts` around lines 110 - 134, Add a fast no-match guard to avoid the
replace(callback) overhead for short/unchanged strings: in escapeUTF8,
escapeAttribute, and escapeText, check quickly for the presence of any
characters that would require escaping using simple string.indexOf/lookups (not
regex.test on a global regex) and return the original input early if none found,
then fall back to the existing return data.replace(..., escapeReplacer). For
escapeAttribute check for '"' '&' and '\u00A0'; for escapeText check for '&' '<'
'>' and '\u00A0'; for escapeUTF8 use the minimal set of characters that
xmlEscapeRegex would match. Ensure you do not call regex.test on a /g regex to
avoid stateful behavior.

Loading