Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 42 additions & 33 deletions scripts/write-encode-map.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,70 +9,79 @@ interface TrieNode {
}

const htmlTrie = getTrie(htmlMap);
const serialized = serializeTrieToString(htmlTrie);

const serialized = serializeTrie(htmlTrie);

writeFileSync(
new URL("../src/generated/encode-html.ts", import.meta.url),
`// Generated using scripts/write-encode-map.ts
// This file contains a compact, single-string serialization of the HTML encode trie.
// Format per entry (sequence in ascending code point order using diff encoding):
// <diffBase36>[&name;][{<children>}] -- diff omitted when 0.
// "&name;" gives the entity value for the node. A following { starts a nested sub-map.
// Diffs use the same scheme as before: diff = currentKey - previousKey - 1, first entry stores key.

import {
type EncodeTrieNode,
parseEncodeTrie,
} from "../internal/encode-shared.js";

/** Compact serialized HTML encode trie (intended to stay small & JS engine friendly) */
export const htmlTrie: Map<number, EncodeTrieNode> =
/* #__PURE__ */ parseEncodeTrie(
${JSON.stringify(serialized)},
);
export default ${JSON.stringify(serialized)};
`,
);

console.log("Done!");
console.log(`Done! Data: ${serialized.length} chars`);

/**
* Build the trie keyed by full Unicode code points (not UTF-16 char codes).
*
* This means astral characters (e.g. math script letters like 𝒜 = U+1D49C)
* are stored as flat entries at their code point, instead of as children of
* the high surrogate (U+D835). This eliminates the large D835 surrogate
* block and reduces the serialized data size significantly.
* @param map Entity names mapped to the string each entity decodes to.
*/
function getTrie(map: Record<string, string>): Map<number, TrieNode> {
const trie = new Map<number, TrieNode>();

for (const entity of Object.keys(map)) {
const decoded = map[entity];
// Resolve the key
let lastMap = trie;
for (let index = 0; index < decoded.length - 1; index++) {
const char = decoded.charCodeAt(index);
const next = lastMap.get(char) ?? {};
lastMap.set(char, next);

// Walk all code points except the last one, creating intermediate nodes.
let index = 0;
while (index < decoded.length) {
const cp = decoded.codePointAt(index)!;
const cpLength = cp > 0xff_ff ? 2 : 1;

// Check if this is the last code point in the sequence.
if (index + cpLength >= decoded.length) break;

const next = lastMap.get(cp) ?? {};
lastMap.set(cp, next);
lastMap = next.next ??= new Map();
index += cpLength;
}

// Set the value on the final code point.
const lastCP = decoded.codePointAt(index)!;
const value = lastMap.get(lastCP) ?? {};
if (!value.value || entity.length < value.value.length) {
value.value = entity;
}
const value = lastMap.get(decoded.charCodeAt(decoded.length - 1)) ?? {};
value.value ??= entity;
lastMap.set(decoded.charCodeAt(decoded.length - 1), value);
lastMap.set(lastCP, value);
}

return trie;
}

function serializeTrieToString(trie: Map<number, TrieNode>): string {
function serializeTrie(trie: Map<number, TrieNode>): string {
// @ts-expect-error `toSorted` requires a lib bump.
const entries = [...trie.entries()].toSorted((a, b) => a[0] - b[0]);
const entries = [...trie.entries()].toSorted(
(a: [number, TrieNode], b: [number, TrieNode]) => a[0] - b[0],
);
let out = "";
let lastKey = -1;
for (const [key, node] of entries) {
if (lastKey === -1) {
out += key.toString(36);
out += key.toString(10);
} else {
const diff = key - lastKey - 1;
if (diff !== 0) out += diff.toString(36);
if (diff !== 0) out += diff.toString(10);
}
if (node.value) out += `&${node.value};`;
if (node.value) out += `${node.value};`;
if (node.next) {
out += `{${serializeTrieToString(node.next)}}`;
} else if (!node.value) {
throw new Error("Invalid node: neither value nor next");
out += `{${serializeTrie(node.next)}}`;
}
lastKey = key;
}
Expand Down
4 changes: 3 additions & 1 deletion src/decode.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@ describe("Decode test", () => {
});

describe("EntityDecoder", () => {
let callback: ReturnType<typeof vi.fn<(cp: number, consumed: number) => void>>;
let callback: ReturnType<
typeof vi.fn<(cp: number, consumed: number) => void>
>;
let decoder: entities.EntityDecoder;

beforeEach(() => {
Expand Down
32 changes: 26 additions & 6 deletions src/encode.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ describe("Encode->decode test", () => {
const testcases = [
{
input: "asdf & ÿ ü '",
xml: "asdf &amp; &#xff; &#xfc; &apos;",
xml: "asdf &amp; &#255; &#252; &apos;",
html: "asdf &amp; &yuml; &uuml; &apos;",
},
{
Expand Down Expand Up @@ -39,7 +39,7 @@ describe("Encode->decode test", () => {

it("should encode emojis", () =>
expect(entities.encodeHTML("😄🍾🥳💥😇")).toBe(
"&#x1f604;&#x1f37e;&#x1f973;&#x1f4a5;&#x1f607;",
"&#128516;&#127870;&#129395;&#128165;&#128519;",
));

it("should encode data URIs (issue #16)", () => {
Expand All @@ -58,10 +58,30 @@ describe("Encode->decode test", () => {
});

it("should encode trailing parts of entities", () =>
expect(entities.encodeHTML("\uD835")).toBe("&#xd835;"));
expect(entities.encodeHTML("\uD835")).toBe("&#55349;"));

it("should encode surrogate pair with first surrogate equivalent of entity, without corresponding entity", () =>
expect(entities.encodeHTML("\u{1D4A4}")).toBe("&#x1d4a4;"));
expect(entities.encodeHTML("\u{1D4A4}")).toBe("&#119972;"));
});

// Covers inputs made of an ASCII character followed by a combining code point.
// The expectations below require the pair to be emitted as one named entity,
// while the ASCII character on its own, or followed by a different character,
// keeps its usual single-character encoding.
describe("multi-code-point entities with ASCII starters", () => {
// Two-code-point sequences that are expected to collapse into one entity.
it("should encode < + U+20D2 as &nvlt;", () =>
expect(entities.encodeHTML("<\u20D2")).toBe("&nvlt;"));

it("should encode > + U+20D2 as &nvgt;", () =>
expect(entities.encodeHTML(">\u20D2")).toBe("&nvgt;"));

it("should encode = + U+20E5 as &bne;", () =>
expect(entities.encodeHTML("=\u20E5")).toBe("&bne;"));

// The ASCII starter on its own must still produce its single-character entity.
it("should still encode < alone as &lt;", () =>
expect(entities.encodeHTML("<")).toBe("&lt;"));

it("should still encode > alone as &gt;", () =>
expect(entities.encodeHTML(">")).toBe("&gt;"));

// Per the expectation below, `<` + U+20D3 is not treated as one entity:
// `<` becomes &lt; and U+20D3 becomes a decimal reference (0x20D3 = 8403).
it("should encode < followed by unrelated char as &lt; + numeric", () =>
expect(entities.encodeHTML("<\u20D3")).toBe("&lt;&#8403;"));
});

describe("encodeNonAsciiHTML", () => {
Expand All @@ -72,11 +92,11 @@ describe("encodeNonAsciiHTML", () => {

it("should encode emojis", () =>
expect(entities.encodeNonAsciiHTML("😄🍾🥳💥😇")).toBe(
"&#x1f604;&#x1f37e;&#x1f973;&#x1f4a5;&#x1f607;",
"&#128516;&#127870;&#129395;&#128165;&#128519;",
));

it("should encode chars above surrogates", () =>
expect(entities.encodeNonAsciiHTML("♒️♓️♈️♉️♊️♋️♌️♍️♎️♏️♐️♑️")).toBe(
"&#x2652;&#xfe0f;&#x2653;&#xfe0f;&#x2648;&#xfe0f;&#x2649;&#xfe0f;&#x264a;&#xfe0f;&#x264b;&#xfe0f;&#x264c;&#xfe0f;&#x264d;&#xfe0f;&#x264e;&#xfe0f;&#x264f;&#xfe0f;&#x2650;&#xfe0f;&#x2651;&#xfe0f;",
"&#9810;&#65039;&#9811;&#65039;&#9800;&#65039;&#9801;&#65039;&#9802;&#65039;&#9803;&#65039;&#9804;&#65039;&#9805;&#65039;&#9806;&#65039;&#9807;&#65039;&#9808;&#65039;&#9809;&#65039;",
));
});
Loading
Loading