Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 60 additions & 35 deletions scripts/write-encode-map.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,77 +2,102 @@ import { writeFileSync } from "node:fs";
import htmlMap from "../maps/entities.json" with { type: "json" };

/**
 * One node of the encode trie built by this script.
 *
 * Both fields are optional: a node may carry an entity name, a set of child
 * nodes keyed by number, or both.
 */
interface TrieNode {
    /** Child nodes reachable from this node; absent when there are none. */
    next?: Map<number, TrieNode> | undefined;
    /** Entity name stored at this node; absent when the node has no value. */
    value?: string | undefined;
}

const htmlTrie = getTrie(htmlMap);
const serialized = serializeTrieToString(htmlTrie);

/*
 * Drop the child maps of every ASCII entry (code points 0–127).
 *
 * The encoder's ASCII fast path is a flat array lookup that never looks at
 * children, so multi-char entities that begin with an ASCII char (such as
 * < + U+20D2 → &nvlt;) can never be reached. Dropping them makes the
 * serialized data smaller without changing behavior. ASCII leaf entries are
 * routed to a separate array by the parser via `asciiOut`.
 */
for (let codePoint = 0; codePoint < 128; codePoint++) {
    const entry = htmlTrie.get(codePoint);

    // Entries without children (or missing entries) are left untouched.
    if (!entry?.next) continue;

    if (entry.value) {
        // Keep the entity name, discard the unreachable children.
        htmlTrie.set(codePoint, { value: entry.value });
    } else {
        // The node only existed as a prefix; nothing remains worth keeping.
        htmlTrie.delete(codePoint);
    }
}

const serialized = serializeTrie(htmlTrie);

// Emit the generated module. The target path is resolved relative to this
// script's own URL (`import.meta.url`) rather than the current working
// directory. The serialized trie is embedded as a JSON string literal via
// `JSON.stringify`.
// NOTE(review): the template below emits both a named `htmlTrie` export (via
// `parseEncodeTrie`) and a default export of the raw serialized string, and
// its header comment describes base-36 diffs with an `&name;` form — confirm
// both exports are intended and that the header matches the serializer.
writeFileSync(
new URL("../src/generated/encode-html.ts", import.meta.url),
`// Generated using scripts/write-encode-map.ts
// This file contains a compact, single-string serialization of the HTML encode trie.
// Format per entry (sequence in ascending code point order using diff encoding):
// <diffBase36>[&name;][{<children>}] -- diff omitted when 0.
// "&name;" gives the entity value for the node. A following { starts a nested sub-map.
// Diffs use the same scheme as before: diff = currentKey - previousKey - 1, first entry stores key.

import {
type EncodeTrieNode,
parseEncodeTrie,
} from "../internal/encode-shared.js";

/** Compact serialized HTML encode trie (intended to stay small & JS engine friendly) */
export const htmlTrie: Map<number, EncodeTrieNode> =
/* #__PURE__ */ parseEncodeTrie(
${JSON.stringify(serialized)},
);
export default ${JSON.stringify(serialized)};
`,
);

console.log("Done!");
console.log(`Done! Data: ${serialized.length} chars`);

/**
 * Build the trie keyed by full Unicode code points (not UTF-16 char codes).
 *
 * This means astral characters (e.g. math script letters like 𝒜 = U+1D49C)
 * are stored as flat entries at their code point, instead of as children of
 * the high surrogate (U+D835). This eliminates the large D835 surrogate
 * block and reduces the serialized data size significantly.
 *
 * When several entity names decode to the same sequence, the shortest name
 * is kept; among names of equal length the first one encountered wins.
 * Entries whose decoded string is empty are skipped, since there is no code
 * point to key them on.
 * @param map Entity name → decoded string, as found in `maps/entities.json`.
 * @returns The root level of the trie, keyed by the first code point of each
 * decoded sequence.
 */
function getTrie(map: Record<string, string>): Map<number, TrieNode> {
    const trie = new Map<number, TrieNode>();

    for (const [entity, decoded] of Object.entries(map)) {
        // String iteration walks code points, so surrogate pairs stay together.
        const codePoints: number[] = [];
        for (const char of decoded) {
            const cp = char.codePointAt(0);
            if (cp !== undefined) codePoints.push(cp);
        }

        // The final code point carries the value; everything before it is a prefix.
        const lastCP = codePoints.pop();
        if (lastCP === undefined) continue;

        // Walk all code points except the last one, creating intermediate nodes.
        let lastMap = trie;
        for (const cp of codePoints) {
            const next = lastMap.get(cp) ?? {};
            lastMap.set(cp, next);
            lastMap = next.next ??= new Map<number, TrieNode>();
        }

        // Set the value on the final code point, preferring the shortest name.
        const leaf = lastMap.get(lastCP) ?? {};
        if (!leaf.value || entity.length < leaf.value.length) {
            leaf.value = entity;
        }
        lastMap.set(lastCP, leaf);
    }

    return trie;
}

function serializeTrieToString(trie: Map<number, TrieNode>): string {
function serializeTrie(trie: Map<number, TrieNode>): string {
// @ts-expect-error `toSorted` requires a lib bump.
const entries = [...trie.entries()].toSorted((a, b) => a[0] - b[0]);
const entries = [...trie.entries()].toSorted(
(a: [number, TrieNode], b: [number, TrieNode]) => a[0] - b[0],
);
let out = "";
let lastKey = -1;
for (const [key, node] of entries) {
if (lastKey === -1) {
out += key.toString(36);
out += key.toString(10);
} else {
const diff = key - lastKey - 1;
if (diff !== 0) out += diff.toString(36);
if (diff !== 0) out += diff.toString(10);
}
if (node.value) out += `&${node.value};`;
if (node.value) out += `${node.value};`;
if (node.next) {
out += `{${serializeTrieToString(node.next)}}`;
} else if (!node.value) {
throw new Error("Invalid node: neither value nor next");
out += `{${serializeTrie(node.next)}}`;
}
lastKey = key;
}
Expand Down
4 changes: 3 additions & 1 deletion src/decode.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@ describe("Decode test", () => {
});

describe("EntityDecoder", () => {
let callback: ReturnType<typeof vi.fn<(cp: number, consumed: number) => void>>;
let callback: ReturnType<
typeof vi.fn<(cp: number, consumed: number) => void>
>;
let decoder: entities.EntityDecoder;

beforeEach(() => {
Expand Down
12 changes: 6 additions & 6 deletions src/encode.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ describe("Encode->decode test", () => {
const testcases = [
{
input: "asdf & ÿ ü '",
xml: "asdf &amp; &#xff; &#xfc; &apos;",
xml: "asdf &amp; &#255; &#252; &apos;",
html: "asdf &amp; &yuml; &uuml; &apos;",
},
{
Expand Down Expand Up @@ -39,7 +39,7 @@ describe("Encode->decode test", () => {

it("should encode emojis", () =>
expect(entities.encodeHTML("😄🍾🥳💥😇")).toBe(
"&#x1f604;&#x1f37e;&#x1f973;&#x1f4a5;&#x1f607;",
"&#128516;&#127870;&#129395;&#128165;&#128519;",
));

it("should encode data URIs (issue #16)", () => {
Expand All @@ -58,10 +58,10 @@ describe("Encode->decode test", () => {
});

it("should encode trailing parts of entities", () =>
expect(entities.encodeHTML("\uD835")).toBe("&#xd835;"));
expect(entities.encodeHTML("\uD835")).toBe("&#55349;"));

it("should encode surrogate pair with first surrogate equivalent of entity, without corresponding entity", () =>
expect(entities.encodeHTML("\u{1D4A4}")).toBe("&#x1d4a4;"));
expect(entities.encodeHTML("\u{1D4A4}")).toBe("&#119972;"));
});

describe("encodeNonAsciiHTML", () => {
Expand All @@ -72,11 +72,11 @@ describe("encodeNonAsciiHTML", () => {

it("should encode emojis", () =>
expect(entities.encodeNonAsciiHTML("😄🍾🥳💥😇")).toBe(
"&#x1f604;&#x1f37e;&#x1f973;&#x1f4a5;&#x1f607;",
"&#128516;&#127870;&#129395;&#128165;&#128519;",
));

it("should encode chars above surrogates", () =>
expect(entities.encodeNonAsciiHTML("♒️♓️♈️♉️♊️♋️♌️♍️♎️♏️♐️♑️")).toBe(
"&#x2652;&#xfe0f;&#x2653;&#xfe0f;&#x2648;&#xfe0f;&#x2649;&#xfe0f;&#x264a;&#xfe0f;&#x264b;&#xfe0f;&#x264c;&#xfe0f;&#x264d;&#xfe0f;&#x264e;&#xfe0f;&#x264f;&#xfe0f;&#x2650;&#xfe0f;&#x2651;&#xfe0f;",
"&#9810;&#65039;&#9811;&#65039;&#9800;&#65039;&#9801;&#65039;&#9802;&#65039;&#9803;&#65039;&#9804;&#65039;&#9805;&#65039;&#9806;&#65039;&#9807;&#65039;&#9808;&#65039;&#9809;&#65039;",
));
});
Loading
Loading