diff --git a/scripts/write-encode-map.ts b/scripts/write-encode-map.ts index 7044a273..3d62811b 100644 --- a/scripts/write-encode-map.ts +++ b/scripts/write-encode-map.ts @@ -9,70 +9,79 @@ interface TrieNode { } const htmlTrie = getTrie(htmlMap); -const serialized = serializeTrieToString(htmlTrie); + +const serialized = serializeTrie(htmlTrie); writeFileSync( new URL("../src/generated/encode-html.ts", import.meta.url), `// Generated using scripts/write-encode-map.ts -// This file contains a compact, single-string serialization of the HTML encode trie. -// Format per entry (sequence in ascending code point order using diff encoding): -// [&name;][{}] -- diff omitted when 0. -// "&name;" gives the entity value for the node. A following { starts a nested sub-map. -// Diffs use the same scheme as before: diff = currentKey - previousKey - 1, first entry stores key. - -import { - type EncodeTrieNode, - parseEncodeTrie, - } from "../internal/encode-shared.js"; -/** Compact serialized HTML encode trie (intended to stay small & JS engine friendly) */ -export const htmlTrie: Map = - /* #__PURE__ */ parseEncodeTrie( - ${JSON.stringify(serialized)}, - ); +export default ${JSON.stringify(serialized)}; `, ); -console.log("Done!"); +console.log(`Done! Data: ${serialized.length} chars`); +/** + * Build the trie keyed by full Unicode code points (not UTF-16 char codes). + * + * This means astral characters (e.g. math script letters like 𝒜 = U+1D49C) + * are stored as flat entries at their code point, instead of as children of + * the high surrogate (U+D835). This eliminates the large D835 surrogate + * block and reduces the serialized data size significantly.
+ * @param map + */ function getTrie(map: Record): Map { const trie = new Map(); for (const entity of Object.keys(map)) { const decoded = map[entity]; - // Resolve the key let lastMap = trie; - for (let index = 0; index < decoded.length - 1; index++) { - const char = decoded.charCodeAt(index); - const next = lastMap.get(char) ?? {}; - lastMap.set(char, next); + + // Walk all code points except the last one, creating intermediate nodes. + let index = 0; + while (index < decoded.length) { + const cp = decoded.codePointAt(index)!; + const cpLength = cp > 0xff_ff ? 2 : 1; + + // Check if this is the last code point in the sequence. + if (index + cpLength >= decoded.length) break; + + const next = lastMap.get(cp) ?? {}; + lastMap.set(cp, next); lastMap = next.next ??= new Map(); + index += cpLength; + } + + // Set the value on the final code point. + const lastCP = decoded.codePointAt(index)!; + const value = lastMap.get(lastCP) ?? {}; + if (!value.value || entity.length < value.value.length) { + value.value = entity; } - const value = lastMap.get(decoded.charCodeAt(decoded.length - 1)) ?? {}; - value.value ??= entity; - lastMap.set(decoded.charCodeAt(decoded.length - 1), value); + lastMap.set(lastCP, value); } return trie; } -function serializeTrieToString(trie: Map): string { +function serializeTrie(trie: Map): string { // @ts-expect-error `toSorted` requires a lib bump. 
- const entries = [...trie.entries()].toSorted((a, b) => a[0] - b[0]); + const entries = [...trie.entries()].toSorted( + (a: [number, TrieNode], b: [number, TrieNode]) => a[0] - b[0], + ); let out = ""; let lastKey = -1; for (const [key, node] of entries) { if (lastKey === -1) { - out += key.toString(36); + out += key.toString(10); } else { const diff = key - lastKey - 1; - if (diff !== 0) out += diff.toString(36); + if (diff !== 0) out += diff.toString(10); } - if (node.value) out += `&${node.value};`; + if (node.value) out += `${node.value};`; if (node.next) { - out += `{${serializeTrieToString(node.next)}}`; - } else if (!node.value) { - throw new Error("Invalid node: neither value nor next"); + out += `{${serializeTrie(node.next)}}`; } lastKey = key; } diff --git a/src/decode.spec.ts b/src/decode.spec.ts index 64e2ef03..18888eba 100644 --- a/src/decode.spec.ts +++ b/src/decode.spec.ts @@ -71,7 +71,9 @@ describe("Decode test", () => { }); describe("EntityDecoder", () => { - let callback: ReturnType void>>; + let callback: ReturnType< + typeof vi.fn<(cp: number, consumed: number) => void> + >; let decoder: entities.EntityDecoder; beforeEach(() => { diff --git a/src/encode.spec.ts b/src/encode.spec.ts index 7c1dfdef..d85df1ee 100644 --- a/src/encode.spec.ts +++ b/src/encode.spec.ts @@ -5,7 +5,7 @@ describe("Encode->decode test", () => { const testcases = [ { input: "asdf & ΓΏ ΓΌ '", - xml: "asdf & ÿ ü '", + xml: "asdf & ÿ ü '", html: "asdf & ÿ ü '", }, { @@ -39,7 +39,7 @@ describe("Encode->decode test", () => { it("should encode emojis", () => expect(entities.encodeHTML("πŸ˜„πŸΎπŸ₯³πŸ’₯πŸ˜‡")).toBe( - "😄🍾🥳💥😇", + "😄🍾🥳💥😇", )); it("should encode data URIs (issue #16)", () => { @@ -58,10 +58,30 @@ describe("Encode->decode test", () => { }); it("should encode trailing parts of entities", () => - expect(entities.encodeHTML("\uD835")).toBe("�")); + expect(entities.encodeHTML("\uD835")).toBe("�")); it("should encode surrogate pair with first surrogate equivalent of 
entity, without corresponding entity", () => - expect(entities.encodeHTML("\u{1D4A4}")).toBe("𝒤")); + expect(entities.encodeHTML("\u{1D4A4}")).toBe("𝒤")); +}); + +describe("multi-code-point entities with ASCII starters", () => { + it("should encode < + U+20D2 as <⃒", () => + expect(entities.encodeHTML("<\u20D2")).toBe("<⃒")); + + it("should encode > + U+20D2 as >⃒", () => + expect(entities.encodeHTML(">\u20D2")).toBe(">⃒")); + + it("should encode = + U+20E5 as =⃥", () => + expect(entities.encodeHTML("=\u20E5")).toBe("=⃥")); + + it("should still encode < alone as <", () => + expect(entities.encodeHTML("<")).toBe("<")); + + it("should still encode > alone as >", () => + expect(entities.encodeHTML(">")).toBe(">")); + + it("should encode < followed by unrelated char as < + numeric", () => + expect(entities.encodeHTML("<\u20D3")).toBe("<⃓")); }); describe("encodeNonAsciiHTML", () => { @@ -72,11 +92,11 @@ describe("encodeNonAsciiHTML", () => { it("should encode emojis", () => expect(entities.encodeNonAsciiHTML("πŸ˜„πŸΎπŸ₯³πŸ’₯πŸ˜‡")).toBe( - "😄🍾🥳💥😇", + "😄🍾🥳💥😇", )); it("should encode chars above surrogates", () => expect(entities.encodeNonAsciiHTML("β™’οΈβ™“οΈβ™ˆοΈβ™‰οΈβ™ŠοΈβ™‹οΈβ™ŒοΈβ™οΈβ™ŽοΈβ™οΈβ™οΈβ™‘οΈ")).toBe( - "♒️♓️♈️♉️♊️♋️♌️♍️♎️♏️♐️♑️", + "♒️♓️♈️♉️♊️♋️♌️♍️♎️♏️♐️♑️", )); }); diff --git a/src/encode.ts b/src/encode.ts index 717c79f9..137ca566 100644 --- a/src/encode.ts +++ b/src/encode.ts @@ -1,18 +1,152 @@ -import { getCodePoint, XML_BITSET_VALUE } from "./escape.js"; -import { htmlTrie } from "./generated/encode-html.js"; +import { XML_BITSET_VALUE } from "./escape.js"; +import htmlTrieData from "./generated/encode-html.js"; /** - * We store the characters to consider as a compact bitset for fast lookups. + * A node inside the encoding trie used by `encode.ts`. + * + * There are two physical shapes to minimize allocations and lookup cost: + * + * 1. Leaf node (string) + * - A plain string (already in the form `"&name;"`). 
+ * - Represents a terminal match with no children. + * + * 2. Branch / value node (object) + */ +type EncodeTrieNode = + | string + | { value: string | null; next: Map }; + +/** + * Flat lookup table for ASCII entity values (code points 0–127). + * + * Built once at startup from the full trie. For ASCII characters that need + * encoding, a direct `asciiEntities[charCode]` array index is much faster + * than `Map.get(charCode)` because it avoids hashing and bucket lookup. + * + * A few ASCII characters also have multi-code-point children in the trie + * (e.g. `<` + U+20D2 → `&nvlt;`). The encoder checks the trie for those + * multi-char matches first, then falls back to this table for the + * single-char entity. + */ +const asciiEntities: (string | null)[] = []; + +const htmlTrie: Map = (() => { + /** + * Parse a compact encode trie string into a Map keyed by code point. + * + * The serialized format (produced by `scripts/write-encode-map.ts`) stores + * entries in ascending code-point order with delta encoding: + * + * [gap]name;[{children}] + * + * - `gap` is a base-10 integer giving `currentKey - previousKey - 1`. + * The very first entry stores the absolute key. A gap of 0 is omitted. + * - `name;` is the entity name (without `&` prefix) terminated by `;`. + * Because gaps use only digits `[0-9]` and entity names always start with + * a letter `[A-Za-z]`, no additional delimiter is needed. + * - `{…}` is an optional children block using the same scheme recursively. + * Children represent the second code unit of multi-character entities + * (e.g. `<` + U+20D2 → `&nvlt;`).
+ */ + const trie = new Map(); + const data = htmlTrieData; + let cursor = 0; + let lastKey = -1; + + function readGap(): number { + let value = 0; + let ch: number; + while ( + (ch = data.charCodeAt(cursor)) >= 48 /* '0' */ && + ch <= 57 /* '9' */ + ) { + value = value * 10 + ch - 48; + cursor++; + } + return value; + } + + function readEntity(): string { + const semi = data.indexOf(";", cursor); + const entity = `&${data.substring(cursor, semi)};`; + cursor = semi + 1; + return entity; + } + + const astralEntries: [number, string][] = []; + + while (cursor < data.length) { + lastKey += readGap() + 1; + + const entityValue = + data.charCodeAt(cursor) === 123 /* '{' */ ? null : readEntity(); + + if (data.charCodeAt(cursor) === 123 /* '{' */) { + cursor++; // Skip '{' + const next = new Map(); + let childKey = -1; + while (data.charCodeAt(cursor) !== 125 /* '}' */) { + childKey += readGap() + 1; + next.set(childKey, readEntity()); + } + trie.set(lastKey, { value: entityValue, next }); + cursor++; // Skip '}' + // Also populate the ASCII fast-path table for the single-char value. + if (lastKey < 0x80 && entityValue != null) { + asciiEntities[lastKey] = entityValue; + } + } else if (lastKey < 0x80) { + asciiEntities[lastKey] = entityValue; + } else if (lastKey > 0xff_ff) { + astralEntries.push([lastKey, entityValue!]); + } else { + trie.set(lastKey, entityValue!); + } + } + + /* + * Batch-insert astral entries as surrogate-pair trie nodes. + * Entries are sorted by code point, so same-high-surrogate groups + * are contiguous — no intermediate grouping Map needed.
+ */ + let astralIndex = 0; + while (astralIndex < astralEntries.length) { + const hi = + 0xd8_00 | ((astralEntries[astralIndex][0] - 0x1_00_00) >> 10); + const children: [number, string][] = []; + while ( + astralIndex < astralEntries.length && + (0xd8_00 | ((astralEntries[astralIndex][0] - 0x1_00_00) >> 10)) === + hi + ) { + const lo = + 0xdc_00 | + ((astralEntries[astralIndex][0] - 0x1_00_00) & 0x3_ff); + children.push([lo, astralEntries[astralIndex][1]]); + astralIndex++; + } + trie.set(hi, { value: null, next: new Map(children) }); + } + + return trie; +})(); + +/** + * Bitset covering ASCII code points 0–127. Each of the four 32-bit words + * covers a 32-code-point range. A set bit means "this character needs + * encoding" when used with `encodeHTML`. */ const HTML_BITSET = /* #__PURE__ */ new Uint32Array([ - 0x16_00, // Bits for 09,0A,0C - 0xfc_00_ff_fe, // 32..63 -> 21-2D (minus space), 2E,2F,3A-3F - 0xf8_00_00_01, // 64..95 -> 40, 5B-5F - 0x38_00_00_01, // 96..127-> 60, 7B-7D + 0x16_00, // 09 (\t), 0A (\n), 0C (\f) + 0xfc_00_ff_fe, // 21-2D (!-.), 2E (.), 2F (/), 3A-3F (:;<=>?) + 0xf8_00_00_01, // 40 (@), 5B-5F ([\]^_) + 0x38_00_00_01, // 60 (`), 7B-7D ({|}) ]); const XML_BITSET = /* #__PURE__ */ new Uint32Array([0, XML_BITSET_VALUE, 0, 0]); +const numericReference = (cp: number) => `&#${cp};`; + /** * Encodes all characters in the input using HTML entities. This includes * characters that are valid ASCII characters in HTML documents, such as `#`. @@ -21,8 +155,8 @@ const XML_BITSET = /* #__PURE__ */ new Uint32Array([0, XML_BITSET_VALUE, 0, 0]); * function, which will only encode characters that are not valid in HTML * documents, as well as non-ASCII characters. * - * If a character has no equivalent entity, a numeric hexadecimal reference - * (eg. `ü`) will be used. + * If a character has no equivalent entity, a numeric decimal reference + * (eg. `ü`) will be used. * @param input Input string to encode or decode. 
*/ export function encodeHTML(input: string): string { @@ -33,8 +167,8 @@ export function encodeHTML(input: string): string { * documents using HTML entities. This function will not encode characters that * are valid in HTML documents, such as `#`. * - * If a character has no equivalent entity, a numeric hexadecimal reference - * (eg. `ü`) will be used. + * If a character has no equivalent entity, a numeric decimal reference + * (eg. `ü`) will be used. * @param input Input string to encode or decode. */ export function encodeNonAsciiHTML(input: string): string { @@ -48,48 +182,66 @@ function encodeHTMLTrieRe(bitset: Uint32Array, input: string): string { for (let index = 0; index < length; index++) { const char = input.charCodeAt(index); - // Skip ASCII characters that don't need encoding + + /* + * Fast-skip ASCII characters that don't need encoding. + * The bitset has one bit per ASCII code point; a set bit means "encode". + */ if (char < 0x80 && !((bitset[char >>> 5] >>> char) & 1)) { continue; } - if (out === undefined) out = input.substring(0, index); + // Lazy-init: copy the prefix before the first character that needs encoding. + if (out == null) out = input.substring(0, index); else if (last !== index) out += input.substring(last, index); - let node = htmlTrie.get(char); - - if (typeof node === "object") { - if (index + 1 < length) { - const nextChar = input.charCodeAt(index + 1); - const value = - typeof node.next === "number" - ? node.next === nextChar - ? node.nextValue - : undefined - : node.next.get(nextChar); - - if (value !== undefined) { + if (char < 0x80) { + // ASCII: check for multi-code-point entity first (e.g. < + U+20D2 β†’ <⃒). 
+ const trieNode = htmlTrie.get(char); + if (typeof trieNode === "object" && index + 1 < length) { + const value = trieNode.next.get(input.charCodeAt(index + 1)); + if (value != null) { out += value; index++; last = index + 1; continue; } } - node = node.value; - } - - if (node === undefined) { - const cp = getCodePoint(input, index); - out += `&#x${cp.toString(16)};`; - if (cp !== char) index++; - last = index + 1; + // Fast path: direct array lookup for single-char entity. + const entity = asciiEntities[char]; + out += entity ?? numericReference(char); } else { - out += node; - last = index + 1; + // Non-ASCII: full trie lookup with multi-char entity support. + let node: EncodeTrieNode | undefined | null = htmlTrie.get(char); + + if (typeof node === "object") { + if (index + 1 < length) { + const value = node.next.get(input.charCodeAt(index + 1)); + if (value != null) { + out += value; + index++; + last = index + 1; + continue; + } + } + node = node.value; + } + + if (node == null) { + // No named entity exists; emit a decimal numeric reference. + const cp = input.codePointAt(index)!; + out += numericReference(cp); + // Astral code points consume two UTF-16 code units. + if (cp !== char) index++; + } else { + out += node; + } } + last = index + 1; } - if (out === undefined) return input; + // If nothing needed encoding, return the original string (avoids allocation). + if (out == null) return input; if (last < length) out += input.substr(last); return out; } diff --git a/src/escape.ts b/src/escape.ts index f7f68bca..e7d327f9 100644 --- a/src/escape.ts +++ b/src/escape.ts @@ -33,8 +33,8 @@ export const XML_BITSET_VALUE = 0x50_00_00_c4; // 32..63 -> 34 ("),38 (&),39 (') * Encodes all non-ASCII characters, as well as characters not valid in XML * documents using XML entities. Uses a fast bitset scan instead of RegExp. * - * If a character has no equivalent entity, a numeric hexadecimal reference - * (eg. `ü`) will be used. 
+ * If a character has no equivalent entity, a numeric decimal reference + * (eg. `&#252;`) will be used. * @param input Input string to encode or decode. */ export function encodeXML(input: string): string { @@ -64,8 +64,8 @@ export function encodeXML(input: string): string { } // Non-ASCII: encode as numeric entity (handle surrogate pair) - const cp = getCodePoint(input, index); - out += `&#x${cp.toString(16)};`; + const cp = input.codePointAt(index)!; + out += `&#${cp};`; if (cp !== char) index++; // Skip trailing surrogate last = index + 1; } @@ -77,7 +77,7 @@ export function encodeXML(input: string): string { /** * Encodes all non-ASCII characters, as well as characters not valid in XML - * documents using numeric hexadecimal reference (eg. `&#xfc;`). + * documents using numeric decimal reference (eg. `&#252;`). * * Have a look at `escapeUTF8` if you want a more concise output at the expense * of reduced transportability. diff --git a/src/generated/encode-html.ts b/src/generated/encode-html.ts index 97e4a61d..928b3f0d 100644 --- a/src/generated/encode-html.ts +++ b/src/generated/encode-html.ts @@ -1,18 +1,3 @@ // Generated using scripts/write-encode-map.ts -// This file contains a compact, single-string serialization of the HTML encode trie. -// Format per entry (sequence in ascending code point order using diff encoding): -// [&name;][{}] -- diff omitted when 0. -// "&name;" gives the entity value for the node. A following { starts a nested sub-map. -// Diffs use the same scheme as before: diff = currentKey - previousKey - 1, first entry stores key. -import { - type EncodeTrieNode, - parseEncodeTrie, -} from "../internal/encode-shared.js"; - -/** Compact serialized HTML encode trie (intended to stay small & JS engine friendly) */ -/** HTML entity encode trie.
*/ -export const htmlTrie: Map = - /* #__PURE__ */ parseEncodeTrie( - "9 m!"#$%&'()*+,1./a:;<{6he<⃒}={6hx=⃥}>{6he>⃒}?@q[\]^_`5{2yfj}k{|}y ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿĀāĂ㥹ĆćĈĉĊċČčĎďĐđĒē2ĖėĘęĚěĜĝĞğĠġĢ1ĤĥĦħĨĩĪī2ĮįİıIJijĴĵĶķĸĹĺĻļĽľĿŀŁłŃńŅņŇňʼnŊŋŌō2ŐőŒœŔŕŖŗŘřŚśŜŝŞşŠšŢţŤťŦŧŨũŪūŬŭŮůŰűŲųŴŵŶŷŸŹźŻżŽžjƒyƵ1rǵ1tȷ3yˆˇg˘˙˚˛˜˝1f̑3jΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡ1ΣΤΥΦΧΨΩ7αβγδεζηθικλμνξοπρςστυφχψω7ϑϒ2ϕϖ5Ϝϝiϰϱ3ϵ϶aЁЂЃЄЅІЇЈЉЊЋЌ1ЎЏАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдежзийклмнопрстуфхцчшщъыьэюя1ёђѓєѕіїјљњћќ1ўџ5gi    1    ​‌‍‎‏‐2–—―‖1‘’‚1“”„1†‡•2‥…9‰‱′″‴‵3‹›3‾2⁁1⁃⁄a⁏7⁗7 {6bu  }⁠⁡⁢⁣20€1a⃛⃜11ℂ2℅4ℊℋℌℍℎℏℐℑℒℓ1ℕ№℗℘ℙℚℛℜℝ℞3™1ℤ2℧ℨ℩2ℬℭ1ℯℰℱ1ℳℴℵℶℷℸcⅅⅆⅇⅈa⅓⅔⅕⅖⅗⅘⅙⅚⅛⅜⅝⅞1d←↑→↓↔↕↖↗↘↙↚↛1↝{mw↝̸}↞↟↠↡↢↣↤↥↦↧1↩↪↫↬↭↮1↰↱↲↳1↵↶↷2↺↻↼↽↾↿⇀⇁⇂⇃⇄⇅⇆⇇⇈⇉⇊⇋⇌⇍⇎⇏⇐⇑⇒⇓⇔⇕⇖⇗⇘⇙⇚⇛1⇝6⇤⇥f⇵7⇽⇾⇿∀∁∂{mw∂̸}∃∄∅1∇∈∉1∋∌2∏∐∑−∓∔1∖∗∘1√2∝∞∟∠{6he∠⃒}∡∢∣∤∥∦∧∨∩{1e68∩︀}∪{1e68∪︀}∫∬∭∮∯∰∱∲∳∴∵∶∷∸1∺∻∼{6he∼⃒}∽{mp∽̱}∾{mr∾̳}∿≀≁≂{mw≂̸}≃≄≅≆≇≈≉≊≋{mw≋̸}≌≍{6he≍⃒}≎{mw≎̸}≏{mw≏̸}≐{mw≐̸}≑≒≓≔≕≖≗1≙≚1≜2≟≠≡{6hx≡⃥}≢1≤{6he≤⃒}≥{6he≥⃒}≦{mw≦̸}≧{mw≧̸}≨{1e68≨︀}≩{1e68≩︀}≪{mw≪̸5uh≪⃒}≫{mw≫̸5uh≫⃒}≬≭≮≯≰≱≲≳≴≵≶≷≸≹≺≻≼≽≾≿{mw≿̸}⊀⊁⊂{6he⊂⃒}⊃{6he⊃⃒}⊄⊅⊆⊇⊈⊉⊊{1e68⊊︀}⊋{1e68⊋︀}1⊍⊎⊏{mw⊏̸}⊐{mw⊐̸}⊑⊒⊓{1e68⊓︀}⊔{1e68⊔︀}⊕⊖⊗⊘⊙⊚⊛1⊝⊞⊟⊠⊡⊢⊣⊤⊥1⊧⊨⊩⊪⊫⊬⊭⊮⊯⊰1⊲⊳⊴{6he⊴⃒}⊵{6he⊵⃒}⊶⊷⊸⊹⊺⊻1⊽⊾⊿⋀⋁⋂⋃⋄⋅⋆⋇⋈⋉⋊⋋⋌⋍⋎⋏⋐⋑⋒⋓⋔⋕⋖⋗⋘{mw⋘̸}⋙{mw⋙̸}⋚{1e68⋚︀}⋛{1e68⋛︀}2⋞⋟⋠⋡⋢⋣2⋦⋧⋨⋩⋪⋫⋬⋭⋮⋯⋰⋱⋲⋳⋴⋵{mw⋵̸}⋶⋷1⋹{mw⋹̸}⋺⋻⋼⋽⋾6⌅⌆1⌈⌉⌊⌋⌌⌍⌎⌏⌐1⌒⌓1⌕⌖5⌜⌝⌞⌟2⌢⌣9⌭⌮7⌶6⌽1⌿1o⍼1f⎰⎱2⎴⎵⎶11⏜⏝⏞⏟2⏢4⏧1n␣4kⓈ1j─1│9┌3┐3└3┘3├7┤7┬7┴7┼j═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬j▀3▄3█8░▒▓d□8▪▫1▭▮2▱1△▴▵2▸▹3▽▾▿2◂◃6◊○w◬2◯8◸◹◺◻◼8★☆7☎1d♀1♂t♠2♣1♥♦3♪2♭♮♯4j✓3✗8✠l✶x❘p❲❳2c⟈⟉s⟦⟧⟨⟩⟪⟫⟬⟭7⟵⟶⟷⟸⟹⟺1⟼2⟿76⤂⤃⤄⤅6⤌⤍⤎⤏⤐⤑⤒⤓2⤖2⤙⤚⤛⤜⤝⤞⤟⤠2⤣⤤⤥⤦⤧⤨⤩⤪8⤳{mw⤳̸}1⤵⤶⤷⤸⤹2⤼⤽7⥅2⥈⥉⥊⥋2⥎⥏⥐⥑⥒⥓⥔⥕⥖⥗⥘⥙⥚⥛⥜⥝⥞⥟⥠⥡⥢⥣⥤⥥⥦⥧⥨⥩⥪⥫⥬⥭⥮⥯⥰⥱⥲⥳⥴⥵⥶1⥸⥹1⥻⥼⥽⥾⥿5⦅⦆4⦋⦌⦍⦎⦏⦐⦑⦒⦓⦔⦕⦖3⦚1⦜⦝6⦤⦥⦦⦧⦨⦩⦪⦫⦬⦭⦮⦯⦰⦱⦲⦳⦴⦵⦶⦷1⦹1⦻⦼1⦾⦿⧀⧁⧂⧃⧄⧅3⧉3⧍⧎⧏{mw⧏̸}⧐{mw⧐̸}b⧜⧝⧞4⧣⧤⧥5⧫8⧴1⧶9⨀⨁⨂1⨄1⨆5⨌⨍2⨐⨑⨒⨓⨔⨕⨖⨗a⨢⨣⨤⨥⨦⨧1⨩⨪2⨭⨮⨯⨰⨱1⨳⨴⨵⨶⨷⨸⨹⨺⨻⨼2⨿⩀1⩂⩃⩄⩅⩆⩇⩈⩉⩊⩋⩌⩍2⩐2⩓⩔⩕⩖⩗⩘1⩚⩛⩜⩝1⩟6⩦3⩪2⩭{mw⩭̸}⩮⩯⩰{mw⩰̸}⩱⩲⩳⩴⩵1⩷⩸⩹⩺⩻⩼⩽{mw⩽̸}⩾{mw⩾̸}⩿⪀⪁⪂⪃⪄⪅⪆⪇⪈⪉⪊⪋⪌⪍⪎⪏⪐⪑⪒⪓⪔⪕⪖⪗⪘⪙⪚2⪝⪞⪟⪠⪡{mw⪡̸}⪢{mw⪢̸}1⪤⪥⪦⪧⪨⪩⪪⪫⪬{1e68⪬︀}⪭{1e68⪭︀}⪮⪯{mw⪯̸}⪰{mw⪰̸}2⪳⪴⪵⪶⪷⪸⪹⪺⪻⪼⪽⪾⪿⫀⫁⫂⫃⫄⫅{mw⫅̸}⫆{mw⫆̸}⫇⫈2⫋{1e68⫋︀}⫌{1e68⫌︀
}2⫏⫐⫑⫒⫓⫔⫕⫖⫗⫘⫙⫚⫛8⫤1⫦⫧⫨⫩1⫫⫬⫭⫮⫯⫰⫱⫲⫳9⫽{6hx⫽⃥}y7r{17ks𝒜1𝒞𝒟2𝒢2𝒥𝒦2𝒩𝒪𝒫𝒬1𝒮𝒯𝒰𝒱𝒲𝒳𝒴𝒵𝒶𝒷𝒸𝒹1𝒻1𝒽𝒾𝒿𝓀𝓁𝓂𝓃1𝓅𝓆𝓇𝓈𝓉𝓊𝓋𝓌𝓍𝓎𝓏1g𝔄𝔅1𝔇𝔈𝔉𝔊2𝔍𝔎𝔏𝔐𝔑𝔒𝔓𝔔1𝔖𝔗𝔘𝔙𝔚𝔛𝔜1𝔞𝔟𝔠𝔡𝔢𝔣𝔤𝔥𝔦𝔧𝔨𝔩𝔪𝔫𝔬𝔭𝔮𝔯𝔰𝔱𝔲𝔳𝔴𝔵𝔶𝔷𝔸𝔹1𝔻𝔼𝔽𝔾1𝕀𝕁𝕂𝕃𝕄1𝕆3𝕊𝕋𝕌𝕍𝕎𝕏𝕐1𝕒𝕓𝕔𝕕𝕖𝕗𝕘𝕙𝕚𝕛𝕜𝕝𝕞𝕟𝕠𝕡𝕢𝕣𝕤𝕥𝕦𝕧𝕨𝕩𝕪𝕫}6vefffiflffiffl", - ); +export default "9Tab;NewLine;22excl;quot;num;dollar;percnt;amp;apos;lpar;rpar;ast;plus;comma;1period;sol;10colon;semi;lt;{8402nvlt;}equals;{8421bne;}gt;{8402nvgt;}quest;commat;26lsqb;bsol;rsqb;Hat;lowbar;grave;5{106fjlig;}20lcub;vert;rcub;34nbsp;iexcl;cent;pound;curren;yen;brvbar;sect;die;copy;ordf;laquo;not;shy;reg;macr;deg;pm;sup2;sup3;acute;micro;para;middot;cedil;sup1;ordm;raquo;frac14;half;frac34;iquest;Agrave;Aacute;Acirc;Atilde;Auml;angst;AElig;Ccedil;Egrave;Eacute;Ecirc;Euml;Igrave;Iacute;Icirc;Iuml;ETH;Ntilde;Ograve;Oacute;Ocirc;Otilde;Ouml;times;Oslash;Ugrave;Uacute;Ucirc;Uuml;Yacute;THORN;szlig;agrave;aacute;acirc;atilde;auml;aring;aelig;ccedil;egrave;eacute;ecirc;euml;igrave;iacute;icirc;iuml;eth;ntilde;ograve;oacute;ocirc;otilde;ouml;div;oslash;ugrave;uacute;ucirc;uuml;yacute;thorn;yuml;Amacr;amacr;Abreve;abreve;Aogon;aogon;Cacute;cacute;Ccirc;ccirc;Cdot;cdot;Ccaron;ccaron;Dcaron;dcaron;Dstrok;dstrok;Emacr;emacr;2Edot;edot;Eogon;eogon;Ecaron;ecaron;Gcirc;gcirc;Gbreve;gbreve;Gdot;gdot;Gcedil;1Hcirc;hcirc;Hstrok;hstrok;Itilde;itilde;Imacr;imacr;2Iogon;iogon;Idot;imath;IJlig;ijlig;Jcirc;jcirc;Kcedil;kcedil;kgreen;Lacute;lacute;Lcedil;lcedil;Lcaron;lcaron;Lmidot;lmidot;Lstrok;lstrok;Nacute;nacute;Ncedil;ncedil;Ncaron;ncaron;napos;ENG;eng;Omacr;omacr;2Odblac;odblac;OElig;oelig;Racute;racute;Rcedil;rcedil;Rcaron;rcaron;Sacute;sacute;Scirc;scirc;Scedil;scedil;Scaron;scaron;Tcedil;tcedil;Tcaron;tcaron;Tstrok;tstrok;Utilde;utilde;Umacr;umacr;Ubreve;ubreve;Uring;uring;Udblac;udblac;Uogon;uogon;Wcirc;wcirc;Ycirc;ycirc;Yuml;Zacute;zacute;Zdot;zdot;Zcaron;zcaron;19fnof;34imped;63gacute;65jmath;142circ;caron;16breve;dot;ring;ogon;tilde;dblac;51DownBreve;127Alpha;Beta;Gamma;Delta;Epsilon;Zeta;Eta;Theta;Iota;Kappa;Lambda;Mu;Nu;Xi;Omicron;Pi;Rho;1S
igma;Tau;Upsilon;Phi;Chi;Psi;ohm;7alpha;beta;gamma;delta;epsi;zeta;eta;theta;iota;kappa;lambda;mu;nu;xi;omicron;pi;rho;sigmaf;sigma;tau;upsi;phi;chi;psi;omega;7thetav;Upsi;2phiv;piv;5Gammad;gammad;18kappav;rhov;3epsiv;bepsi;10IOcy;DJcy;GJcy;Jukcy;DScy;Iukcy;YIcy;Jsercy;LJcy;NJcy;TSHcy;KJcy;1Ubrcy;DZcy;Acy;Bcy;Vcy;Gcy;Dcy;IEcy;ZHcy;Zcy;Icy;Jcy;Kcy;Lcy;Mcy;Ncy;Ocy;Pcy;Rcy;Scy;Tcy;Ucy;Fcy;KHcy;TScy;CHcy;SHcy;SHCHcy;HARDcy;Ycy;SOFTcy;Ecy;YUcy;YAcy;acy;bcy;vcy;gcy;dcy;iecy;zhcy;zcy;icy;jcy;kcy;lcy;mcy;ncy;ocy;pcy;rcy;scy;tcy;ucy;fcy;khcy;tscy;chcy;shcy;shchcy;hardcy;ycy;softcy;ecy;yucy;yacy;1iocy;djcy;gjcy;jukcy;dscy;iukcy;yicy;jsercy;ljcy;njcy;tshcy;kjcy;1ubrcy;dzcy;7074ensp;emsp;emsp13;emsp14;1numsp;puncsp;thinsp;hairsp;ZeroWidthSpace;zwnj;zwj;lrm;rlm;dash;2ndash;mdash;horbar;Vert;1lsquo;rsquo;sbquo;1ldquo;rdquo;bdquo;1dagger;Dagger;bull;2nldr;mldr;9permil;pertenk;prime;Prime;tprime;bprime;3lsaquo;rsaquo;3oline;2caret;1hybull;frasl;10bsemi;7qprime;7MediumSpace;{8202ThickSpace;}NoBreak;af;it;ic;72euro;46tdot;DotDot;37Copf;2incare;4gscr;Hscr;Hfr;Hopf;planckh;hbar;Iscr;Im;Lscr;ell;1Nopf;numero;copysr;wp;Popf;Qopf;Rscr;Re;Ropf;rx;3trade;1Zopf;2mho;Zfr;iiota;2Bscr;Cfr;1escr;Escr;Fscr;1Mscr;oscr;aleph;beth;gimel;daleth;12DD;dd;ee;ii;10frac13;frac23;frac15;frac25;frac35;frac45;frac16;frac56;frac18;frac38;frac58;frac78;49larr;uarr;rarr;darr;harr;varr;nwarr;nearr;searr;swarr;nlarr;nrarr;1rarrw;{824nrarrw;}Larr;Uarr;Rarr;Darr;larrtl;rarrtl;mapstoleft;mapstoup;map;mapstodown;1larrhk;rarrhk;larrlp;rarrlp;harrw;nharr;1lsh;rsh;ldsh;rdsh;1crarr;cularr;curarr;2olarr;orarr;lharu;lhard;uharr;uharl;rharu;rhard;dharr;dharl;rlarr;udarr;lrarr;llarr;uuarr;rrarr;ddarr;lrhar;rlhar;nlArr;nhArr;nrArr;lArr;uArr;rArr;dArr;iff;vArr;nwArr;neArr;seArr;swArr;lAarr;rAarr;1zigrarr;6larrb;rarrb;15duarr;7loarr;roarr;hoarr;forall;comp;part;{824npart;}exist;nexist;empty;1Del;in;notin;1ni;notni;2prod;coprod;sum;minus;mp;plusdo;1setmn;lowast;compfn;1Sqrt;2prop;infin;angrt;ang;{8402nang;}angmsd;angsph;mid;nmid
;par;npar;and;or;cap;{65024caps;}cup;{65024cups;}int;Int;tint;oint;Conint;Cconint;cwint;cwconint;awconint;there4;becaus;ratio;Colon;minusd;1mDDot;homtht;sim;{8402nvsim;}bsim;{817race;}ac;{819acE;}acd;wr;nsim;esim;{824nesim;}sime;nsime;cong;simne;ncong;ap;nap;ape;apid;{824napid;}bcong;CupCap;{8402nvap;}bump;{824nbump;}bumpe;{824nbumpe;}doteq;{824nedot;}eDot;efDot;erDot;Assign;ecolon;ecir;cire;1wedgeq;veeeq;1trie;2equest;ne;equiv;{8421bnequiv;}nequiv;1le;{8402nvle;}ge;{8402nvge;}lE;{824nlE;}gE;{824ngE;}lnE;{65024lvnE;}gnE;{65024gvnE;}ll;{824nLtv;7577nLt;}gg;{824nGtv;7577nGt;}twixt;NotCupCap;nlt;ngt;nle;nge;lsim;gsim;nlsim;ngsim;lg;gl;ntlg;ntgl;pr;sc;prcue;sccue;prsim;scsim;{824NotSucceedsTilde;}npr;nsc;sub;{8402vnsub;}sup;{8402vnsup;}nsub;nsup;sube;supe;nsube;nsupe;subne;{65024vsubne;}supne;{65024vsupne;}1cupdot;uplus;sqsub;{824NotSquareSubset;}sqsup;{824NotSquareSuperset;}sqsube;sqsupe;sqcap;{65024sqcaps;}sqcup;{65024sqcups;}oplus;ominus;otimes;osol;odot;ocir;oast;1odash;plusb;minusb;timesb;sdotb;vdash;dashv;top;bot;1models;vDash;Vdash;Vvdash;VDash;nvdash;nvDash;nVdash;nVDash;prurel;1vltri;vrtri;ltrie;{8402nvltrie;}rtrie;{8402nvrtrie;}origof;imof;mumap;hercon;intcal;veebar;1barvee;angrtvb;lrtri;Wedge;Vee;xcap;xcup;diam;sdot;Star;divonx;bowtie;ltimes;rtimes;lthree;rthree;bsime;cuvee;cuwed;Sub;Sup;Cap;Cup;fork;epar;ltdot;gtdot;Ll;{824nLl;}Gg;{824nGg;}leg;{65024lesg;}gel;{65024gesl;}2cuepr;cuesc;nprcue;nsccue;nsqsube;nsqsupe;2lnsim;gnsim;prnsim;scnsim;nltri;nrtri;nltrie;nrtrie;vellip;ctdot;utdot;dtdot;disin;isinsv;isins;isindot;{824notindot;}notinvc;notinvb;1isinE;{824notinE;}nisd;xnis;nis;notnivc;notnivb;6barwed;Barwed;1lceil;rceil;lfloor;rfloor;drcrop;dlcrop;urcrop;ulcrop;bnot;1profline;profsurf;1telrec;target;5ulcorn;urcorn;dlcorn;drcorn;2frown;smile;9cylcty;profalar;7topbot;6ovbar;1solbar;60angzarr;51lmoust;rmoust;2tbrk;bbrk;bbrktbrk;37OverParenthesis;UnderParenthesis;OverBrace;UnderBrace;2trpezium;4elinters;59blank;164oS;55boxh;1boxv;9boxdr;3boxdl;3boxur;3boxul;3bo
xvr;7boxvl;7boxhd;7boxhu;7boxvh;19boxH;boxV;boxdR;boxDr;boxDR;boxdL;boxDl;boxDL;boxuR;boxUr;boxUR;boxuL;boxUl;boxUL;boxvR;boxVr;boxVR;boxvL;boxVl;boxVL;boxHd;boxhD;boxHD;boxHu;boxhU;boxHU;boxvH;boxVh;boxVH;19uhblk;3lhblk;3block;8blk14;blk12;blk34;13squ;8squf;EmptyVerySmallSquare;1rect;marker;2fltns;1xutri;utrif;utri;2rtrif;rtri;3xdtri;dtrif;dtri;2ltrif;ltri;6loz;cir;32tridot;2xcirc;8ultri;urtri;lltri;EmptySmallSquare;FilledSmallSquare;8starf;star;7phone;49female;1male;29spades;2clubs;1hearts;diams;3sung;2flat;natur;sharp;163check;3cross;8malt;21sext;33VerticalSeparator;25lbbrk;rbbrk;84bsolhsub;suphsol;28lobrk;robrk;lang;rang;Lang;Rang;loang;roang;7xlarr;xrarr;xharr;xlArr;xrArr;xhArr;1xmap;2dzigrarr;258nvlArr;nvrArr;nvHarr;Map;6lbarr;rbarr;lBarr;rBarr;RBarr;DDotrahd;UpArrowBar;DownArrowBar;2Rarrtl;2latail;ratail;lAtail;rAtail;larrfs;rarrfs;larrbfs;rarrbfs;2nwarhk;nearhk;searhk;swarhk;nwnear;toea;tosa;swnwar;8rarrc;{824nrarrc;}1cudarrr;ldca;rdca;cudarrl;larrpl;2curarrm;cularrp;7rarrpl;2harrcir;Uarrocir;lurdshar;ldrushar;2LeftRightVector;RightUpDownVector;DownLeftRightVector;LeftUpDownVector;LeftVectorBar;RightVectorBar;RightUpVectorBar;RightDownVectorBar;DownLeftVectorBar;DownRightVectorBar;LeftUpVectorBar;LeftDownVectorBar;LeftTeeVector;RightTeeVector;RightUpTeeVector;RightDownTeeVector;DownLeftTeeVector;DownRightTeeVector;LeftUpTeeVector;LeftDownTeeVector;lHar;uHar;rHar;dHar;luruhar;ldrdhar;ruluhar;rdldhar;lharul;llhard;rharul;lrhard;udhar;duhar;RoundImplies;erarr;simrarr;larrsim;rarrsim;rarrap;ltlarr;1gtrarr;subrarr;1suplarr;lfisht;rfisht;ufisht;dfisht;5lopar;ropar;4lbrke;rbrke;lbrkslu;rbrksld;lbrksld;rbrkslu;langd;rangd;lparlt;rpargt;gtlPar;ltrPar;3vzigzag;1vangrt;angrtvbd;6ange;range;dwangle;uwangle;angmsdaa;angmsdab;angmsdac;angmsdad;angmsdae;angmsdaf;angmsdag;angmsdah;bemptyv;demptyv;cemptyv;raemptyv;laemptyv;ohbar;omid;opar;1operp;1olcross;odsold;1olcir;ofcir;olt;ogt;cirscir;cirE;solb;bsolb;3boxbox;3trisb;rtriltri;LeftTriangleBar;{824NotLeftTriangleBar;}RightT
riangleBar;{824NotRightTriangleBar;}11iinfin;infintie;nvinfin;4eparsl;smeparsl;eqvparsl;5lozf;8RuleDelayed;1dsol;9xodot;xoplus;xotime;1xuplus;1xsqcup;5qint;fpartint;2cirfnint;awint;rppolint;scpolint;npolint;pointint;quatint;intlarhk;10pluscir;plusacir;simplus;plusdu;plussim;plustwo;1mcomma;minusdu;2loplus;roplus;Cross;timesd;timesbar;1smashp;lotimes;rotimes;otimesas;Otimes;odiv;triplus;triminus;tritime;iprod;2amalg;capdot;1ncup;ncap;capand;cupor;cupcap;capcup;cupbrcap;capbrcup;cupcup;capcap;ccups;ccaps;2ccupssm;2And;Or;andand;oror;orslope;andslope;1andv;orv;andd;ord;1wedbar;6sdote;3simdot;2congdot;{824ncongdot;}easter;apacir;apE;{824napE;}eplus;pluse;Esim;Colone;Equal;1eDDot;equivDD;ltcir;gtcir;ltquest;gtquest;les;{824nles;}ges;{824nges;}lesdot;gesdot;lesdoto;gesdoto;lesdotor;gesdotol;lap;gap;lne;gne;lnap;gnap;lEg;gEl;lsime;gsime;lsimg;gsiml;lgE;glE;lesges;gesles;els;egs;elsdot;egsdot;el;eg;2siml;simg;simlE;simgE;LessLess;{824NotNestedLessLess;}GreaterGreater;{824NotNestedGreaterGreater;}1glj;gla;ltcc;gtcc;lescc;gescc;smt;lat;smte;{65024smtes;}late;{65024lates;}bumpE;pre;{824npre;}sce;{824nsce;}2prE;scE;prnE;scnE;prap;scap;prnap;scnap;Pr;Sc;subdot;supdot;subplus;supplus;submult;supmult;subedot;supedot;subE;{824nsubE;}supE;{824nsupE;}subsim;supsim;2subnE;{65024vsubnE;}supnE;{65024vsupnE;}2csub;csup;csube;csupe;subsup;supsub;subsub;supsup;suphsub;supdsub;forkv;topfork;mlcp;8Dashv;1Vdashl;Barv;vBar;vBarv;1Vbar;Not;bNot;rnmid;cirmid;midcir;topcir;nhpar;parsim;9parsl;{8421nparsl;}53250fflig;filig;fllig;ffilig;ffllig;55703Ascr;1Cscr;Dscr;2Gscr;2Jscr;Kscr;2Nscr;Oscr;Pscr;Qscr;1Sscr;Tscr;Uscr;Vscr;Wscr;Xscr;Yscr;Zscr;ascr;bscr;cscr;dscr;1fscr;1hscr;iscr;jscr;kscr;lscr;mscr;nscr;1pscr;qscr;rscr;sscr;tscr;uscr;vscr;wscr;xscr;yscr;zscr;52Afr;Bfr;1Dfr;Efr;Ffr;Gfr;2Jfr;Kfr;Lfr;Mfr;Nfr;Ofr;Pfr;Qfr;1Sfr;Tfr;Ufr;Vfr;Wfr;Xfr;Yfr;1afr;bfr;cfr;dfr;efr;ffr;gfr;hfr;ifr;jfr;kfr;lfr;mfr;nfr;ofr;pfr;qfr;rfr;sfr;tfr;ufr;vfr;wfr;xfr;yfr;zfr;Aopf;Bopf;1Dopf;Eopf;Fopf;Gopf;1Iopf;Jopf;Kopf;Lopf
;Mopf;1Oopf;3Sopf;Topf;Uopf;Vopf;Wopf;Xopf;Yopf;1aopf;bopf;copf;dopf;eopf;fopf;gopf;hopf;iopf;jopf;kopf;lopf;mopf;nopf;oopf;popf;qopf;ropf;sopf;topf;uopf;vopf;wopf;xopf;yopf;zopf;"; diff --git a/src/index.spec.ts b/src/index.spec.ts index 937e7717..d48186f5 100644 --- a/src/index.spec.ts +++ b/src/index.spec.ts @@ -63,7 +63,7 @@ describe("Documents", () => { level, mode: entities.EncodingMode.ASCII, }), - ).toBe("Great #'s of 🎁")); + ).toBe("Great #'s of 🎁")); }); }); @@ -100,10 +100,10 @@ describe("Astral entities", () => { expect(entities.decode(`&#x${c};`)).toBe(value)); it.each(astral)("should encode &#x%s;", (c, value) => - expect(entities.encode(value)).toBe(`&#x${c};`)); + expect(entities.encode(value)).toBe(`&#${Number.parseInt(c, 16)};`)); it.each(astral)("should escape &#x%s;", (c, value) => - expect(entities.escape(value)).toBe(`&#x${c};`)); + expect(entities.escape(value)).toBe(`&#${Number.parseInt(c, 16)};`)); it.each(astralSpecial)(String.raw`should decode special \u%s`, (c, value) => expect(entities.decode(`&#x${c};`)).toBe(value)); diff --git a/src/internal/encode-shared.ts b/src/internal/encode-shared.ts deleted file mode 100644 index 02d02d10..00000000 --- a/src/internal/encode-shared.ts +++ /dev/null @@ -1,123 +0,0 @@ -/** - * A node inside the encoding trie used by `encode.ts`. - * - * There are two physical shapes to minimize allocations and lookup cost: - * - * 1. Leaf node (string) - * - A plain string (already in the form `"&name;"`). - * - Represents a terminal match with no children. - * - * 2. Branch / value node (object) - */ -export type EncodeTrieNode = - | string - | { - /** - * Entity value for the current code point sequence (wrapped: `&...;`). - * Present when the path to this node itself is a valid named entity. - */ - value: string | undefined; - /** If a number, the next code unit of the only next character. */ - next: number | Map; - /** If next is a number, `nextValue` contains the entity value. 
*/ - nextValue?: string; - }; - -/** - * Parse a compact encode trie string into a Map structure used for encoding. - * - * Format per entry (ascending code points using delta encoding): - * [&name;][{}] -- diff omitted when 0 - * Where diff = currentKey - previousKey - 1 (first entry stores absolute key). - * `&name;` is the entity value (already wrapped); a following `{` denotes children. - * @param serialized Serialized text fragment to encode. - */ -export function parseEncodeTrie( - serialized: string, -): Map { - const top = new Map(); - const totalLength = serialized.length; - let cursor = 0; - let lastTopKey = -1; - - function readDiff(): number { - const start = cursor; - while (cursor < totalLength) { - const char = serialized.charAt(cursor); - - if ((char < "0" || char > "9") && (char < "a" || char > "z")) { - break; - } - cursor++; - } - if (cursor === start) return 0; - return Number.parseInt(serialized.slice(start, cursor), 36); - } - - function readEntity(): string { - if (serialized[cursor] !== "&") { - throw new Error(`Child entry missing value near index ${cursor}`); - } - - // Cursor currently points at '&' - const start = cursor; - const end = serialized.indexOf(";", cursor + 1); - if (end === -1) { - throw new Error(`Unterminated entity starting at index ${start}`); - } - cursor = end + 1; // Move past ';' - return serialized.slice(start, cursor); // Includes & ... ; - } - - while (cursor < totalLength) { - const keyDiff = readDiff(); - const key = lastTopKey === -1 ? 
keyDiff : lastTopKey + keyDiff + 1; - - let value: string | undefined; - if (serialized[cursor] === "&") value = readEntity(); - - if (serialized[cursor] === "{") { - cursor++; // Skip '{' - // Parse first child - let diff = readDiff(); - let childKey = diff; // First key (lastChildKey = -1) - const firstValue = readEntity(); - if (serialized[cursor] === "{") { - throw new Error("Unexpected nested '{' beyond depth 2"); - } - // If end of block -> single child optimization - if (serialized[cursor] === "}") { - top.set(key, { value, next: childKey, nextValue: firstValue }); - cursor++; // Skip '}' - } else { - const childMap = new Map([ - [childKey, firstValue], - ]); - let lastChildKey = childKey; - while (cursor < totalLength && serialized[cursor] !== "}") { - diff = readDiff(); - childKey = lastChildKey + diff + 1; - const childValue = readEntity(); - if (serialized[cursor] === "{") { - throw new Error("Unexpected nested '{' beyond depth 2"); - } - childMap.set(childKey, childValue); - lastChildKey = childKey; - } - if (serialized[cursor] !== "}") { - throw new Error("Unterminated child block"); - } - cursor++; // Skip '}' - top.set(key, { value, next: childMap }); - } - } else if (value === undefined) { - throw new Error( - `Malformed encode trie: missing value at index ${cursor}`, - ); - } else { - top.set(key, value); - } - lastTopKey = key; - } - return top; -}