diff --git a/.changeset/config.json b/.changeset/config.json index 43989e1..b4a5cbc 100644 --- a/.changeset/config.json +++ b/.changeset/config.json @@ -2,8 +2,8 @@ "$schema": "https://unpkg.com/@changesets/config@3.1.3/schema.json", "changelog": "@changesets/cli/changelog", "commit": false, - "fixed": [], - "linked": [["@ritojs/core", "@ritojs/kit", "@ritojs/react"]], + "fixed": [["@ritojs/core", "@ritojs/kit", "@ritojs/react"]], + "linked": [], "access": "public", "baseBranch": "master", "updateInternalDependencies": "patch", diff --git a/packages/kit/CHANGELOG.md b/packages/kit/CHANGELOG.md index 4397633..0407daa 100644 --- a/packages/kit/CHANGELOG.md +++ b/packages/kit/CHANGELOG.md @@ -1,5 +1,14 @@ # @ritojs/kit +## 0.12.1 + +### Patch Changes + +- Updated dependencies [9c1688b] +- Updated dependencies [9c1688b] +- Updated dependencies [9c1688b] + - @ritojs/core@0.12.1 + ## 0.12.0 ### Minor Changes diff --git a/packages/kit/package.json b/packages/kit/package.json index a24e017..797ddf5 100644 --- a/packages/kit/package.json +++ b/packages/kit/package.json @@ -1,7 +1,7 @@ { "name": "@ritojs/kit", "type": "module", - "version": "0.12.0", + "version": "0.12.1", "description": "Framework-agnostic controller, overlays, and transitions for @ritojs/core", "license": "AGPL-3.0-only", "main": "./dist/index.mjs", diff --git a/packages/react/CHANGELOG.md b/packages/react/CHANGELOG.md index e2f043e..9f4dbd9 100644 --- a/packages/react/CHANGELOG.md +++ b/packages/react/CHANGELOG.md @@ -1,5 +1,15 @@ # @ritojs/react +## 0.12.1 + +### Patch Changes + +- Updated dependencies [9c1688b] +- Updated dependencies [9c1688b] +- Updated dependencies [9c1688b] + - @ritojs/core@0.12.1 + - @ritojs/kit@0.12.1 + ## 0.12.0 ### Minor Changes diff --git a/packages/react/package.json b/packages/react/package.json index 62764a6..2ce6732 100644 --- a/packages/react/package.json +++ b/packages/react/package.json @@ -1,7 +1,7 @@ { "name": "@ritojs/react", "type": "module", - "version": "0.12.0", + 
"version": "0.12.1", "description": "React hooks and components for @ritojs/core", "license": "AGPL-3.0-only", "main": "./dist/index.mjs", diff --git a/packages/rito/CHANGELOG.md b/packages/rito/CHANGELOG.md index aa6df47..0ce8a59 100644 --- a/packages/rito/CHANGELOG.md +++ b/packages/rito/CHANGELOG.md @@ -1,5 +1,30 @@ # @ritojs/core +## 0.12.1 + +### Patch Changes + +- 9c1688b: Open spec-violating EPUBs that earlier failed to load. The OPF parser now + defaults missing `dc:title` / `dc:language` / `dc:identifier` to an empty string + with a warning instead of throwing (the structural `<manifest>` / `<spine>` + checks stay strict), and the ZIP reader percent-decodes container paths on a + lookup miss, so a manifest href like `Text/Character%20Profile.xhtml` resolves + to the literal `Text/Character Profile.xhtml` archive entry. +- 9c1688b: Resolve in-content illustrations that previously rendered as broken images. + `loadEpub` now indexes every image file present in the archive — not only those + declared in the OPF manifest — so spec-violating books that reference undeclared + illustrations still get image data. Manifest resource reads are individually + tolerant (a single missing/mislabeled entry is skipped with a warning instead of + aborting the load), and href resolution percent-decodes on miss so references + like `Images/My%20Pic.jpg` match a literal `Images/My Pic.jpg` entry. +- 9c1688b: Parse EPUB chapters whose XHTML is invalid in strict XML. The source normalizer + now escapes stray ampersands (e.g. `Schmidt & Bender`), remaps HTML named + entities undefined without a DTD (`&copy;`, `&mdash;`, `&nbsp;`, …) to numeric + references, and strips characters illegal in XML (C0 controls, `U+FFFE/FFFF`, + lone surrogates, and numeric refs pointing to them), while leaving comments and + CDATA sections untouched. Chapters that previously failed with errors such as + `EntityRef: expecting ';'` or `PCDATA invalid Char value 31` now parse. 
+ ## 0.12.0 ### Minor Changes diff --git a/packages/rito/package.json b/packages/rito/package.json index b059058..d766d1d 100644 --- a/packages/rito/package.json +++ b/packages/rito/package.json @@ -1,7 +1,7 @@ { "name": "@ritojs/core", "type": "module", - "version": "0.12.0", + "version": "0.12.1", "description": "EPUB rendering engine with a Web Canvas backend", "license": "AGPL-3.0-only", "main": "./dist/index.mjs", diff --git a/packages/rito/src/parser/epub/package-parser.ts b/packages/rito/src/parser/epub/package-parser.ts index 01c42e9..32675fc 100644 --- a/packages/rito/src/parser/epub/package-parser.ts +++ b/packages/rito/src/parser/epub/package-parser.ts @@ -1,10 +1,11 @@ import type { ManifestItem, PackageDocument, PackageMetadata, SpineItem } from './types'; import { EpubParseError } from './errors'; +import type { Logger } from '../../utils/logger'; /** * Parse an OPF package document XML string into a PackageDocument. */ -export function parsePackageDocument(opfXml: string): PackageDocument { +export function parsePackageDocument(opfXml: string, logger?: Logger): PackageDocument { const doc = new DOMParser().parseFromString(opfXml, 'application/xml'); const parserError = doc.querySelector('parsererror'); @@ -12,30 +13,37 @@ export function parsePackageDocument(opfXml: string): PackageDocument { throw new EpubParseError(`Invalid OPF package document: ${parserError.textContent}`); } - const metadata = parseMetadata(doc); + const metadata = parseMetadata(doc, logger); const manifest = parseManifest(doc); const spine = parseSpine(doc); return { metadata, manifest, spine }; } -function parseMetadata(doc: Document): PackageMetadata { +/** + * Parse Dublin Core metadata. The EPUB spec requires ``, + * `` and ``, but spec-violating files (e.g. some + * Sigil exports) omit them while remaining perfectly readable. 
Rather than + * refusing to open such books, missing fields fall back to an empty string and + * a warning — the structural ``/`` checks below stay strict. + */ +function parseMetadata(doc: Document, logger?: Logger): PackageMetadata { const title = getMetadataText(doc, 'title'); const language = getMetadataText(doc, 'language'); const identifier = getMetadataText(doc, 'identifier'); - if (!title) { - throw new EpubParseError('Missing required in package metadata'); - } - if (!language) { - throw new EpubParseError('Missing required in package metadata'); - } + if (!title) logger?.warn('Missing in package metadata; using empty title'); + if (!language) logger?.warn('Missing in package metadata; using empty language'); if (!identifier) { - throw new EpubParseError('Missing required in package metadata'); + logger?.warn('Missing in package metadata; using empty identifier'); } const creator = getMetadataText(doc, 'creator'); - const result: PackageMetadata = { title, language, identifier }; + const result: PackageMetadata = { + title: title ?? '', + language: language ?? '', + identifier: identifier ?? '', + }; if (creator) { return { ...result, creator }; diff --git a/packages/rito/src/parser/epub/zip-reader.ts b/packages/rito/src/parser/epub/zip-reader.ts index 1827681..ac46801 100644 --- a/packages/rito/src/parser/epub/zip-reader.ts +++ b/packages/rito/src/parser/epub/zip-reader.ts @@ -15,7 +15,10 @@ export function createZipReader(data: ArrayBuffer): ZipReader { return { readFile(path: string): Uint8Array { if (!entries) throw new EpubParseError('ZipReader has been closed'); - const entry = entries[path]; + // OPF/NCX hrefs are URLs, so they may be percent-encoded (e.g. + // "Character%20Profile.xhtml") while the actual zip entry name is literal. + // Try the raw path first, then fall back to its percent-decoded form. + const entry = entries[path] ?? 
entries[percentDecodePath(path)]; if (!entry) { throw new EpubParseError(`File not found in EPUB archive: ${path}`); } @@ -36,3 +39,13 @@ export function createZipReader(data: ArrayBuffer): ZipReader { }, }; } + +/** Percent-decode a container path, falling back to the raw path if malformed. */ +function percentDecodePath(path: string): string { + if (!path.includes('%')) return path; + try { + return decodeURIComponent(path); + } catch { + return path; + } +} diff --git a/packages/rito/src/parser/xhtml/xhtml-named-entities.ts b/packages/rito/src/parser/xhtml/xhtml-named-entities.ts new file mode 100644 index 0000000..be7f56d --- /dev/null +++ b/packages/rito/src/parser/xhtml/xhtml-named-entities.ts @@ -0,0 +1,258 @@ +// HTML4 named character references (the canonical 252-entry set), minus the +// five XML-predefined entities (amp/lt/gt/quot/apos) which need no remapping. +// +// In `application/xhtml+xml` parsing there is no DTD, so named entities such as +// `&copy;` or `&mdash;` are undefined and abort the parse. EPUBs in the wild use +// them anyway, so we remap each to its numeric character reference before parsing. +// +// Generated from Python `html.entities.name2codepoint`. Do not edit by hand. 
+export const HTML_NAMED_ENTITIES: Readonly> = { + AElig: 198, + Aacute: 193, + Acirc: 194, + Agrave: 192, + Alpha: 913, + Aring: 197, + Atilde: 195, + Auml: 196, + Beta: 914, + Ccedil: 199, + Chi: 935, + Dagger: 8225, + Delta: 916, + ETH: 208, + Eacute: 201, + Ecirc: 202, + Egrave: 200, + Epsilon: 917, + Eta: 919, + Euml: 203, + Gamma: 915, + Iacute: 205, + Icirc: 206, + Igrave: 204, + Iota: 921, + Iuml: 207, + Kappa: 922, + Lambda: 923, + Mu: 924, + Ntilde: 209, + Nu: 925, + OElig: 338, + Oacute: 211, + Ocirc: 212, + Ograve: 210, + Omega: 937, + Omicron: 927, + Oslash: 216, + Otilde: 213, + Ouml: 214, + Phi: 934, + Pi: 928, + Prime: 8243, + Psi: 936, + Rho: 929, + Scaron: 352, + Sigma: 931, + THORN: 222, + Tau: 932, + Theta: 920, + Uacute: 218, + Ucirc: 219, + Ugrave: 217, + Upsilon: 933, + Uuml: 220, + Xi: 926, + Yacute: 221, + Yuml: 376, + Zeta: 918, + aacute: 225, + acirc: 226, + acute: 180, + aelig: 230, + agrave: 224, + alefsym: 8501, + alpha: 945, + and: 8743, + ang: 8736, + aring: 229, + asymp: 8776, + atilde: 227, + auml: 228, + bdquo: 8222, + beta: 946, + brvbar: 166, + bull: 8226, + cap: 8745, + ccedil: 231, + cedil: 184, + cent: 162, + chi: 967, + circ: 710, + clubs: 9827, + cong: 8773, + copy: 169, + crarr: 8629, + cup: 8746, + curren: 164, + dArr: 8659, + dagger: 8224, + darr: 8595, + deg: 176, + delta: 948, + diams: 9830, + divide: 247, + eacute: 233, + ecirc: 234, + egrave: 232, + empty: 8709, + emsp: 8195, + ensp: 8194, + epsilon: 949, + equiv: 8801, + eta: 951, + eth: 240, + euml: 235, + euro: 8364, + exist: 8707, + fnof: 402, + forall: 8704, + frac12: 189, + frac14: 188, + frac34: 190, + frasl: 8260, + gamma: 947, + ge: 8805, + hArr: 8660, + harr: 8596, + hearts: 9829, + hellip: 8230, + iacute: 237, + icirc: 238, + iexcl: 161, + igrave: 236, + image: 8465, + infin: 8734, + int: 8747, + iota: 953, + iquest: 191, + isin: 8712, + iuml: 239, + kappa: 954, + lArr: 8656, + lambda: 955, + lang: 9001, + laquo: 171, + larr: 8592, + lceil: 8968, + ldquo: 
8220, + le: 8804, + lfloor: 8970, + lowast: 8727, + loz: 9674, + lrm: 8206, + lsaquo: 8249, + lsquo: 8216, + macr: 175, + mdash: 8212, + micro: 181, + middot: 183, + minus: 8722, + mu: 956, + nabla: 8711, + nbsp: 160, + ndash: 8211, + ne: 8800, + ni: 8715, + not: 172, + notin: 8713, + nsub: 8836, + ntilde: 241, + nu: 957, + oacute: 243, + ocirc: 244, + oelig: 339, + ograve: 242, + oline: 8254, + omega: 969, + omicron: 959, + oplus: 8853, + or: 8744, + ordf: 170, + ordm: 186, + oslash: 248, + otilde: 245, + otimes: 8855, + ouml: 246, + para: 182, + part: 8706, + permil: 8240, + perp: 8869, + phi: 966, + pi: 960, + piv: 982, + plusmn: 177, + pound: 163, + prime: 8242, + prod: 8719, + prop: 8733, + psi: 968, + rArr: 8658, + radic: 8730, + rang: 9002, + raquo: 187, + rarr: 8594, + rceil: 8969, + rdquo: 8221, + real: 8476, + reg: 174, + rfloor: 8971, + rho: 961, + rlm: 8207, + rsaquo: 8250, + rsquo: 8217, + sbquo: 8218, + scaron: 353, + sdot: 8901, + sect: 167, + shy: 173, + sigma: 963, + sigmaf: 962, + sim: 8764, + spades: 9824, + sub: 8834, + sube: 8838, + sum: 8721, + sup: 8835, + sup1: 185, + sup2: 178, + sup3: 179, + supe: 8839, + szlig: 223, + tau: 964, + there4: 8756, + theta: 952, + thetasym: 977, + thinsp: 8201, + thorn: 254, + tilde: 732, + times: 215, + trade: 8482, + uArr: 8657, + uacute: 250, + uarr: 8593, + ucirc: 251, + ugrave: 249, + uml: 168, + upsih: 978, + upsilon: 965, + uuml: 252, + weierp: 8472, + xi: 958, + yacute: 253, + yen: 165, + yuml: 255, + zeta: 950, + zwj: 8205, + zwnj: 8204, +}; diff --git a/packages/rito/src/parser/xhtml/xhtml-source-normalizer.ts b/packages/rito/src/parser/xhtml/xhtml-source-normalizer.ts index b4aa68b..d463d83 100644 --- a/packages/rito/src/parser/xhtml/xhtml-source-normalizer.ts +++ b/packages/rito/src/parser/xhtml/xhtml-source-normalizer.ts @@ -1,16 +1,64 @@ +import { HTML_NAMED_ENTITIES } from './xhtml-named-entities'; + const XML_DECLARATION_RE = /^(\uFEFF?)<\?xml\s+([^?]*?)\?>/; const 
SINGLE_QUOTED_XML_DECLARATION_ATTRIBUTE_RE = /\b(version|encoding|standalone)='([^']*)'/g; -const XHTML_NBSP_ENTITY_RE = / /g; + +// Matches `&` optionally followed by a numeric reference or a named token. +// Group 1 (when present) is the entity body without the leading `&`. +const AMPERSAND_RE = /&(#x[0-9a-fA-F]+;|#[0-9]+;|[a-zA-Z][a-zA-Z0-9]*;)?/g; + +// Regions where `&` is already literal and must be preserved verbatim: +// comments and CDATA sections (common around inline