From ac09cda9366f2498e557afa25a0b7d34eb3dc166 Mon Sep 17 00:00:00 2001 From: PrinsFrank <25006490+PrinsFrank@users.noreply.github.com> Date: Thu, 11 Jun 2026 20:21:15 +0200 Subject: [PATCH] Recover byte offsets from raw stream when cross reference source is corrupt --- .../CrossReferenceSourceParser.php | 12 +++- .../ObjectPositionsFromRawStreamParser.php | 67 +++++++++++++++++++ .../Source/CrossReferenceSource.php | 11 +++ .../Source/RecoveredCrossReferenceSource.php | 30 +++++++++ .../Source/Section/CrossReferenceSection.php | 11 +++ .../SubSection/CrossReferenceSubSection.php | 21 ++++++ ...ObjectPositionsFromRawStreamParserTest.php | 34 ++++++++++ 7 files changed, 185 insertions(+), 1 deletion(-) create mode 100644 src/Document/CrossReference/RawStream/ObjectPositionsFromRawStreamParser.php create mode 100644 src/Document/CrossReference/Source/RecoveredCrossReferenceSource.php create mode 100644 tests/Unit/Document/CrossReference/RawStream/ObjectPositionsFromRawStreamParserTest.php diff --git a/src/Document/CrossReference/CrossReferenceSourceParser.php b/src/Document/CrossReference/CrossReferenceSourceParser.php index 655d3605..9ee5232b 100644 --- a/src/Document/CrossReference/CrossReferenceSourceParser.php +++ b/src/Document/CrossReference/CrossReferenceSourceParser.php @@ -3,7 +3,9 @@ namespace PrinsFrank\PdfParser\Document\CrossReference; +use PrinsFrank\PdfParser\Document\CrossReference\RawStream\ObjectPositionsFromRawStreamParser; use PrinsFrank\PdfParser\Document\CrossReference\Source\CrossReferenceSource; +use PrinsFrank\PdfParser\Document\CrossReference\Source\RecoveredCrossReferenceSource; use PrinsFrank\PdfParser\Document\CrossReference\Stream\CrossReferenceStreamParser; use PrinsFrank\PdfParser\Document\CrossReference\Table\CrossReferenceTableParser; use PrinsFrank\PdfParser\Document\Dictionary\DictionaryKey\DictionaryKey; @@ -80,7 +82,15 @@ public static function parse(Stream $stream): CrossReferenceSource { $crossReferenceSections[] = $currentCrossReferenceSection; } - return new CrossReferenceSource(... $crossReferenceSections); + $crossReferenceSource = new CrossReferenceSource(... $crossReferenceSections); + if ($crossReferenceSource->hasInvalidByteOffset($stream)) { + return new RecoveredCrossReferenceSource( + ObjectPositionsFromRawStreamParser::parse($stream), + ...$crossReferenceSections, + ); + } + + return $crossReferenceSource; } private static function getCrossReferenceType(Stream $stream, int $byteOffsetLastCrossReferenceSection, int $byteOffsetEndOfCurrentLine): ?CrossReferenceType { diff --git a/src/Document/CrossReference/RawStream/ObjectPositionsFromRawStreamParser.php b/src/Document/CrossReference/RawStream/ObjectPositionsFromRawStreamParser.php new file mode 100644 index 00000000..302e41f8 --- /dev/null +++ b/src/Document/CrossReference/RawStream/ObjectPositionsFromRawStreamParser.php @@ -0,0 +1,67 @@ + */ + public static function parse(Stream $stream): array { + $inObjNr = $inObjGenerationNumber = $pendingObjMarker = false; + $startObjNrOffset = $objNrBuffer = $objMarkerBuffer = null; + $discoveredObjects = []; + foreach ($stream->chars(0, $stream->getSizeInBytes()) as $byteOffset => $char) { + if ($char === ' ') { + if ($inObjNr === true) { + $inObjNr = false; + $inObjGenerationNumber = true; + } elseif ($inObjGenerationNumber === true) { + $inObjGenerationNumber = false; + $pendingObjMarker = true; + } else { + $inObjNr = $inObjGenerationNumber = $pendingObjMarker = false; + $startObjNrOffset = $objNrBuffer = $objMarkerBuffer = null; + } + } else if ($char === '0' + || $char === '1' + || $char === '2' + || $char === '3' + || $char === '4' + || $char === '5' + || $char === '6' + || $char === '7' + || $char === '8' + || $char === '9') { + if ($pendingObjMarker === true) { + $pendingObjMarker = false; + $objNrBuffer = null; + } elseif ($inObjGenerationNumber === true) { + } elseif ($inObjNr === false) { + $inObjNr = true; + $startObjNrOffset = $byteOffset; + $objNrBuffer = $char; + } elseif ($inObjNr === true) { + $objNrBuffer .= $char; + } + } elseif ($pendingObjMarker === true) { + if ($objMarkerBuffer === null && $char === 'o') { + $objMarkerBuffer = $char; + } elseif ($objMarkerBuffer === 'o' && $char === 'b') { + $objMarkerBuffer .= $char; + } elseif ($objMarkerBuffer === 'ob' && $char === 'j') { + $discoveredObjects[$startObjNrOffset] = (int) $objNrBuffer; + $inObjNr = $inObjGenerationNumber = $pendingObjMarker = false; + $startObjNrOffset = $objNrBuffer = $objMarkerBuffer = null; + } else { + $inObjNr = $inObjGenerationNumber = $pendingObjMarker = false; + $startObjNrOffset = $objNrBuffer = $objMarkerBuffer = null; + } + } else { + $inObjNr = $inObjGenerationNumber = $pendingObjMarker = false; + $startObjNrOffset = $objNrBuffer = $objMarkerBuffer = null; + } + } + + return $discoveredObjects; + } +} diff --git a/src/Document/CrossReference/Source/CrossReferenceSource.php b/src/Document/CrossReference/Source/CrossReferenceSource.php index 7d3e68a4..31582401 100644 --- a/src/Document/CrossReference/Source/CrossReferenceSource.php +++ b/src/Document/CrossReference/Source/CrossReferenceSource.php @@ -13,6 +13,7 @@ use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Name\NameValue; use PrinsFrank\PdfParser\Document\Dictionary\DictionaryValue\Reference\ReferenceValue; use PrinsFrank\PdfParser\Exception\ParseFailureException; +use PrinsFrank\PdfParser\Stream\Stream; /** Can be both from a crossReferenceTable or a crossReferenceStream */ class CrossReferenceSource { @@ -75,4 +76,14 @@ public function getFirstId(): string { return $firstId; } + + public function hasInvalidByteOffset(Stream $stream): bool { + foreach ($this->crossReferenceSections as $crossReferenceSection) { + if ($crossReferenceSection->hasInvalidByteOffset($stream)) { + return true; + } + } + + return false; + } } diff --git a/src/Document/CrossReference/Source/RecoveredCrossReferenceSource.php b/src/Document/CrossReference/Source/RecoveredCrossReferenceSource.php new file mode 100644 index 00000000..8dd1d444 --- /dev/null +++ b/src/Document/CrossReference/Source/RecoveredCrossReferenceSource.php @@ -0,0 +1,30 @@ + $recoveredByteOffsetMap where the key is the byte offset and the value the object nr + * @no-named-arguments + */ + public function __construct( + private array $recoveredByteOffsetMap, + CrossReferenceSection... $crossReferenceSections, + ) { + parent::__construct(...$crossReferenceSections); + } + + public function getCrossReferenceEntry(int $objNumber): CrossReferenceEntryInUseObject|CrossReferenceEntryCompressed|null { + foreach ($this->recoveredByteOffsetMap as $byteOffset => $recoveredObjNr) { + if ($recoveredObjNr === $objNumber) { + return new CrossReferenceEntryInUseObject($byteOffset, 0); + } + } + + return parent::getCrossReferenceEntry($objNumber); + } +} diff --git a/src/Document/CrossReference/Source/Section/CrossReferenceSection.php b/src/Document/CrossReference/Source/Section/CrossReferenceSection.php index ea9e4897..2e6efc22 100644 --- a/src/Document/CrossReference/Source/Section/CrossReferenceSection.php +++ b/src/Document/CrossReference/Source/Section/CrossReferenceSection.php @@ -6,6 +6,7 @@ use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry\CrossReferenceEntryCompressed; use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry\CrossReferenceEntryInUseObject; use PrinsFrank\PdfParser\Document\Dictionary\Dictionary; +use PrinsFrank\PdfParser\Stream\Stream; /** There are multiple crossReference sections if there are incremental updates. See 7.5.6 */ readonly class CrossReferenceSection { @@ -29,4 +30,14 @@ public function getCrossReferenceEntry(int $objNumber): CrossReferenceEntryInUse return null; } + + public function hasInvalidByteOffset(Stream $stream): bool { + foreach ($this->crossReferenceSubSections as $crossReferenceSubSection) { + if ($crossReferenceSubSection->hasInvalidByteOffset($stream)) { + return true; + } + } + + return false; + } } diff --git a/src/Document/CrossReference/Source/Section/SubSection/CrossReferenceSubSection.php b/src/Document/CrossReference/Source/Section/SubSection/CrossReferenceSubSection.php index a4347b79..31ec1e42 100644 --- a/src/Document/CrossReference/Source/Section/SubSection/CrossReferenceSubSection.php +++ b/src/Document/CrossReference/Source/Section/SubSection/CrossReferenceSubSection.php @@ -8,6 +8,7 @@ use PrinsFrank\PdfParser\Document\CrossReference\Source\Section\SubSection\Entry\CrossReferenceEntryInUseObject; use PrinsFrank\PdfParser\Exception\InvalidArgumentException; use PrinsFrank\PdfParser\Exception\RuntimeException; +use PrinsFrank\PdfParser\Stream\Stream; readonly class CrossReferenceSubSection { /** @var array */ @@ -51,4 +52,24 @@ public function getCrossReferenceEntry(int $objNumber): CrossReferenceEntryInUse return $object; } + + public function hasInvalidByteOffset(Stream $stream): bool { + foreach ($this->crossReferenceEntries as $index => $crossReferenceEntry) { + if ($crossReferenceEntry instanceof CrossReferenceEntryInUseObject === false) { + continue; + } + + if ($crossReferenceEntry->byteOffsetInDecodedStream > $stream->getSizeInBytes()) { + return true; + } + + $objNumber = $this->firstObjectNumber + $index; + $expectedObjMarker = $objNumber . ' ' . $crossReferenceEntry->generationNumber . ' obj'; + if ($stream->read($crossReferenceEntry->byteOffsetInDecodedStream, strlen($expectedObjMarker)) !== $expectedObjMarker) { + return true; + } + } + + return false; + } } diff --git a/tests/Unit/Document/CrossReference/RawStream/ObjectPositionsFromRawStreamParserTest.php b/tests/Unit/Document/CrossReference/RawStream/ObjectPositionsFromRawStreamParserTest.php new file mode 100644 index 00000000..bc25dccd --- /dev/null +++ b/tests/Unit/Document/CrossReference/RawStream/ObjectPositionsFromRawStreamParserTest.php @@ -0,0 +1,34 @@ + 1, + 42 => 1232131, + ], + ObjectPositionsFromRawStreamParser::parse( + new InMemoryStream( + <<