diff --git a/samples/bugs/rawdata/PullRequest813-pdf.js.pdf b/samples/bugs/rawdata/PullRequest813-pdf.js.pdf new file mode 100644 index 00000000..d0457b26 Binary files /dev/null and b/samples/bugs/rawdata/PullRequest813-pdf.js.pdf differ diff --git a/samples/bugs/rawdata/PullRequest814-pdf.js.pdf b/samples/bugs/rawdata/PullRequest814-pdf.js.pdf new file mode 100644 index 00000000..c52cde32 Binary files /dev/null and b/samples/bugs/rawdata/PullRequest814-pdf.js.pdf differ diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index e901179e..79404f12 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -267,6 +267,17 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref { // try to read Cross-Reference Stream $xrefobj = $this->getRawObject($pdfData, $startxref); + if (!isset($xrefobj[0], $xrefobj[1]) || 'objref' !== $xrefobj[0] || !\is_string($xrefobj[1])) { + // Some malformed files point startxref inside the trailer of a classic xref section. + $searchStart = max(0, $startxref - 32); + $relativeXrefOffset = strrpos(substr($pdfData, $searchStart, ($startxref - $searchStart) + 1), 'xref'); + if (false !== $relativeXrefOffset) { + $recoveredXrefOffset = $searchStart + $relativeXrefOffset; + + return $this->decodeXref($pdfData, $recoveredXrefOffset, $xref, $visitedOffsets); + } + } + $xrefObjRef = isset($xrefobj[1]) && \is_string($xrefobj[1]) ? $xrefobj[1] : ''; $xrefObjOffset = $startxref; @@ -598,7 +609,7 @@ private function mergeMissingXrefOffsetsFromObjectHeaders(string $pdfData, array if ( preg_match_all( - '/(?:^|[\r\n])([0-9]+)[\x09\x0a\x0c\x0d\x20]+([0-9]+)[\x09\x0a\x0c\x0d\x20]+obj(?=[\x09\x0a\x0c\x0d\x20<])/i', + '/(?:^|[\r\n])(?:%[\x09\x0a\x0c\x0d\x20]*)?([0-9]+)[\x09\x0a\x0c\x0d\x20]+([0-9]+)[\x09\x0a\x0c\x0d\x20]+obj(?=[\x09\x0a\x0c\x0d\x20<])/i', $pdfData, $matches, \PREG_OFFSET_CAPTURE @@ -612,6 +623,11 @@ private function mergeMissingXrefOffsetsFromObjectHeaders(string $pdfData, array if (!isset($xref['xref'][$objRef])) { $xref['xref'][$objRef] = $offset; + } else { + $currentOffset = (int) $xref['xref'][$objRef]; + if (!$this->isXrefOffsetUsableForObjectRef($pdfData, $objRef, $currentOffset)) { + $xref['xref'][$objRef] = $offset; + } } } } @@ -619,6 +635,35 @@ private function mergeMissingXrefOffsetsFromObjectHeaders(string $pdfData, array return $xref; } + private function isXrefOffsetUsableForObjectRef(string $pdfData, string $objRef, int $offset): bool + { + if ($offset < 0) { + return false; + } + + $objRefArr = explode('_', $objRef); + if (2 !== \count($objRefArr)) { + return false; + } + + $objHeaderPattern = $this->getObjectHeaderPattern($objRefArr); + + // Check exact offset first (ignoring leading whitespace/zeros). + $candidateOffset = $offset; + $candidateOffset += strspn($pdfData, $this->config->getPdfWhitespaces(), $candidateOffset); + $candidateOffset += strspn($pdfData, '0', $candidateOffset); + if (preg_match($objHeaderPattern, substr($pdfData, $candidateOffset, 64)) > 0) { + return true; + } + + // Accept small xref inaccuracies where header is nearby. + $searchStart = max(0, $offset - 128); + return preg_match( + $objHeaderPattern, + substr($pdfData, $searchStart, 256) + ) > 0; + } + /** * Get content of indirect object. * @@ -1324,10 +1369,14 @@ public function parseData(string $data): array } $rootObjectRef = $xref['trailer']['root'] ?? null; - if (\is_string($rootObjectRef) && !isset($xref['xref'][$rootObjectRef])) { + $trailerSize = isset($xref['trailer']['size']) ? (int) $xref['trailer']['size'] : 0; + $xrefEntryCount = isset($xref['xref']) && \is_array($xref['xref']) ? \count($xref['xref']) : 0; + if ( + (\is_string($rootObjectRef) && !isset($xref['xref'][$rootObjectRef])) + || ($trailerSize > 0 && $xrefEntryCount > 0 && $xrefEntryCount < $trailerSize) + ) { $xref = $this->mergeMissingXrefOffsetsFromObjectHeaders($pdfData, $xref); } - // parse all document objects $objects = []; foreach ($xref['xref'] as $obj => $offset) { diff --git a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php index 18d5e227..f7e870f3 100644 --- a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php +++ b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php @@ -396,4 +396,18 @@ public function testMalformedXrefStreamMissingRootEntryStillParsesPage(): void self::assertCount(1, $document->getPages()); } + + public function testRecoverPagesWhenXrefEntriesArePartiallyMissing(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/rawdata/PullRequest813-pdf.js.pdf'); + + self::assertCount(1, $document->getPages()); + } + + public function testRecoverPagesWhenRootOffsetPointsToInvalidObject(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/rawdata/PullRequest814-pdf.js.pdf'); + + self::assertCount(1, $document->getPages()); + } }