diff --git a/samples/bugs/PullRequestNearbyObjectHeaderOffset.pdf b/samples/bugs/PullRequestNearbyObjectHeaderOffset.pdf
new file mode 100644
index 00000000..950fb8f5
Binary files /dev/null and b/samples/bugs/PullRequestNearbyObjectHeaderOffset.pdf differ
diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php
index ec8d01e5..4dad74d7 100644
--- a/src/Smalot/PdfParser/RawData/RawDataParser.php
+++ b/src/Smalot/PdfParser/RawData/RawDataParser.php
@@ -513,7 +513,11 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref
     protected function getObjectHeaderPattern(array $objRefs): string
     {
-        // consider all whitespace character (PDF specifications)
-        return '/'.$objRefs[0].$this->config->getPdfWhitespacesRegex().$objRefs[1].$this->config->getPdfWhitespacesRegex().'obj/';
+        // Consider all whitespace characters (PDF specifications) and accept runs of them between tokens.
+        // The pattern is unanchored: the lookbehind rejects a match preceded by a digit in the subject
+        // (e.g. "12 0 obj" when looking for object 2), while "0*" still tolerates leading zeros.
+        $whitespaces = $this->config->getPdfWhitespacesRegex().'+';
+
+        return '/(?<![0-9])0*'.$objRefs[0].$whitespaces.$objRefs[1].$whitespaces.'obj/';
     }
 
     protected function getObjectHeaderLen(array $objRefs): int
@@ -546,6 +550,7 @@ protected function getIndirectObject(string $pdfData, array $xref, string $objRe
             throw new \Exception('Invalid object reference for $obj.');
         }
 
+        $objHeaderPattern = $this->getObjectHeaderPattern($objRefArr);
         $objHeaderLen = $this->getObjectHeaderLen($objRefArr);
 
         /*
@@ -555,9 +560,35 @@ protected function getIndirectObject(string $pdfData, array $xref, string $objRe
         $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset);
         // ignore leading zeros for object number
         $offset += strspn($pdfData, '0', $offset);
-        if (0 == preg_match($this->getObjectHeaderPattern($objRefArr), substr($pdfData, $offset, $objHeaderLen))) {
-            // an indirect reference to an undefined object shall be considered a reference to the null object
-            return ['null', 'null', $offset];
+
+        // The header pattern is unanchored and accepts runs of whitespace, so scan a short window
+        // and capture the match position instead of assuming the header starts exactly at $offset.
+        $searchStart = $offset;
+        $headerFound = preg_match(
+            $objHeaderPattern,
+            substr($pdfData, $searchStart, 33),
+            $headerMatches,
+            \PREG_OFFSET_CAPTURE
+        ) > 0;
+        if (!$headerFound) {
+            // Some malformed files have slightly inaccurate xref offsets.
+            // Try to recover by locating the expected object header nearby.
+            $searchStart = max(0, $offset - 64);
+            $headerFound = preg_match(
+                $objHeaderPattern,
+                substr($pdfData, $searchStart, 192),
+                $headerMatches,
+                \PREG_OFFSET_CAPTURE
+            ) > 0;
+        }
+        if (!$headerFound) {
+            // an indirect reference to an undefined object shall be considered a reference to the null object
+            return ['null', 'null', $offset];
         }
 
+        // Resume from the header that was actually matched: take both its position and its length
+        // from the match rather than from the xref entry and the earlier $objHeaderLen value.
+        $offset = $searchStart + $headerMatches[0][1];
+        $objHeaderLen = \strlen($headerMatches[0][0]);
+
         /*
diff --git a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php
index 7c7fe7e6..843513b5 100644
--- a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php
+++ b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php
@@ -111,4 +111,14 @@ public function testPDFDocEncodingDecode(): void
         $testSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž';
         self::assertStringContainsString($testSubject, $details['Subject']);
     }
+
+    /**
+     * Parsing must succeed when an object header sits near, rather than exactly at, its xref offset.
+     */
+    public function testParseFileWhenObjectHeaderIsNearXrefOffset(): void
+    {
+        $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequestNearbyObjectHeaderOffset.pdf');
+
+        self::assertCount(1, $document->getPages());
+    }
 }