diff --git a/samples/bugs/PullRequest794.pdf b/samples/bugs/PullRequest794.pdf
new file mode 100644
index 00000000..71855760
Binary files /dev/null and b/samples/bugs/PullRequest794.pdf differ
diff --git a/samples/bugs/PullRequestStartxrefWhitespaceXrefStream.pdf b/samples/bugs/PullRequestStartxrefWhitespaceXrefStream.pdf
new file mode 100644
index 00000000..9d15f247
Binary files /dev/null and b/samples/bugs/PullRequestStartxrefWhitespaceXrefStream.pdf differ
diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php
index ec8d01e5..746dcd46 100644
--- a/src/Smalot/PdfParser/RawData/RawDataParser.php
+++ b/src/Smalot/PdfParser/RawData/RawDataParser.php
@@ -922,19 +922,28 @@ protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [
             throw new \Exception('Unable to find xref (PDF corrupted?)');
         }
 
+        // Some files point startxref to the whitespace right before the xref keyword or stream object.
+        $startxrefOffset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref);
+        // Be tolerant if startxref points one byte into the xref keyword ("ref").
+        // Compare strictly: strpos() returns false on no match, and false == 0 would wrongly count as a hit.
+        if ($startxrefOffset > 0 && strpos($pdfData, 'xref', $startxrefOffset - 1) === $startxrefOffset - 1) {
+            --$startxrefOffset;
+        }
+
         // check xref position
-        if (strpos($pdfData, 'xref', $startxref) == $startxref) {
+        if (strpos($pdfData, 'xref', $startxrefOffset) === $startxrefOffset) {
             // Cross-Reference
-            $xref = $this->decodeXref($pdfData, $startxref, $xref, $visitedOffsets);
+            $xref = $this->decodeXref($pdfData, $startxrefOffset, $xref, $visitedOffsets);
         } else {
             // Check if the $pdfData might have the wrong line-endings
             $pdfDataUnix = str_replace("\r\n", "\n", $pdfData);
-            if ($startxref < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxref) == $startxref) {
+            $startxrefUnixOffset = $startxref + strspn($pdfDataUnix, $this->config->getPdfWhitespaces(), $startxref);
+            if ($startxrefUnixOffset < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxrefUnixOffset) === $startxrefUnixOffset) {
                 // Return Unix-line-ending flag
                 $xref = ['Unix' => true];
             } else {
                 // Cross-Reference Stream
-                $xref = $this->decodeXrefStream($pdfData, $startxref, $xref, $visitedOffsets);
+                $xref = $this->decodeXrefStream($pdfData, $startxrefOffset, $xref, $visitedOffsets);
             }
         }
         if (empty($xref)) {
diff --git a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php
index 7c7fe7e6..a43d9412 100644
--- a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php
+++ b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php
@@ -111,4 +111,18 @@ public function testPDFDocEncodingDecode(): void
         $testSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž';
         self::assertStringContainsString($testSubject, $details['Subject']);
     }
+
+    public function testParseFileWhenStartxrefPointsToLeadingWhitespaceInXrefStream(): void
+    {
+        $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequestStartxrefWhitespaceXrefStream.pdf');
+
+        self::assertCount(1, $document->getPages());
+    }
+
+    public function testParseFileWhenStartxrefPointsNearXrefKeyword(): void
+    {
+        $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest794.pdf');
+
+        self::assertCount(1, $document->getPages());
+    }
 }