diff --git a/samples/bugs/PullRequest797-pdf.js.pdf b/samples/bugs/PullRequest797-pdf.js.pdf new file mode 100644 index 00000000..f3e25216 Binary files /dev/null and b/samples/bugs/PullRequest797-pdf.js.pdf differ diff --git a/samples/bugs/PullRequest797-vera.pdf b/samples/bugs/PullRequest797-vera.pdf new file mode 100644 index 00000000..71855760 Binary files /dev/null and b/samples/bugs/PullRequest797-vera.pdf differ diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index ec8d01e5..7268743b 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -198,16 +198,16 @@ protected function decodeXref(string $pdfData, int $startxref, array $xref = [], // get only the last updated version $xref['trailer'] = []; // parse trailer_data - if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { + if (preg_match('/\/Size[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { $xref['trailer']['size'] = (int) $matches[1]; } - if (preg_match('/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { + if (preg_match('/\/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { $xref['trailer']['root'] = (int) $matches[1].'_'.(int) $matches[2]; } - if (preg_match('/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { + if (preg_match('/\/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { $xref['trailer']['encrypt'] = (int) $matches[1].'_'.(int) $matches[2]; } - if (preg_match('/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { + if (preg_match('/\/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { $xref['trailer']['info'] = (int) $matches[1].'_'.(int) $matches[2]; } if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $matches) > 0) { @@ -216,7 +216,7 @@ protected function decodeXref(string $pdfData, int $startxref, array $xref = [], $xref['trailer']['id'][1] = $matches[2]; } } - if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { + if (preg_match('/\/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { $offset = (int) $matches[1]; if (0 != $offset) { // get previous xref @@ -922,14 +922,18 @@ protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [ throw new \Exception('Unable to find xref (PDF corrupted?)'); } + // Some files point startxref to the whitespace right before the xref keyword. + $startxrefOffset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref); + // check xref position - if (strpos($pdfData, 'xref', $startxref) == $startxref) { + if (strpos($pdfData, 'xref', $startxrefOffset) == $startxrefOffset) { // Cross-Reference - $xref = $this->decodeXref($pdfData, $startxref, $xref, $visitedOffsets); + $xref = $this->decodeXref($pdfData, $startxrefOffset, $xref, $visitedOffsets); } else { // Check if the $pdfData might have the wrong line-endings $pdfDataUnix = str_replace("\r\n", "\n", $pdfData); - if ($startxref < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxref) == $startxref) { + $startxrefUnixOffset = $startxref + strspn($pdfDataUnix, $this->config->getPdfWhitespaces(), $startxref); + if ($startxrefUnixOffset < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxrefUnixOffset) == $startxrefUnixOffset) { // Return Unix-line-ending flag $xref = ['Unix' => true]; } else { diff --git a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php index 7c7fe7e6..1b40f8bf 100644 --- a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php +++ b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php @@ -36,7 +36,6 @@ namespace PHPUnitTests\Integration; use PHPUnitTests\TestCase; -use Smalot\PdfParser\Document; use Smalot\PdfParser\Parser; /** @@ -111,4 +110,21 @@ public function testPDFDocEncodingDecode(): void $testSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž'; self::assertStringContainsString($testSubject, $details['Subject']); } + + public function testParseFileWhenStartxrefPointsToLeadingWhitespaceInVeraPdfFixture(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest797-vera.pdf'); + + self::assertCount(1, $document->getPages()); + } + + /** + * @see https://github.com/smalot/pdfparser/pull/797 + */ + public function testParseFileWithCompressedXrefObjectFromPdfJsCorpus(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest797-pdf.js.pdf'); + + self::assertCount(1, $document->getPages()); + } }