diff --git a/samples/bugs/PullRequest797-pdf.js.pdf b/samples/bugs/rawdata/PullRequest797-pdf.js.pdf similarity index 100% rename from samples/bugs/PullRequest797-pdf.js.pdf rename to samples/bugs/rawdata/PullRequest797-pdf.js.pdf diff --git a/samples/bugs/PullRequest797-vera.pdf b/samples/bugs/rawdata/PullRequest797-vera.pdf similarity index 100% rename from samples/bugs/PullRequest797-vera.pdf rename to samples/bugs/rawdata/PullRequest797-vera.pdf diff --git a/samples/bugs/PullRequestInvalidObjectReference.pdf b/samples/bugs/rawdata/PullRequestInvalidObjectReference.pdf similarity index 100% rename from samples/bugs/PullRequestInvalidObjectReference.pdf rename to samples/bugs/rawdata/PullRequestInvalidObjectReference.pdf diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index 411b1951c..bebe23f6e 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -947,8 +947,12 @@ protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [ throw new \Exception('Unable to find xref (PDF corrupted?)'); } - // Some files point startxref to the whitespace right before the xref keyword. + // Some files point startxref to the whitespace right before the xref keyword or stream object. $startxrefOffset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref); + // Be tolerant if startxref points one byte into the xref keyword ("ref"). 
+ if ($startxrefOffset > 0 && strpos($pdfData, 'xref', $startxrefOffset - 1) === $startxrefOffset - 1) { + --$startxrefOffset; + } // check xref position if (strpos($pdfData, 'xref', $startxrefOffset) == $startxrefOffset) { @@ -963,7 +967,7 @@ protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [ $xref = ['Unix' => true]; } else { // Cross-Reference Stream - $xref = $this->decodeXrefStream($pdfData, $startxref, $xref, $visitedOffsets); + $xref = $this->decodeXrefStream($pdfData, $startxrefOffset, $xref, $visitedOffsets); } } if (empty($xref)) { diff --git a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php index f9aa6d5cf..947349336 100644 --- a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php +++ b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php @@ -318,33 +318,27 @@ public function testGetXrefDataTracksVisitedOffsets(): void } /** - * Ensure parser resolves compressed object references from xref streams. 
- * - * @see https://github.com/smalot/pdfparser/pull/796 - */ - public function testParseFileWithCompressedObjRefInXrefStream(): void - { - $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequestInvalidObjectReference.pdf'); - - self::assertCount(1, $document->getPages()); - } - - /** - * @see https://github.com/smalot/pdfparser/pull/797 + * @return iterable dataset label => [fixture path relative to samples/bugs/] */ - public function testParseFileWhenStartxrefPointsToLeadingWhitespaceInVeraPdfFixture(): void + public static function provideRawDataRegressionFixtures(): iterable { - $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest797-vera.pdf'); - - self::assertCount(1, $document->getPages()); + yield 'pr796 invalid-object-reference / pr798 startxref-whitespace equivalent' => [ + 'rawdata/PullRequestInvalidObjectReference.pdf', + ]; + yield 'pr797 vera / pr798 pullrequest794 equivalent' => [ + 'rawdata/PullRequest797-vera.pdf', + ]; + yield 'pr797 pdf.js xref stream fixture' => [ + 'rawdata/PullRequest797-pdf.js.pdf', + ]; } /** - * @see https://github.com/smalot/pdfparser/pull/797 + * @dataProvider provideRawDataRegressionFixtures */ - public function testParseFileWithCompressedXrefObjectFromPdfJsCorpus(): void + public function testParseFileWithRawDataRegressionFixture(string $fixturePath): void { - $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest797-pdf.js.pdf'); + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/'.$fixturePath); self::assertCount(1, $document->getPages()); }