diff --git a/samples/bugs/PullRequestDuplicateKids.pdf b/samples/bugs/PullRequestDuplicateKids.pdf
new file mode 100644
index 00000000..e69a85cc
Binary files /dev/null and b/samples/bugs/PullRequestDuplicateKids.pdf differ
diff --git a/src/Smalot/PdfParser/Document.php b/src/Smalot/PdfParser/Document.php
index 1fad8b1b..bcd1716b
--- a/src/Smalot/PdfParser/Document.php
+++ b/src/Smalot/PdfParser/Document.php
@@ -401,7 +401,7 @@ public function getPages()
             /** @var Pages $object */
             $object = $catalogue->get('Pages');
             if (method_exists($object, 'getPages')) {
-                return $object->getPages(true);
+                return $this->uniquePages($object->getPages(true));
             }
         }
 
@@ -415,19 +415,55 @@ public function getPages()
                 $pages = array_merge($pages, $object->getPages(true));
             }
 
-            return $pages;
+            return $this->uniquePages($pages);
         }
 
         if ($this->hasObjectsByType('Page')) {
             // Search for 'page' (unordered pages).
             $pages = $this->getObjectsByType('Page');
 
-            return array_values($pages);
+            return $this->uniquePages(array_values($pages));
         }
 
         throw new MissingCatalogException('Missing catalog.');
     }
 
+    /**
+     * Drops repeated page objects while preserving first-seen order.
+     *
+     * Entries are compared by object identity, so a page referenced several
+     * times (e.g. twice in the same /Kids array) is returned only once.
+     * Non-object entries are discarded and the result is re-indexed from 0.
+     *
+     * @param Page[] $pages
+     *
+     * @return Page[]
+     */
+    protected function uniquePages(array $pages): array
+    {
+        $unique = [];
+        $seen = [];
+
+        foreach ($pages as $page) {
+            if (!\is_object($page)) {
+                continue;
+            }
+
+            // spl_object_id() was added in PHP 7.2; use the hash on older runtimes.
+            $id = \function_exists('spl_object_id')
+                ? (string) \spl_object_id($page)
+                : \spl_object_hash($page);
+            if (isset($seen[$id])) {
+                continue;
+            }
+
+            $seen[$id] = true;
+            $unique[] = $page;
+        }
+
+        return $unique;
+    }
+
     public function getText(?int $pageLimit = null): string
     {
         $texts = [];
diff --git a/tests/PHPUnit/Integration/DocumentTest.php b/tests/PHPUnit/Integration/DocumentTest.php
index 346ba633..2efcc665
--- a/tests/PHPUnit/Integration/DocumentTest.php
+++ b/tests/PHPUnit/Integration/DocumentTest.php
@@ -38,6 +38,7 @@
 use PHPUnitTests\TestCase;
 use Smalot\PdfParser\Document;
 use Smalot\PdfParser\Header;
 use Smalot\PdfParser\Page;
 use Smalot\PdfParser\Pages;
+use Smalot\PdfParser\Parser;
 use Smalot\PdfParser\PDFObject;
@@ -233,6 +234,44 @@ public function testGetPagesMissingCatalog(): void
         $document->getPages();
     }
 
+    public function testGetPagesDeduplicatesDuplicateKidsReferences(): void
+    {
+        $document = $this->getDocumentInstance();
+
+        $content = '<</Type/Page>>';
+        $header = Header::parse($content, $document);
+        $page = $this->getPageInstance($document, $header);
+
+        $content = '<</Type/Pages/Kids[10 0 R 10 0 R]>>';
+        $header = Header::parse($content, $document);
+        $pagesNode = $this->getPagesInstance($document, $header);
+
+        $content = '<</Type/Catalog/Pages 20 0 R>>';
+        $header = Header::parse($content, $document);
+        $catalog = $this->getPDFObjectInstance($document, $header);
+
+        $document->setObjects([
+            '10_0' => $page,
+            '20_0' => $pagesNode,
+            '30_0' => $catalog,
+        ]);
+
+        $pages = $document->getPages();
+
+        $this->assertCount(1, $pages);
+        $this->assertSame($page, $pages[0]);
+    }
+
+    /**
+     * @see https://github.com/smalot/pdfparser/pull/795
+     */
+    public function testGetPagesDeduplicatesDuplicateKidsFixture(): void
+    {
+        $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequestDuplicateKids.pdf');
+
+        self::assertCount(1, $document->getPages());
+    }
+
     /**
      * @see https://github.com/smalot/pdfparser/issues/721
      */