From e57c28e98944ed62b78f388d9d9f3850582afc4b Mon Sep 17 00:00:00 2001 From: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> Date: Thu, 23 Apr 2026 22:41:50 -0300 Subject: [PATCH 1/4] fix: deduplicate duplicate kids references in getPages Signed-off-by: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> --- src/Smalot/PdfParser/Document.php | 33 ++++++++++++++++++++-- tests/PHPUnit/Integration/DocumentTest.php | 28 ++++++++++++++++++ 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/src/Smalot/PdfParser/Document.php b/src/Smalot/PdfParser/Document.php index 1fad8b1ba..80d6f2197 100644 --- a/src/Smalot/PdfParser/Document.php +++ b/src/Smalot/PdfParser/Document.php @@ -401,7 +401,7 @@ public function getPages() /** @var Pages $object */ $object = $catalogue->get('Pages'); if (method_exists($object, 'getPages')) { - return $object->getPages(true); + return $this->uniquePages($object->getPages(true)); } } @@ -415,19 +415,46 @@ public function getPages() $pages = array_merge($pages, $object->getPages(true)); } - return $pages; + return $this->uniquePages($pages); } if ($this->hasObjectsByType('Page')) { // Search for 'page' (unordered pages). $pages = $this->getObjectsByType('Page'); - return array_values($pages); + return $this->uniquePages(array_values($pages)); } throw new MissingCatalogException('Missing catalog.'); } + /** + * @param array $pages + * + * @return array + */ + protected function uniquePages(array $pages): array + { + $unique = []; + $seen = []; + + foreach ($pages as $page) { + if (!\is_object($page)) { + continue; + } + + $id = spl_object_id($page); + if (isset($seen[$id])) { + continue; + } + + $seen[$id] = true; + $unique[] = $page; + } + + return $unique; + } + public function getText(?int $pageLimit = null): string { $texts = []; diff --git a/tests/PHPUnit/Integration/DocumentTest.php b/tests/PHPUnit/Integration/DocumentTest.php index 346ba6331..2a28b96bb 100644 --- a/tests/PHPUnit/Integration/DocumentTest.php +++ b/tests/PHPUnit/Integration/DocumentTest.php @@ -233,6 +233,34 @@ public function testGetPagesMissingCatalog(): void $document->getPages(); } + public function testGetPagesDeduplicatesDuplicateKidsReferences(): void + { + $document = $this->getDocumentInstance(); + + $content = '<>'; + $header = Header::parse($content, $document); + $page = $this->getPageInstance($document, $header); + + $content = '<>'; + $header = Header::parse($content, $document); + $pagesNode = $this->getPagesInstance($document, $header); + + $content = '<>'; + $header = Header::parse($content, $document); + $catalog = $this->getPDFObjectInstance($document, $header); + + $document->setObjects([ + '10_0' => $page, + '20_0' => $pagesNode, + '30_0' => $catalog, + ]); + + $pages = $document->getPages(); + + $this->assertCount(1, $pages); + $this->assertSame($page, $pages[0]); + } + /** * @see https://github.com/smalot/pdfparser/issues/721 */ From 0f84b753040d43d0b7fa1cf8660014a27b8f43e2 Mon Sep 17 00:00:00 2001 From: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> Date: Thu, 23 Apr 2026 22:45:34 -0300 Subject: [PATCH 2/4] test: add duplicate-kids PDF fixture regression Signed-off-by: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> --- samples/bugs/PullRequestDuplicateKids.pdf | Bin 0 -> 437 bytes tests/PHPUnit/Integration/DocumentTest.php | 10 ++++++++++ 2 files changed, 10 insertions(+) create mode 100644 samples/bugs/PullRequestDuplicateKids.pdf diff --git a/samples/bugs/PullRequestDuplicateKids.pdf b/samples/bugs/PullRequestDuplicateKids.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e69a85cc51ff6192dda85df6f1f783f82bf59f90 GIT binary patch literal 437 zcmZXRO-}1zoAdB&7Ld{GCLgiw?bIJ&sfgE#C+BmCum}~~$Ofa|hc%T+_1E=@ zPX>N{9?*5^jRYzqrhs-Q@@tOUTkfMF=GQ87_=MUTX2JBuFtemQ(N;m p;j}h`mV)iK7`W@t{TG4(*?6BIe+PFvCL0phq^~Mb6tkzf_y&#taGU@D literal 0 HcmV?d00001 diff --git a/tests/PHPUnit/Integration/DocumentTest.php b/tests/PHPUnit/Integration/DocumentTest.php index 2a28b96bb..246bed875 100644 --- a/tests/PHPUnit/Integration/DocumentTest.php +++ b/tests/PHPUnit/Integration/DocumentTest.php @@ -261,6 +261,16 @@ public function testGetPagesDeduplicatesDuplicateKidsReferences(): void $this->assertSame($page, $pages[0]); } + /** + * @see https://github.com/smalot/pdfparser/pull/795 + */ + public function testGetPagesDeduplicatesDuplicateKidsFixture(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequestDuplicateKids.pdf'); + + self::assertCount(1, $document->getPages()); + } + /** * @see https://github.com/smalot/pdfparser/issues/721 */ From 36e7d7eb5eab87dee25265e729da2811053434fc Mon Sep 17 00:00:00 2001 From: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> Date: Thu, 23 Apr 2026 22:51:24 -0300 Subject: [PATCH 3/4] fix: support php 7.1 in page deduplication Signed-off-by: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> --- src/Smalot/PdfParser/Document.php | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Smalot/PdfParser/Document.php b/src/Smalot/PdfParser/Document.php index 80d6f2197..bcd1716bd 100644 --- a/src/Smalot/PdfParser/Document.php +++ b/src/Smalot/PdfParser/Document.php @@ -443,7 +443,9 @@ protected function uniquePages(array $pages): array continue; } - $id = spl_object_id($page); + $id = \function_exists('spl_object_id') + ? (string) \spl_object_id($page) + : \spl_object_hash($page); if (isset($seen[$id])) { continue; } From f9f6aa362cf36fe457990d2a378c5d424457b42d Mon Sep 17 00:00:00 2001 From: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> Date: Sat, 25 Apr 2026 20:07:35 -0300 Subject: [PATCH 4/4] test(document): import Parser for duplicate-kids fixture regression --- tests/PHPUnit/Integration/DocumentTest.php | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/PHPUnit/Integration/DocumentTest.php b/tests/PHPUnit/Integration/DocumentTest.php index 246bed875..2efcc6655 100644 --- a/tests/PHPUnit/Integration/DocumentTest.php +++ b/tests/PHPUnit/Integration/DocumentTest.php @@ -38,6 +38,7 @@ use PHPUnitTests\TestCase; use Smalot\PdfParser\Document; use Smalot\PdfParser\Header; +use Smalot\PdfParser\Parser; use Smalot\PdfParser\Page; use Smalot\PdfParser\Pages; use Smalot\PdfParser\PDFObject;