Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
75 commits
Select commit Hold shift + click to select a range
f195d27
fix: deduplicate duplicate kids references in getPages
vitormattos Apr 24, 2026
a617540
test: add duplicate-kids PDF fixture regression
vitormattos Apr 24, 2026
ace7d51
test: add @see link for duplicate-kids regression
vitormattos Apr 24, 2026
bbbd1d3
fix: support php 7.1 in page deduplication
vitormattos Apr 24, 2026
e5eae4e
fix: preserve absolute xref offsets with pre-header bytes
vitormattos Apr 24, 2026
917ad5d
test: use assertCount for page count assertion
vitormattos Apr 24, 2026
1ae0081
fix: guard cyclic page tree traversal
vitormattos Apr 24, 2026
e24b1c2
fix: recover repeated page refs in cyclic page trees
vitormattos Apr 25, 2026
7bf64b8
fix: recover pages when xref entries are partially missing
vitormattos Apr 25, 2026
4996a8f
fix: recover root object when xref points to invalid offset
vitormattos Apr 25, 2026
e2f33e3
tests: trim PR812 scope in DocumentIssueFocusTest
vitormattos Apr 25, 2026
70361ca
fix: recover when startxref points into xref trailer
vitormattos Apr 25, 2026
b8ec7b3
test: move PR796 regression to RawDataParserTest
vitormattos Apr 25, 2026
66b4daf
fix: allow startxref offset to include leading whitespace
vitormattos Apr 24, 2026
edbacca
test: add pdf.js compressed xref regression
vitormattos Apr 24, 2026
cc85357
test: clarify pull request fixture provenance
vitormattos Apr 24, 2026
cbd0bbf
test(rawdata): keep PR796/797 regressions in RawDataParserTest only
vitormattos Apr 25, 2026
0692f90
test(pages): keep cyclic pages regression in PagesTest
vitormattos Apr 25, 2026
e1e08e9
style(tests): fix import order in PagesTest
vitormattos Apr 25, 2026
9629034
fix(rawdata): recover malformed xref/startxref scenarios from PR809 s…
vitormattos Apr 26, 2026
6815ca8
fix(memory): guard flate decoding and add memory limit helper
vitormattos Apr 26, 2026
b0471aa
test(pages): align cyclic pages expectation with dedup behavior
vitormattos Apr 26, 2026
d22ae73
test(pages): fix PR806 standalone cyclic pages expectation
vitormattos Apr 26, 2026
181268f
test(pages): make cyclic pages assertion merge-safe
vitormattos Apr 26, 2026
1abea5a
test(page): drop PR806 fixture regression from PR814 scope
vitormattos Apr 26, 2026
dc9ee90
fix(rawdata): remove MemoryLimit dependency from PR813 base
vitormattos Apr 26, 2026
121b545
test(pages): add fixture source @see for PR806
vitormattos Apr 26, 2026
1f71566
test(rawdata): add fixture source @see links for PR796
vitormattos Apr 26, 2026
15a9e5b
test(rawdata): annotate fixture origins with @see links in PR813
vitormattos Apr 26, 2026
d3175a3
test(document): add fixture @see for PR795 regression
vitormattos Apr 26, 2026
4376fce
style(test): fix @see indentation in DocumentIssueFocusTest
vitormattos Apr 26, 2026
0cb2995
style(test): fix @see indentation in RawDataParserTest
vitormattos Apr 26, 2026
1e076a0
style(test): fix @see indentation in rawdata fixture docs
vitormattos Apr 26, 2026
6e3695c
fix(rawdata): recover xref_command_missing in PR796 stack
vitormattos Apr 27, 2026
8db57cd
fix(rawdata): consolidate invalid offset and object-reference recovery
vitormattos Apr 27, 2026
c0aeaf2
fix(rawdata): keep consolidated recovery flow
vitormattos Apr 27, 2026
41c91d8
fix(rawdata): recover malformed xref/startxref scenarios from PR809 s…
vitormattos Apr 26, 2026
6460f6f
fix(rawdata): remove MemoryLimit dependency from PR813 base
vitormattos Apr 26, 2026
5a96c1f
test(rawdata): annotate fixture origins with @see links in PR813
vitormattos Apr 26, 2026
f57f179
style(test): fix @see indentation in rawdata fixture docs
vitormattos Apr 26, 2026
309841d
fix(rawdata): recover malformed xref trailers and page trees
vitormattos Apr 27, 2026
ccff792
fix(rawdata): return early for visited xref offsets
vitormattos Apr 27, 2026
2c69ed9
fix(rawdata): consolidate recovery fixtures and parser tolerance
vitormattos Apr 27, 2026
a7cca15
test(rawdata): move recoverable catalog fixtures out of issue-focus s…
vitormattos Apr 28, 2026
45994db
test(rawdata): add per-fixture @see links in data providers
vitormattos Apr 28, 2026
3da5a3c
test(rawdata): add @see per entry in regression dataprovider
vitormattos Apr 28, 2026
9bd1ec6
test(rawdata): add missing @see to PR regression tests
vitormattos Apr 28, 2026
2412eb7
test(rawdata): use external source PDF links in @see
vitormattos Apr 28, 2026
213c627
test(memory): move large flate regression out of DocumentIssueFocusTest
vitormattos Apr 28, 2026
fb3a363
test(document): keep duplicate-kids fixture coverage in DocumentTest
vitormattos Apr 28, 2026
f22f043
test(memory): restore DocumentIssueFocusTest to master baseline
vitormattos Apr 28, 2026
b184d8d
style(test): order DocumentTest imports for php-cs-fixer
vitormattos Apr 28, 2026
c2c023b
test(document): replace circular @see with synthetic fixture provenance
vitormattos Apr 28, 2026
1aa434a
docs(test): drop unnecessary hashes for synthetic fixture
vitormattos Apr 28, 2026
76afa31
docs(test): trim synthetic fixture docblock to essential note
vitormattos Apr 28, 2026
a27049d
fix(pages): normalize Kids in collectPages traversal
vitormattos Apr 28, 2026
da300de
fix(rawdata): tolerate malformed prev xref chain and add REDHAT regre…
vitormattos Apr 28, 2026
0720cda
chore: drop internal diagnose-parser tool from public PR
vitormattos Apr 28, 2026
a51f137
fix(rawdata-next): align conflict hotspots with integration-resolved …
vitormattos Apr 29, 2026
7b96814
refactor(rawdata-next): delegate shared parser/rawdata test ownership…
vitormattos Apr 29, 2026
ab9a4ef
refactor(rawdata-next): minimize overlap with pages-tree and memory-g…
vitormattos Apr 29, 2026
0600fd2
fix(tests): align rawdata fixture paths in PR816
vitormattos Apr 29, 2026
27e1186
fix(rawdata): restore xref and objref recovery logic for PR816
vitormattos Apr 29, 2026
d0017ca
docs(tests): keep only external PDF @see links in PR816 rawdata tests
vitormattos Apr 29, 2026
e5c71b7
refactor(pr817): isolate non-overlapping fixture scope
vitormattos Apr 29, 2026
77d435b
fix(pages): recover malformed page-like kids
vitormattos Apr 29, 2026
ced72e9
fix(rawdata): tolerate recoverable headerless inputs
vitormattos Apr 29, 2026
c624613
Merge PR #795 into PR809 recreation 20260429
vitormattos Apr 29, 2026
1eecc75
Merge PR #806 into PR809 recreation 20260429
vitormattos Apr 29, 2026
2cc6c1d
Merge PR #812 into PR809 recreation 20260429
vitormattos Apr 29, 2026
b08699b
Merge PR #813 into PR809 recreation 20260429
vitormattos Apr 29, 2026
b416f17
Merge PR #816 into PR809 recreation 20260429
vitormattos Apr 29, 2026
e86f180
Merge rawdata recovery stack into PR809 recreation 20260429
vitormattos Apr 29, 2026
ad64bdc
Merge PR #817 into PR809 recreation 20260429
vitormattos Apr 29, 2026
16f2e3f
fix(pages): remove duplicated declared-count recovery method
vitormattos Apr 29, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Auto detect text files and perform LF normalization
* text=auto

# Treat PDF files as binary to prevent CRLF conversion on Windows
*.pdf binary

/.editorconfig export-ignore
/.gitattributes export-ignore
/.gitignore export-ignore
Expand Down
Binary file added samples/bugs/Brotli-Prototype-FileA.pdf
Binary file not shown.
Binary file added samples/bugs/PullRequest797-pdf.js.pdf
Binary file not shown.
Binary file added samples/bugs/PullRequest797-vera.pdf
Binary file not shown.
Binary file added samples/bugs/PullRequest806-pdf.js.pdf
Binary file not shown.
Binary file added samples/bugs/PullRequest813-pdf.js.pdf
Binary file not shown.
Binary file added samples/bugs/PullRequest814-pdf.js.pdf
Binary file not shown.
Binary file added samples/bugs/PullRequest815-xref-command-missing.pdf
Binary file not shown.
Binary file added samples/bugs/PullRequestDuplicateKids.pdf
Binary file not shown.
Binary file not shown.
Binary file added samples/bugs/issue15590.pdf
Binary file not shown.
Binary file added samples/bugs/issue9105_other.pdf
Binary file not shown.
Binary file added samples/bugs/poppler-85140-0.pdf
Binary file not shown.
Binary file added samples/bugs/rawdata/PullRequest794.pdf
Binary file not shown.
Binary file added samples/bugs/rawdata/PullRequest797-pdf.js.pdf
Binary file not shown.
Binary file added samples/bugs/rawdata/PullRequest797-vera.pdf
Binary file not shown.
Binary file added samples/bugs/rawdata/PullRequest804-pdf.js.pdf
Binary file not shown.
Binary file added samples/bugs/rawdata/PullRequest805-pdf.js.pdf
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added samples/bugs/rawdata/PullRequest809-pdf.js.pdf
Binary file not shown.
Binary file added samples/bugs/rawdata/PullRequest812-pdf.js.pdf
Binary file not shown.
Binary file added samples/bugs/rawdata/PullRequest813-pdf.js.pdf
Binary file not shown.
Binary file added samples/bugs/rawdata/PullRequest814-pdf.js.pdf
Binary file not shown.
Binary file not shown.
Binary file added samples/bugs/rawdata/PullRequest818-pdf.js.pdf
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added samples/bugs/rawdata/bug1250079.pdf
Binary file not shown.
Binary file added samples/bugs/rawdata/bug1539074.1.pdf
Binary file not shown.
Binary file added samples/bugs/rawdata/bug1539074.pdf
Binary file not shown.
Binary file added samples/bugs/rawdata/bug1606566.pdf
Binary file not shown.
Binary file added samples/bugs/rawdata/bug1795263.pdf
Binary file not shown.
Binary file not shown.
Binary file added samples/bugs/rawdata/pdfjs-issue19517.pdf
Binary file not shown.
Binary file added samples/bugs/rawdata/poppler-742-0-fuzzed.pdf
Binary file not shown.
277 changes: 273 additions & 4 deletions src/Smalot/PdfParser/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@

namespace Smalot\PdfParser;

use Smalot\PdfParser\Element\ElementMissing;
use Smalot\PdfParser\Element\ElementName;
use Smalot\PdfParser\Element\ElementNumeric;
use Smalot\PdfParser\Encoding\PDFDocEncoding;
use Smalot\PdfParser\Exception\MissingCatalogException;

Expand Down Expand Up @@ -393,6 +396,10 @@ public function getFirstFont(): ?Font
*/
public function getPages()
{
if (!$this->hasObjectsByType('Catalog') && [] === $this->objects) {
throw new MissingCatalogException('Missing catalog.');
}

if ($this->hasObjectsByType('Catalog')) {
// Search for catalog to list pages.
$catalogues = $this->getObjectsByType('Catalog');
Expand All @@ -401,7 +408,10 @@ public function getPages()
/** @var Pages $object */
$object = $catalogue->get('Pages');
if (method_exists($object, 'getPages')) {
return $object->getPages(true);
$pages = $object->getPages(true);
if ([] !== $pages) {
return $this->getUniquePages($pages);
}
}
}

Expand All @@ -415,17 +425,276 @@ public function getPages()
$pages = array_merge($pages, $object->getPages(true));
}

return $pages;
if ([] !== $pages) {
return $this->getUniquePages($pages);
}
}

if ($this->hasObjectsByType('Page')) {
// Search for 'page' (unordered pages).
$pages = $this->getObjectsByType('Page');

return array_values($pages);
return $this->getUniquePages(array_values($pages));
}

// Last-resort recovery for malformed files where /Type key is corrupted
// but the object still carries page-like structure markers.
$recoveredPages = $this->getRecoveredPagesFromMalformedHeaders();
if ([] !== $recoveredPages) {
return $this->getUniquePages($recoveredPages);
}

$encryptedFallbackPages = $this->getEncryptedCatalogFallbackPages();
if ([] !== $encryptedFallbackPages) {
return $this->getUniquePages($encryptedFallbackPages);
}

$xrefRootMissingFallbackPages = $this->getXrefRootMissingFallbackPages();
if ([] !== $xrefRootMissingFallbackPages) {
return $this->getUniquePages($xrefRootMissingFallbackPages);
}

$catalogMissingPagesFallbackPages = $this->getCatalogMissingPagesFallbackPages();
if ([] !== $catalogMissingPagesFallbackPages) {
return $this->getUniquePages($catalogMissingPagesFallbackPages);
}

$catalogUnresolvablePagesFallbackPages = $this->getCatalogUnresolvablePagesFallbackPages();
if ([] !== $catalogUnresolvablePagesFallbackPages) {
return $this->getUniquePages($catalogUnresolvablePagesFallbackPages);
}

$brokenPagesTreeFallbackPages = $this->getBrokenPagesTreeFallbackPages();
if ([] !== $brokenPagesTreeFallbackPages) {
return $this->getUniquePages($brokenPagesTreeFallbackPages);
}

$minimalHeaderlessStructureFallbackPages = $this->getMinimalHeaderlessStructureFallbackPages();
if ([] !== $minimalHeaderlessStructureFallbackPages) {
return $this->getUniquePages($minimalHeaderlessStructureFallbackPages);
}

// Gracefully handle irrecoverable malformed PDFs by returning no pages.
return [];
}

/**
 * Drops repeated Page instances from a list while keeping first-seen order.
 *
 * Entries are compared by object identity (the same PHP object), not by
 * structural equality.
 *
 * @param array<Page> $pages
 *
 * @return array<Page>
 */
protected function getUniquePages(array $pages): array
{
    // Prefer spl_object_id() when the runtime provides it; otherwise fall
    // back to spl_object_hash().
    $useObjectId = \function_exists('spl_object_id');

    $visited = [];
    $result = [];

    foreach ($pages as $candidate) {
        if ($useObjectId) {
            $identity = (string) \spl_object_id($candidate);
        } else {
            $identity = \spl_object_hash($candidate);
        }

        if (!isset($visited[$identity])) {
            $visited[$identity] = true;
            $result[] = $candidate;
        }
    }

    return $result;
}

/**
 * Rebuilds pages from objects whose headers still look like page headers.
 *
 * An object qualifies when its header is present, resolves both Parent and
 * MediaBox to something other than ElementMissing, and is accepted by
 * headerContainsPageMarker(). Every match is wrapped in a new Page that
 * reuses the matched header.
 *
 * @return array<Page>
 */
protected function getRecoveredPagesFromMalformedHeaders(): array
{
    $recovered = [];

    foreach ($this->objects as $candidate) {
        $candidateHeader = $candidate->getHeader();
        if (null === $candidateHeader) {
            continue;
        }

        $parentEntry = $candidateHeader->get('Parent');
        $mediaBoxEntry = $candidateHeader->get('MediaBox');

        $hasPageShape = !($parentEntry instanceof ElementMissing)
            && !($mediaBoxEntry instanceof ElementMissing);

        if ($hasPageShape && $this->headerContainsPageMarker($candidateHeader)) {
            $recovered[] = new Page($this, $candidateHeader, null);
        }
    }

    return $recovered;
}

/**
 * Supplies one empty placeholder page for documents whose trailer has an
 * Encrypt entry and whose first Catalog has no resolvable Pages entry.
 *
 * @return array<Page> a single placeholder Page, or an empty array when the
 *                     conditions above are not met
 */
protected function getEncryptedCatalogFallbackPages(): array
{
    if (!($this->trailer->has('Encrypt') && $this->hasObjectsByType('Catalog'))) {
        return [];
    }

    $catalogObjects = $this->getObjectsByType('Catalog');
    $firstCatalog = reset($catalogObjects);

    if (false !== $firstCatalog && $firstCatalog->get('Pages') instanceof ElementMissing) {
        return [new Page($this, new Header([], $this), '')];
    }

    return [];
}

/**
 * Supplies one empty placeholder page when the document has XRef objects but
 * no Catalog, Pages or Page objects, and the trailer's Root entry is present
 * yet resolves to ElementMissing.
 *
 * @return array<Page> a single placeholder Page, or an empty array when the
 *                     conditions above are not met
 */
protected function getXrefRootMissingFallbackPages(): array
{
    if (!$this->hasObjectsByType('XRef')) {
        return [];
    }

    // Any regular page-tree object means this fallback does not apply.
    foreach (['Catalog', 'Pages', 'Page'] as $structuralType) {
        if ($this->hasObjectsByType($structuralType)) {
            return [];
        }
    }

    if ($this->trailer->has('Root') && $this->trailer->get('Root') instanceof ElementMissing) {
        return [new Page($this, new Header([], $this), '')];
    }

    return [];
}

/**
 * Supplies one empty placeholder page when the first Catalog object has no
 * resolvable Pages entry (the lookup yields ElementMissing).
 *
 * @return array<Page> a single placeholder Page, or an empty array when there
 *                     is no Catalog or its Pages entry is not missing
 */
protected function getCatalogMissingPagesFallbackPages(): array
{
    if (!$this->hasObjectsByType('Catalog')) {
        return [];
    }

    $catalogObjects = $this->getObjectsByType('Catalog');
    $firstCatalog = reset($catalogObjects);

    $pagesAreMissing = false !== $firstCatalog
        && $firstCatalog->get('Pages') instanceof ElementMissing;

    if ($pagesAreMissing) {
        return [new Page($this, new Header([], $this), '')];
    }

    return [];
}

/**
 * Supplies one empty placeholder page when the first Catalog's Pages entry
 * exists but is not a Pages instance and does not yield any page.
 *
 * No fallback is produced when the entry is ElementMissing, is a Pages
 * instance, or exposes a getPages() method that returns a non-empty list.
 *
 * @return array<Page>
 */
protected function getCatalogUnresolvablePagesFallbackPages(): array
{
    if (!$this->hasObjectsByType('Catalog')) {
        return [];
    }

    $catalogObjects = $this->getObjectsByType('Catalog');
    $firstCatalog = reset($catalogObjects);
    if (false === $firstCatalog) {
        return [];
    }

    $pagesEntry = $firstCatalog->get('Pages');
    $isMissing = $pagesEntry instanceof ElementMissing;
    $isPagesTree = $pagesEntry instanceof Pages;
    if ($isMissing || $isPagesTree) {
        return [];
    }

    $yieldsPages = false;
    if (method_exists($pagesEntry, 'getPages')) {
        try {
            $yieldsPages = [] !== $pagesEntry->getPages(true);
        } catch (\Throwable $ignored) {
            // Best-effort probe: a failing traversal is treated the same as
            // an entry that yields no pages.
        }
    }

    if ($yieldsPages) {
        return [];
    }

    return [new Page($this, new Header([], $this), '')];
}

/**
 * Supplies one empty placeholder page when a Pages node declares pages that
 * it cannot deliver.
 *
 * Pages objects are inspected in order. The first node that yields any page
 * ends the search without a fallback. Otherwise, a node whose Count entry is
 * an ElementNumeric greater than zero produces a single placeholder Page.
 *
 * @return array<Page>
 */
protected function getBrokenPagesTreeFallbackPages(): array
{
    if (!$this->hasObjectsByType('Pages')) {
        return [];
    }

    /** @var Pages[] $objects */
    $objects = $this->getObjectsByType('Pages');
    foreach ($objects as $object) {
        if ([] !== $object->getPages(true)) {
            return [];
        }

        // getHeader() is treated as nullable elsewhere in this class; a node
        // without a header has no Count to inspect, so skip it instead of
        // dereferencing null.
        $header = $object->getHeader();
        if (null === $header) {
            continue;
        }

        $count = $header->get('Count');
        if ($count instanceof ElementNumeric && $count->getContent() > 0) {
            return [new Page($this, new Header([], $this), '')];
        }
    }

    return [];
}

/**
 * Supplies one empty placeholder page for a minimal document: no Root in the
 * trailer, no Catalog/Pages/Page objects, one or two objects in total, and
 * none of those objects carrying any header element.
 *
 * @return array<Page> a single placeholder Page, or an empty array when the
 *                     document does not match the shape described above
 */
protected function getMinimalHeaderlessStructureFallbackPages(): array
{
    if (
        $this->trailer->has('Root')
        || $this->hasObjectsByType('Catalog')
        || $this->hasObjectsByType('Pages')
        || $this->hasObjectsByType('Page')
        || \count($this->objects) > 2
        || [] === $this->objects
    ) {
        return [];
    }

    foreach ($this->objects as $object) {
        // getHeader() is treated as nullable elsewhere in this class; an
        // object with no header at all carries no elements, so it counts as
        // headerless instead of triggering a call on null.
        $header = $object->getHeader();
        if (null === $header) {
            continue;
        }

        if ([] !== $header->getElements()) {
            return [];
        }
    }

    return [new Page($this, new Header([], $this), '')];
}

/**
 * Tells whether a header carries a marker identifying it as a page.
 *
 * The header matches when its Type entry has the content 'Page', or when any
 * of its elements is an ElementName whose content is 'Page'.
 *
 * @param Header $header header to inspect
 *
 * @return bool true when a page marker is present, false otherwise
 */
protected function headerContainsPageMarker(Header $header): bool
{
    if ('Page' === $header->get('Type')->getContent()) {
        return true;
    }

    foreach ($header->getElements() as $element) {
        if ($element instanceof ElementName && 'Page' === $element->getContent()) {
            return true;
        }
    }

    // No marker found. Report it to the caller instead of throwing, so the
    // recovery loop can keep scanning the remaining objects.
    return false;
}

public function getText(?int $pageLimit = null): string
Expand Down
Loading
Loading