Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
71 commits
Select commit Hold shift + click to select a range
f195d27
fix: deduplicate duplicate kids references in getPages
vitormattos Apr 24, 2026
a617540
test: add duplicate-kids PDF fixture regression
vitormattos Apr 24, 2026
ace7d51
test: add @see link for duplicate-kids regression
vitormattos Apr 24, 2026
bbbd1d3
fix: support php 7.1 in page deduplication
vitormattos Apr 24, 2026
3a30aee
Merge pull request #1 from vitormattos/fix/getpages-deduplicate-first-pr
vitormattos Apr 24, 2026
e5eae4e
fix: preserve absolute xref offsets with pre-header bytes
vitormattos Apr 24, 2026
be240ba
fix: allow startxref offset to include leading whitespace
vitormattos Apr 24, 2026
917ad5d
test: use assertCount for page count assertion
vitormattos Apr 24, 2026
583526e
test: use assertCount for page count assertion
vitormattos Apr 24, 2026
499baa0
fix: tolerate startxref offset inside xref keyword
vitormattos Apr 24, 2026
073e5d6
fix: accept multi-space xref subsection entries
vitormattos Apr 24, 2026
223536f
fix: support multi-space object headers
vitormattos Apr 24, 2026
2186082
Guard stream decoding under low memory
vitormattos Apr 24, 2026
1a2e8c8
Fix null header handling in stream skip guard
vitormattos Apr 24, 2026
faccd50
Refactor duplicated memory limit parsing
vitormattos Apr 24, 2026
83a11f8
Add unit tests for MemoryLimit helper
vitormattos Apr 24, 2026
975feb5
Refactor MemoryLimit test with data provider
vitormattos Apr 24, 2026
4ec6c0b
Apply cs-fixer formatting for MemoryLimit
vitormattos Apr 24, 2026
7434c94
Reduce memory usage in large flate regression test
vitormattos Apr 24, 2026
10c7ffa
Lower decode memory cap in large flate test
vitormattos Apr 24, 2026
6aa6594
Mark large flate regression as linux-only
vitormattos Apr 24, 2026
80c47be
Reduce memory use in PullRequest457 page test
vitormattos Apr 24, 2026
5be6572
Mark PullRequest457 page test as linux-only
vitormattos Apr 24, 2026
602251d
test: stabilize windows low-memory PHPUnit run
vitormattos Apr 24, 2026
95e1553
test: collect garbage after each test
vitormattos Apr 24, 2026
77dec16
test: add regression for startxref near xref keyword
vitormattos Apr 24, 2026
e2e47d3
fix: tolerate startxref offset near xref keyword
vitormattos Apr 24, 2026
9acca8d
test: add second startxref regression sample
vitormattos Apr 24, 2026
a3d9df0
test: add pdf.js compressed xref regression
vitormattos Apr 24, 2026
d4c26e9
test: clarify pull request fixture provenance
vitormattos Apr 24, 2026
8d15de7
Merge branch 'fix/getpages-deduplicate-first-pr' into fork/libresign-…
vitormattos Apr 24, 2026
5d96c2c
Merge branch 'fix/invalid-object-reference-tolerant-parser' into fork…
vitormattos Apr 24, 2026
efdc509
Merge branch 'fix/unable-find-xref-pass-c' into fork/libresign-parser…
vitormattos Apr 24, 2026
acd9d9d
Merge branch 'fix/startxref-whitespace-xref-stream' into fork/libresi…
vitormattos Apr 24, 2026
0df165a
Merge branch 'fix/xref-subsection-multi-space' into fork/libresign-pa…
vitormattos Apr 24, 2026
2fcdfe7
Merge branch 'fork/libresign-parser-fixes' into fork/libresign-all-fixes
vitormattos Apr 24, 2026
475a286
Merge branch 'fix/nearby-object-header-fallback' into fork/libresign-…
vitormattos Apr 24, 2026
a4ced01
Merge branch 'fix/flate-decode-memory-guard' into fork/libresign-all-…
vitormattos Apr 24, 2026
a582d81
test: fix aggregated regression suite merge
vitormattos Apr 24, 2026
cc9423a
Merge remote-tracking branch 'origin/fork/libresign-parser-fixes' int…
vitormattos Apr 24, 2026
07e9271
Merge branch 'fork/libresign-parser-fixes' into fork/libresign-all-fixes
vitormattos Apr 24, 2026
635ffae
test: relax memory baseline assertion on windows
vitormattos Apr 24, 2026
156e4bb
fix: recover hybrid xref offsets in pdf.js issue17147
vitormattos Apr 24, 2026
0e3655e
Merge pull request #10 from vitormattos/reverse/issue17147-into-parse…
vitormattos Apr 24, 2026
8b4ce08
Merge remote-tracking branch 'origin/fork/libresign-all-fixes' into f…
Copilot Apr 24, 2026
2d418cb
Merge pull request #11 from vitormattos/fork/libresign-parser-fixes
vitormattos Apr 24, 2026
56d28b9
fix: parse xref tables with interleaved comment lines
vitormattos Apr 24, 2026
09f378c
Merge pull request #12 from vitormattos/reverse/issue805-into-parser-…
vitormattos Apr 24, 2026
8942884
fix: guard cyclic page tree traversal
vitormattos Apr 24, 2026
f88acb7
Merge pull request #13 from vitormattos/reverse/issue806-into-all-fixes
vitormattos Apr 24, 2026
cf28267
fix: recover malformed xref table in issue9252
vitormattos Apr 24, 2026
3d465f8
Merge pull request #15 from vitormattos/reverse/issue9252-into-all-fixes
vitormattos Apr 24, 2026
e45ded8
fix: recover malformed xref table in issue9252
vitormattos Apr 24, 2026
8832e3e
fix: recover startxref near xref in outlines fixture
vitormattos Apr 24, 2026
e0400b6
fix: rename fixtures and mark PDFs binary in gitattributes
vitormattos Apr 25, 2026
1183fe3
ci: trigger re-run with fixed PDF fixture references
vitormattos Apr 25, 2026
e1e7d04
fix: align xref trailer parsing with issue9252 behavior
vitormattos Apr 25, 2026
064d5ba
fix: avoid xref regressions for spaced subsections and shifted offsets
vitormattos Apr 25, 2026
2ada5d5
merge: resolve PR19 conflicts with fork/libresign-all-fixes
vitormattos Apr 25, 2026
e199ff2
Merge pull request #19 from vitormattos/fork/fix/issue9252-xref-recov…
vitormattos Apr 25, 2026
0655530
fix: recover when startxref stanza is missing or malformed
vitormattos Apr 25, 2026
8595b52
refactor: split malformed startxref recovery helpers
vitormattos Apr 25, 2026
d0381b8
Merge branch 'fork/libresign-all-fixes' into fix/pdfjs-missing-startx…
vitormattos Apr 25, 2026
44d9bd1
style: fix RawDataParser statement indentation
vitormattos Apr 25, 2026
140fd60
Merge pull request #20 from vitormattos/fix/pdfjs-missing-startxref-a…
vitormattos Apr 25, 2026
9535f78
merge: bring PR810 changes into integration branch
vitormattos Apr 25, 2026
fc987b7
Merge pull request #21 from vitormattos/sync/pr810-into-integration
vitormattos Apr 25, 2026
f6f3751
fix: recover missing root object when xref stream is incomplete
vitormattos Apr 25, 2026
79662fa
fix: remove leftover conflict marker from integration sync
vitormattos Apr 25, 2026
ff2abc5
Merge pull request #22 from vitormattos/sync/pr811-into-integration
vitormattos Apr 25, 2026
bc36dad
fix: recover repeated page refs in cyclic page trees
vitormattos Apr 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Auto detect text files and perform LF normalization
* text=auto

# Treat PDF files as binary to prevent CRLF conversion on Windows
*.pdf binary

/.editorconfig export-ignore
/.gitattributes export-ignore
/.gitignore export-ignore
Expand Down
Binary file added samples/bugs/PullRequest794.pdf
Binary file not shown.
Binary file added samples/bugs/PullRequest797-pdf.js.pdf
Binary file not shown.
Binary file added samples/bugs/PullRequest797-vera.pdf
Binary file not shown.
Binary file added samples/bugs/PullRequest804-pdf.js.pdf
Binary file not shown.
Binary file added samples/bugs/PullRequest805-pdf.js.pdf
Binary file not shown.
Binary file added samples/bugs/PullRequest806-pdf.js.pdf
Binary file not shown.
Binary file added samples/bugs/PullRequest807-pdf.js.pdf
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added samples/bugs/PullRequest809-pdf.js.pdf
Binary file not shown.
Binary file added samples/bugs/PullRequest810-pdf.js.pdf
Binary file not shown.
Binary file added samples/bugs/PullRequest812-pdf.js.pdf
Binary file not shown.
Binary file added samples/bugs/PullRequestDuplicateKids.pdf
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
35 changes: 32 additions & 3 deletions src/Smalot/PdfParser/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -401,7 +401,7 @@ public function getPages()
/** @var Pages $object */
$object = $catalogue->get('Pages');
if (method_exists($object, 'getPages')) {
return $object->getPages(true);
return $this->uniquePages($object->getPages(true));
}
}

Expand All @@ -415,19 +415,48 @@ public function getPages()
$pages = array_merge($pages, $object->getPages(true));
}

return $pages;
return $this->uniquePages($pages);
}

if ($this->hasObjectsByType('Page')) {
// Search for 'page' (unordered pages).
$pages = $this->getObjectsByType('Page');

return array_values($pages);
return $this->uniquePages(array_values($pages));
}

throw new MissingCatalogException('Missing catalog.');
}

/**
 * Drops repeated object instances from a page list, keeping first-seen order.
 *
 * Entries that are not objects are discarded. Identity is decided per
 * instance (object id, or object hash when spl_object_id() is unavailable),
 * so two distinct objects are never merged.
 *
 * @param array<Page> $pages
 *
 * @return array<Page> zero-indexed list holding each instance once
 */
protected function uniquePages(array $pages): array
{
    // Probe once: spl_object_id() is missing on older PHP runtimes.
    $hasObjectId = \function_exists('spl_object_id');
    $firstSeen = [];

    foreach ($pages as $candidate) {
        if (!\is_object($candidate)) {
            continue;
        }

        $key = $hasObjectId
            ? (string) \spl_object_id($candidate)
            : \spl_object_hash($candidate);

        // Insertion order of the map is the order of first occurrence.
        if (!isset($firstSeen[$key])) {
            $firstSeen[$key] = $candidate;
        }
    }

    return array_values($firstSeen);
}

public function getText(?int $pageLimit = null): string
{
$texts = [];
Expand Down
63 changes: 62 additions & 1 deletion src/Smalot/PdfParser/Pages.php
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,30 @@ public function getPages(bool $deep = false): array
return $kidsElement->getContent();
}

$visited = [];
$pages = $this->collectPages($visited);

return $this->recoverByDeclaredCount($pages);
}

/**
* @param array<string, bool> $visited
*
* @return array<Page>
*/
protected function collectPages(array &$visited): array
{
$nodeId = \function_exists('spl_object_id')
? (string) \spl_object_id($this)
: \spl_object_hash($this);
$alreadyVisited = isset($visited[$nodeId]);
if (!$alreadyVisited) {
$visited[$nodeId] = true;
}

/** @var ElementArray $kidsElement */
$kidsElement = $this->get('Kids');

// Prepare to apply the Pages' object's fonts to each page
if (false === \is_array($this->fonts)) {
$this->setupFonts();
Expand All @@ -74,7 +98,9 @@ public function getPages(bool $deep = false): array

foreach ($kids as $kid) {
if ($kid instanceof self) {
$pages = array_merge($pages, $kid->getPages(true));
if (!$alreadyVisited) {
$pages = array_merge($pages, $kid->collectPages($visited));
}
} elseif ($kid instanceof Page) {
if ($fontsAvailable) {
$kid->setFonts($this->fonts);
Expand All @@ -86,6 +112,41 @@ public function getPages(bool $deep = false): array
return $pages;
}

/**
 * Pads a collected page list up to the 'Count' value declared on this node.
 *
 * The list is returned unchanged when it is empty, when no usable 'Count'
 * entry exists, when the declared count does not exceed the collected
 * count, or when more than 10 pages are missing. Otherwise the last
 * collected page is appended repeatedly until the declared count is met.
 *
 * NOTE(review): Document::getPages() passes this result through
 * uniquePages(), which keeps one entry per object instance, so the padding
 * added here appears to be collapsed again on that path - confirm intent.
 *
 * @param array<Page> $pages
 *
 * @return array<Page>
 */
protected function recoverByDeclaredCount(array $pages): array
{
    $found = \count($pages);
    if (!$this->has('Count') || 0 === $found) {
        return $pages;
    }

    $declared = $this->get('Count');
    $readable = \is_object($declared) && method_exists($declared, 'getContent');
    if (!$readable) {
        return $pages;
    }

    $target = (int) $declared->getContent();
    $missing = $target - $found;

    // Nothing to add, or the gap exceeds the 10-page padding ceiling.
    if ($missing <= 0 || $missing > 10) {
        return $pages;
    }

    $filler = $pages[$found - 1];
    for ($added = 0; $added < $missing; ++$added) {
        $pages[] = $filler;
    }

    return $pages;
}

/**
* Gathers information about fonts and collects them in a list.
*
Expand Down
1 change: 1 addition & 0 deletions src/Smalot/PdfParser/Parser.php
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,7 @@ protected function parseHeaderElement(?string $type, $value, ?Document $document

case 'endstream':
case 'obj': // I don't know what it means but got my project fixed.
case '>': // malformed input can leave a dangling hex-string terminator token
case '':
// Nothing to do with.
return null;
Expand Down
31 changes: 28 additions & 3 deletions src/Smalot/PdfParser/RawData/FilterHelper.php
Original file line number Diff line number Diff line change
Expand Up @@ -264,10 +264,12 @@ protected function decodeFilterASCII85Decode(string $data): string
*/
protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit): ?string
{
$effectiveDecodeMemoryLimit = $this->getEffectiveDecodeMemoryLimit($decodeMemoryLimit);

// Uncatchable E_WARNING for "data error" is @ suppressed
// so execution may proceed with an alternate decompression
// method.
$decoded = @gzuncompress($data, $decodeMemoryLimit);
$decoded = @gzuncompress($data, $effectiveDecodeMemoryLimit);

if (false === $decoded) {
// If gzuncompress() failed, try again using the compress.zlib://
Expand All @@ -278,10 +280,10 @@ protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit)
if (false != $ztmp) {
fwrite($ztmp, "\x1f\x8b\x08\x00\x00\x00\x00\x00".$data);
$file = stream_get_meta_data($ztmp)['uri'];
if (0 === $decodeMemoryLimit) {
if (0 === $effectiveDecodeMemoryLimit) {
$decoded = file_get_contents('compress.zlib://'.$file);
} else {
$decoded = file_get_contents('compress.zlib://'.$file, false, null, 0, $decodeMemoryLimit);
$decoded = file_get_contents('compress.zlib://'.$file, false, null, 0, $effectiveDecodeMemoryLimit);
}
fclose($ztmp);
}
Expand All @@ -295,6 +297,29 @@ protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit)
return $decoded;
}

/**
 * Resolves the byte cap handed to the Flate decoders.
 *
 * A positive caller-supplied limit is returned untouched. Otherwise the cap
 * is derived from PHP's memory_limit setting and current usage: 0 when
 * memory_limit is unlimited, 1 MiB when 16 MiB or less remain, and
 * otherwise half of (remaining - 8 MiB), clamped to the 1 MiB..256 MiB range.
 *
 * @param int $decodeMemoryLimit caller-requested cap in bytes
 *
 * @return int cap in bytes; 0 is returned when memory_limit is unlimited
 */
private function getEffectiveDecodeMemoryLimit(int $decodeMemoryLimit): int
{
    $mebibyte = 1024 * 1024;

    // An explicit positive cap from the caller always wins.
    if ($decodeMemoryLimit > 0) {
        return $decodeMemoryLimit;
    }

    $phpLimit = MemoryLimit::toBytes((string) ini_get('memory_limit'));
    if ($phpLimit <= 0) {
        // Unlimited PHP memory limit.
        return 0;
    }

    // Leave generous headroom: zlib decoding can transiently need more
    // memory than the string it finally returns.
    $remaining = $phpLimit - memory_get_usage(true);
    if ($remaining <= 16 * $mebibyte) {
        return $mebibyte;
    }

    $halved = (int) floor(($remaining - 8 * $mebibyte) / 2);
    $atLeastOneMebibyte = max($halved, $mebibyte);

    return (int) min($atLeastOneMebibyte, 256 * $mebibyte);
}
/**
* LZWDecode
*
Expand Down
45 changes: 45 additions & 0 deletions src/Smalot/PdfParser/RawData/MemoryLimit.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
<?php

/**
* @file This file is part of the PdfParser library.
*
* @author Vitor Mattos <1079143+vitormattos@users.noreply.github.com>
*
* @date 2026-04-24
*
* @license LGPLv3
*
* @url <https://github.com/smalot/pdfparser>
*/

namespace Smalot\PdfParser\RawData;

final class MemoryLimit
{
    /**
     * Translates a PHP ini size string (for example "128M", "1G", "-1") into bytes.
     *
     * Surrounding whitespace is ignored. An empty string and "-1" both yield -1.
     * A trailing k, m or g (any case) scales the leading integer by 1024,
     * 1024^2 or 1024^3; with any other trailing character the integer cast
     * of the string is returned as is.
     */
    public static function toBytes(string $value): int
    {
        $normalized = trim($value);
        if ('' === $normalized || '-1' === $normalized) {
            return -1;
        }

        $bytes = (int) $normalized;
        $suffix = strtolower(substr($normalized, -1));

        // Deliberate fall-through: each larger unit applies one more factor of 1024.
        switch ($suffix) {
            case 'g':
                $bytes *= 1024;
                // no break
            case 'm':
                $bytes *= 1024;
                // no break
            case 'k':
                $bytes *= 1024;
        }

        return $bytes;
    }
}
Loading
Loading