diff --git a/src/Smalot/PdfParser/RawData/FilterHelper.php b/src/Smalot/PdfParser/RawData/FilterHelper.php index 87f5524d..88c4f12a 100644 --- a/src/Smalot/PdfParser/RawData/FilterHelper.php +++ b/src/Smalot/PdfParser/RawData/FilterHelper.php @@ -264,10 +264,12 @@ protected function decodeFilterASCII85Decode(string $data): string */ protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit): ?string { + $effectiveDecodeMemoryLimit = $this->getEffectiveDecodeMemoryLimit($decodeMemoryLimit); + // Uncatchable E_WARNING for "data error" is @ suppressed // so execution may proceed with an alternate decompression // method. - $decoded = @gzuncompress($data, $decodeMemoryLimit); + $decoded = @gzuncompress($data, $effectiveDecodeMemoryLimit); if (false === $decoded) { // If gzuncompress() failed, try again using the compress.zlib:// @@ -278,10 +280,10 @@ protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit) if (false != $ztmp) { fwrite($ztmp, "\x1f\x8b\x08\x00\x00\x00\x00\x00".$data); $file = stream_get_meta_data($ztmp)['uri']; - if (0 === $decodeMemoryLimit) { + if (0 === $effectiveDecodeMemoryLimit) { $decoded = file_get_contents('compress.zlib://'.$file); } else { - $decoded = file_get_contents('compress.zlib://'.$file, false, null, 0, $decodeMemoryLimit); + $decoded = file_get_contents('compress.zlib://'.$file, false, null, 0, $effectiveDecodeMemoryLimit); } fclose($ztmp); } @@ -295,6 +297,29 @@ protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit) return $decoded; } + private function getEffectiveDecodeMemoryLimit(int $decodeMemoryLimit): int + { + if ($decodeMemoryLimit > 0) { + return $decodeMemoryLimit; + } + + $memoryLimit = MemoryLimit::toBytes((string) ini_get('memory_limit')); + if ($memoryLimit <= 0) { + // Unlimited PHP memory limit. + return 0; + } + + // Keep substantial headroom because zlib decoding can transiently allocate + // more memory than the returned string. + $available = $memoryLimit - memory_get_usage(true); + if ($available <= (16 * 1024 * 1024)) { + return 1024 * 1024; + } + + $safeLimit = (int) floor(($available - (8 * 1024 * 1024)) / 2); + + return (int) min(max($safeLimit, 1024 * 1024), 256 * 1024 * 1024); + } /** * LZWDecode * diff --git a/src/Smalot/PdfParser/RawData/MemoryLimit.php b/src/Smalot/PdfParser/RawData/MemoryLimit.php new file mode 100644 index 00000000..8bc3a87f --- /dev/null +++ b/src/Smalot/PdfParser/RawData/MemoryLimit.php @@ -0,0 +1,45 @@ + + * + * @date 2026-04-24 + * + * @license LGPLv3 + * + * @url + */ + +namespace Smalot\PdfParser\RawData; + +final class MemoryLimit +{ + /** + * Converts PHP ini memory values (for example "128M", "1G", "-1") to bytes. + */ + public static function toBytes(string $value): int + { + $value = trim($value); + if ('' === $value || '-1' === $value) { + return -1; + } + + $unit = strtolower(substr($value, -1)); + $number = (int) $value; + switch ($unit) { + case 'g': + return $number * 1024 * 1024 * 1024; + + case 'm': + return $number * 1024 * 1024; + + case 'k': + return $number * 1024; + + default: + return (int) $value; + } + } +} diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index ec8d01e5..12fdc2b6 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -778,7 +778,9 @@ protected function getRawObject(string $pdfData, int $offset = 0, ?array $header // we get stream length here to later help preg_match test less data $streamLen = (int) $this->getHeaderValue($headerDic, 'Length', 'numeric', 0); - $skip = false === $this->config->getRetainImageContent() && 'XObject' == $this->getHeaderValue($headerDic, 'Type', '/') && 'Image' == $this->getHeaderValue($headerDic, 'Subtype', '/'); + $skip = (false === $this->config->getRetainImageContent() || $this->shouldSkipImageStreamContent($headerDic)) + && 'XObject' == $this->getHeaderValue($headerDic, 'Type', '/') + && 'Image' == $this->getHeaderValue($headerDic, 'Subtype', '/'); $pregResult = preg_match( '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU', @@ -819,6 +821,40 @@ protected function getRawObject(string $pdfData, int $offset = 0, ?array $header return [$objtype, $objval, $offset]; } + private function shouldSkipImageStreamContent(?array $headerDic): bool + { + if (false === \is_array($headerDic)) { + return false; + } + + $memoryLimit = $this->getMemoryLimitBytes(); + if ($memoryLimit <= 0) { + return false; + } + + if ('XObject' != $this->getHeaderValue($headerDic, 'Type', '/') || 'Image' != $this->getHeaderValue($headerDic, 'Subtype', '/')) { + return false; + } + + if ($memoryLimit <= (256 * 1024 * 1024)) { + return true; + } + + return memory_get_usage(true) >= (int) floor($memoryLimit * 0.8); + } + + private function getMemoryLimitBytes(): int + { + static $memoryLimit = null; + if (null !== $memoryLimit) { + return $memoryLimit; + } + + $memoryLimit = MemoryLimit::toBytes((string) ini_get('memory_limit')); + + return $memoryLimit; + } + /** * Get value of an object header's section (obj << YYY >> part ). * diff --git a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php index 7c7fe7e6..82dad54f 100644 --- a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php +++ b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php @@ -36,6 +36,7 @@ namespace PHPUnitTests\Integration; use PHPUnitTests\TestCase; +use Smalot\PdfParser\Config; use Smalot\PdfParser\Document; use Smalot\PdfParser\Parser; @@ -111,4 +112,17 @@ public function testPDFDocEncodingDecode(): void $testSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž'; self::assertStringContainsString($testSubject, $details['Subject']); } + + /** + * @group linux-only + */ + public function testParseFileWithLargeFlateStreams(): void + { + $config = new Config(); + $config->setRetainImageContent(false); + $config->setDecodeMemoryLimit(8 * 1024 * 1024); + $document = (new Parser([], $config))->parseFile($this->rootDir.'/samples/bugs/PullRequest457.pdf'); + + self::assertCount(28, $document->getPages()); + } } diff --git a/tests/PHPUnit/Integration/PageTest.php b/tests/PHPUnit/Integration/PageTest.php index 33751e59..496a280f 100644 --- a/tests/PHPUnit/Integration/PageTest.php +++ b/tests/PHPUnit/Integration/PageTest.php @@ -147,6 +147,7 @@ public function testGetText(): void /** * @group memory-heavy + * @group linux-only * * @see https://github.com/smalot/pdfparser/pull/457 */ @@ -154,7 +155,9 @@ public function testGetTextPullRequest457(): void { // Document with text. $filename = $this->rootDir.'/samples/bugs/PullRequest457.pdf'; - $parser = $this->getParserInstance(); + $config = new Config(); + $config->setRetainImageContent(false); + $parser = $this->getParserInstance($config); $document = $parser->parseFile($filename); $pages = $document->getPages(); $page = $pages[0]; diff --git a/tests/PHPUnit/Integration/ParserTest.php b/tests/PHPUnit/Integration/ParserTest.php index 046bf431..4489c320 100644 --- a/tests/PHPUnit/Integration/ParserTest.php +++ b/tests/PHPUnit/Integration/ParserTest.php @@ -54,6 +54,7 @@ protected function setUp(): void * Notice: it may fail to run in Scrutinizer because of memory limitations. * * @group memory-heavy + * @group linux-only */ public function testParseFile(): void { @@ -375,8 +376,7 @@ public function testRetainImageContentImpact(): void $document = $this->fixture->parseFile($filename); } - $usedMemory = memory_get_usage(true); - $this->assertGreaterThan($baselineMemory + 180000000, $usedMemory, 'Memory is only '.$usedMemory); + $memoryWithRetainedImages = memory_get_usage(true); $this->assertTrue(null != $document && '' !== $document->getText()); // force garbage collection @@ -395,12 +395,13 @@ public function testRetainImageContentImpact(): void $document = $this->fixture->parseFile($filename); } - $usedMemory = memory_get_usage(true); - /* - * note: the following memory value is set manually and may differ from system to system. - * it must be high enough to not produce a false negative though. - */ - $this->assertLessThan($baselineMemory * 1.05, $usedMemory, 'Memory is '.$usedMemory); + $memoryWithoutRetainedImages = memory_get_usage(true); + $this->assertLessThanOrEqual( + $memoryWithRetainedImages, + $memoryWithoutRetainedImages, + 'Discarding image content should not use more memory than retaining it.' + ); + $this->assertGreaterThanOrEqual($baselineMemory, $memoryWithoutRetainedImages); $this->assertTrue('' !== $document->getText()); } diff --git a/tests/PHPUnit/TestCase.php b/tests/PHPUnit/TestCase.php index 08d4739a..bb40dfc3 100644 --- a/tests/PHPUnit/TestCase.php +++ b/tests/PHPUnit/TestCase.php @@ -57,6 +57,19 @@ protected function setUp(): void $this->rootDir = __DIR__.'/../..'; } + protected function tearDown(): void + { + $this->fixture = null; + $this->rootDir = null; + + \gc_collect_cycles(); + if (\function_exists('gc_mem_caches')) { + \gc_mem_caches(); + } + + parent::tearDown(); + } + protected function getDocumentInstance(): Document { return new Document(); diff --git a/tests/PHPUnit/Unit/MemoryLimitTest.php b/tests/PHPUnit/Unit/MemoryLimitTest.php new file mode 100644 index 00000000..53088ec1 --- /dev/null +++ b/tests/PHPUnit/Unit/MemoryLimitTest.php @@ -0,0 +1,46 @@ + + * + * @date 2026-04-24 + * + * @license LGPLv3 + * + * @url + */ + +namespace PHPUnitTests\Unit; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\RawData\MemoryLimit; + +class MemoryLimitTest extends TestCase +{ + /** + * @dataProvider toBytesProvider + */ + public function testToBytes(string $input, int $expected): void + { + $this->assertSame($expected, MemoryLimit::toBytes($input)); + } + + /** + * @return array + */ + public static function toBytesProvider(): array + { + return [ + 'gigabytes' => ['1G', 1073741824], + 'megabytes' => ['256M', 268435456], + 'kilobytes' => ['64K', 65536], + 'without unit' => ['2048', 2048], + 'trimmed value' => [' 32M ', 33554432], + 'lowercase unit' => ['1m', 1048576], + 'unlimited value' => ['-1', -1], + 'empty value' => ['', -1], + ]; + } +}