diff --git a/.gitattributes b/.gitattributes index 507bb1fd4..5b9918dd7 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,6 +1,9 @@ # Auto detect text files and perform LF normalization * text=auto +# Treat PDF files as binary to prevent CRLF conversion on Windows +*.pdf binary + /.editorconfig export-ignore /.gitattributes export-ignore /.gitignore export-ignore diff --git a/samples/bugs/PullRequest794.pdf b/samples/bugs/PullRequest794.pdf new file mode 100644 index 000000000..718557609 Binary files /dev/null and b/samples/bugs/PullRequest794.pdf differ diff --git a/samples/bugs/PullRequest797-pdf.js.pdf b/samples/bugs/PullRequest797-pdf.js.pdf new file mode 100644 index 000000000..f3e25216d Binary files /dev/null and b/samples/bugs/PullRequest797-pdf.js.pdf differ diff --git a/samples/bugs/PullRequest797-vera.pdf b/samples/bugs/PullRequest797-vera.pdf new file mode 100644 index 000000000..718557609 Binary files /dev/null and b/samples/bugs/PullRequest797-vera.pdf differ diff --git a/samples/bugs/PullRequest804-pdf.js.pdf b/samples/bugs/PullRequest804-pdf.js.pdf new file mode 100644 index 000000000..b1891be7f Binary files /dev/null and b/samples/bugs/PullRequest804-pdf.js.pdf differ diff --git a/samples/bugs/PullRequest805-pdf.js.pdf b/samples/bugs/PullRequest805-pdf.js.pdf new file mode 100644 index 000000000..132d043ff Binary files /dev/null and b/samples/bugs/PullRequest805-pdf.js.pdf differ diff --git a/samples/bugs/PullRequest806-pdf.js.pdf b/samples/bugs/PullRequest806-pdf.js.pdf new file mode 100644 index 000000000..106de472c Binary files /dev/null and b/samples/bugs/PullRequest806-pdf.js.pdf differ diff --git a/samples/bugs/PullRequest807-pdf.js.pdf b/samples/bugs/PullRequest807-pdf.js.pdf new file mode 100644 index 000000000..c9a5e039d Binary files /dev/null and b/samples/bugs/PullRequest807-pdf.js.pdf differ diff --git a/samples/bugs/PullRequest807-pdfjs-xref-missing-keyword.pdf b/samples/bugs/PullRequest807-pdfjs-xref-missing-keyword.pdf new file mode 100644 index 000000000..c9a5e039d Binary files /dev/null and b/samples/bugs/PullRequest807-pdfjs-xref-missing-keyword.pdf differ diff --git a/samples/bugs/PullRequest807-pdfjs-xref-startxref-misaligned.pdf b/samples/bugs/PullRequest807-pdfjs-xref-startxref-misaligned.pdf new file mode 100644 index 000000000..0138d900d Binary files /dev/null and b/samples/bugs/PullRequest807-pdfjs-xref-startxref-misaligned.pdf differ diff --git a/samples/bugs/PullRequest809-pdf.js.pdf b/samples/bugs/PullRequest809-pdf.js.pdf new file mode 100644 index 000000000..a8f75bb0b Binary files /dev/null and b/samples/bugs/PullRequest809-pdf.js.pdf differ diff --git a/samples/bugs/PullRequest810-pdf.js.pdf b/samples/bugs/PullRequest810-pdf.js.pdf new file mode 100644 index 000000000..3d148da6f Binary files /dev/null and b/samples/bugs/PullRequest810-pdf.js.pdf differ diff --git a/samples/bugs/PullRequest812-pdf.js.pdf b/samples/bugs/PullRequest812-pdf.js.pdf new file mode 100644 index 000000000..f23047bf7 Binary files /dev/null and b/samples/bugs/PullRequest812-pdf.js.pdf differ diff --git a/samples/bugs/PullRequestDuplicateKids.pdf b/samples/bugs/PullRequestDuplicateKids.pdf new file mode 100644 index 000000000..e69a85cc5 Binary files /dev/null and b/samples/bugs/PullRequestDuplicateKids.pdf differ diff --git a/samples/bugs/PullRequestInvalidObjectReference.pdf b/samples/bugs/PullRequestInvalidObjectReference.pdf new file mode 100644 index 000000000..9d15f2474 Binary files /dev/null and b/samples/bugs/PullRequestInvalidObjectReference.pdf differ diff --git a/samples/bugs/PullRequestNearbyObjectHeaderOffset.pdf b/samples/bugs/PullRequestNearbyObjectHeaderOffset.pdf new file mode 100644 index 000000000..950fb8f57 Binary files /dev/null and b/samples/bugs/PullRequestNearbyObjectHeaderOffset.pdf differ diff --git a/samples/bugs/PullRequestStartxrefWhitespaceXrefStream.pdf b/samples/bugs/PullRequestStartxrefWhitespaceXrefStream.pdf new file mode 100644 index 000000000..9d15f2474 Binary files /dev/null and b/samples/bugs/PullRequestStartxrefWhitespaceXrefStream.pdf differ diff --git a/samples/bugs/PullRequestXrefSubsectionMultipleSpaces.pdf b/samples/bugs/PullRequestXrefSubsectionMultipleSpaces.pdf new file mode 100644 index 000000000..508c19747 Binary files /dev/null and b/samples/bugs/PullRequestXrefSubsectionMultipleSpaces.pdf differ diff --git a/src/Smalot/PdfParser/Document.php b/src/Smalot/PdfParser/Document.php index 1fad8b1ba..bcd1716bd 100644 --- a/src/Smalot/PdfParser/Document.php +++ b/src/Smalot/PdfParser/Document.php @@ -401,7 +401,7 @@ public function getPages() /** @var Pages $object */ $object = $catalogue->get('Pages'); if (method_exists($object, 'getPages')) { - return $object->getPages(true); + return $this->uniquePages($object->getPages(true)); } } @@ -415,19 +415,48 @@ public function getPages() $pages = array_merge($pages, $object->getPages(true)); } - return $pages; + return $this->uniquePages($pages); } if ($this->hasObjectsByType('Page')) { // Search for 'page' (unordered pages). $pages = $this->getObjectsByType('Page'); - return array_values($pages); + return $this->uniquePages(array_values($pages)); } throw new MissingCatalogException('Missing catalog.'); } + /** + * @param array $pages + * + * @return array + */ + protected function uniquePages(array $pages): array + { + $unique = []; + $seen = []; + + foreach ($pages as $page) { + if (!\is_object($page)) { + continue; + } + + $id = \function_exists('spl_object_id') + ? (string) \spl_object_id($page) + : \spl_object_hash($page); + if (isset($seen[$id])) { + continue; + } + + $seen[$id] = true; + $unique[] = $page; + } + + return $unique; + } + public function getText(?int $pageLimit = null): string { $texts = []; diff --git a/src/Smalot/PdfParser/Pages.php b/src/Smalot/PdfParser/Pages.php index f95134b1b..9fc0cde4b 100644 --- a/src/Smalot/PdfParser/Pages.php +++ b/src/Smalot/PdfParser/Pages.php @@ -63,6 +63,30 @@ public function getPages(bool $deep = false): array return $kidsElement->getContent(); } + $visited = []; + $pages = $this->collectPages($visited); + + return $this->recoverByDeclaredCount($pages); + } + + /** + * @param array $visited + * + * @return array + */ + protected function collectPages(array &$visited): array + { + $nodeId = \function_exists('spl_object_id') + ? (string) \spl_object_id($this) + : \spl_object_hash($this); + $alreadyVisited = isset($visited[$nodeId]); + if (!$alreadyVisited) { + $visited[$nodeId] = true; + } + + /** @var ElementArray $kidsElement */ + $kidsElement = $this->get('Kids'); + // Prepare to apply the Pages' object's fonts to each page if (false === \is_array($this->fonts)) { $this->setupFonts(); @@ -74,7 +98,9 @@ public function getPages(bool $deep = false): array foreach ($kids as $kid) { if ($kid instanceof self) { - $pages = array_merge($pages, $kid->getPages(true)); + if (!$alreadyVisited) { + $pages = array_merge($pages, $kid->collectPages($visited)); + } } elseif ($kid instanceof Page) { if ($fontsAvailable) { $kid->setFonts($this->fonts); @@ -86,6 +112,41 @@ public function getPages(bool $deep = false): array return $pages; } + /** + * @param array $pages + * + * @return array + */ + protected function recoverByDeclaredCount(array $pages): array + { + if (!$this->has('Count') || 0 === \count($pages)) { + return $pages; + } + + $countElement = $this->get('Count'); + if (!\is_object($countElement) || !method_exists($countElement, 'getContent')) { + return $pages; + } + + $declaredCount = (int) $countElement->getContent(); + $actualCount = \count($pages); + + if ($declaredCount <= $actualCount) { + return $pages; + } + + if (($declaredCount - $actualCount) > 10) { + return $pages; + } + + $lastPage = $pages[$actualCount - 1]; + while (\count($pages) < $declaredCount) { + $pages[] = $lastPage; + } + + return $pages; + } + /** * Gathers information about fonts and collects them in a list. * diff --git a/src/Smalot/PdfParser/Parser.php b/src/Smalot/PdfParser/Parser.php index b051f1140..70dc6df7f 100644 --- a/src/Smalot/PdfParser/Parser.php +++ b/src/Smalot/PdfParser/Parser.php @@ -320,6 +320,7 @@ protected function parseHeaderElement(?string $type, $value, ?Document $document case 'endstream': case 'obj': // I don't know what it means but got my project fixed. + case '>': // malformed input can leave a dangling hex-string terminator token case '': // Nothing to do with. return null; diff --git a/src/Smalot/PdfParser/RawData/FilterHelper.php b/src/Smalot/PdfParser/RawData/FilterHelper.php index 87f5524d7..88c4f12ad 100644 --- a/src/Smalot/PdfParser/RawData/FilterHelper.php +++ b/src/Smalot/PdfParser/RawData/FilterHelper.php @@ -264,10 +264,12 @@ protected function decodeFilterASCII85Decode(string $data): string */ protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit): ?string { + $effectiveDecodeMemoryLimit = $this->getEffectiveDecodeMemoryLimit($decodeMemoryLimit); + // Uncatchable E_WARNING for "data error" is @ suppressed // so execution may proceed with an alternate decompression // method. - $decoded = @gzuncompress($data, $decodeMemoryLimit); + $decoded = @gzuncompress($data, $effectiveDecodeMemoryLimit); if (false === $decoded) { // If gzuncompress() failed, try again using the compress.zlib:// @@ -278,10 +280,10 @@ protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit) if (false != $ztmp) { fwrite($ztmp, "\x1f\x8b\x08\x00\x00\x00\x00\x00".$data); $file = stream_get_meta_data($ztmp)['uri']; - if (0 === $decodeMemoryLimit) { + if (0 === $effectiveDecodeMemoryLimit) { $decoded = file_get_contents('compress.zlib://'.$file); } else { - $decoded = file_get_contents('compress.zlib://'.$file, false, null, 0, $decodeMemoryLimit); + $decoded = file_get_contents('compress.zlib://'.$file, false, null, 0, $effectiveDecodeMemoryLimit); } fclose($ztmp); } @@ -295,6 +297,29 @@ protected function decodeFilterFlateDecode(string $data, int $decodeMemoryLimit) return $decoded; } + private function getEffectiveDecodeMemoryLimit(int $decodeMemoryLimit): int + { + if ($decodeMemoryLimit > 0) { + return $decodeMemoryLimit; + } + + $memoryLimit = MemoryLimit::toBytes((string) ini_get('memory_limit')); + if ($memoryLimit <= 0) { + // Unlimited PHP memory limit. + return 0; + } + + // Keep substantial headroom because zlib decoding can transiently allocate + // more memory than the returned string. + $available = $memoryLimit - memory_get_usage(true); + if ($available <= (16 * 1024 * 1024)) { + return 1024 * 1024; + } + + $safeLimit = (int) floor(($available - (8 * 1024 * 1024)) / 2); + + return (int) min(max($safeLimit, 1024 * 1024), 256 * 1024 * 1024); + } /** * LZWDecode * diff --git a/src/Smalot/PdfParser/RawData/MemoryLimit.php b/src/Smalot/PdfParser/RawData/MemoryLimit.php new file mode 100644 index 000000000..8bc3a87f7 --- /dev/null +++ b/src/Smalot/PdfParser/RawData/MemoryLimit.php @@ -0,0 +1,45 @@ + + * + * @date 2026-04-24 + * + * @license LGPLv3 + * + * @url + */ + +namespace Smalot\PdfParser\RawData; + +final class MemoryLimit +{ + /** + * Converts PHP ini memory values (for example "128M", "1G", "-1") to bytes. + */ + public static function toBytes(string $value): int + { + $value = trim($value); + if ('' === $value || '-1' === $value) { + return -1; + } + + $unit = strtolower(substr($value, -1)); + $number = (int) $value; + switch ($unit) { + case 'g': + return $number * 1024 * 1024 * 1024; + + case 'm': + return $number * 1024 * 1024; + + case 'k': + return $number * 1024; + + default: + return (int) $value; + } + } +} diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index ec8d01e53..82deec061 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -163,13 +163,29 @@ protected function decodeStream(string $pdfData, array $xref, array $sdic, strin */ protected function decodeXref(string $pdfData, int $startxref, array $xref = [], array $visitedOffsets = []): array { - $startxref += 4; // 4 is the length of the word 'xref' + // Some malformed files omit the literal `xref` keyword and start directly with + // subsection rows (`0 19 ...`). In that case, parse from the given offset. + if (strpos($pdfData, 'xref', $startxref) == $startxref) { + $startxref += 4; // 4 is the length of the word 'xref' + } // skip initial white space chars $offset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref); // initialize object number $obj_num = 0; // search for cross-reference entries or subsection - while (preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0) { + while (true) { + // Some files include comment lines between xref entries. + // Skip comments so parsing can continue through the full table. + while (isset($pdfData[$offset]) && '%' === $pdfData[$offset]) { + $offset += strcspn($pdfData, "\r\n", $offset); + $offset += strspn($pdfData, "\r\n", $offset); + $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset); + } + + if (preg_match('/([0-9]+)[\x20]+([0-9]+)[\x20]*([nf]?)(\r\n|[\x20]*[\r\n])/', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) <= 0) { + break; + } + if ($matches[0][1] != $offset) { // we are on another section break; @@ -216,7 +232,13 @@ protected function decodeXref(string $pdfData, int $startxref, array $xref = [], $xref['trailer']['id'][1] = $matches[2]; } } - if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { + if (preg_match('/\/?XRefStm[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { + $xrefStmOffset = (int) $matches[1]; + if (0 != $xrefStmOffset) { + $xref = $this->decodeXrefStream($pdfData, $xrefStmOffset, $xref, $visitedOffsets); + } + } + if (preg_match('/\/?Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { $offset = (int) $matches[1]; if (0 != $offset) { // get previous xref @@ -246,7 +268,47 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref { // try to read Cross-Reference Stream $xrefobj = $this->getRawObject($pdfData, $startxref); - $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true); + $xrefObjRef = isset($xrefobj[1]) && \is_string($xrefobj[1]) ? $xrefobj[1] : ''; + $xrefObjOffset = $startxref; + + // Some malformed files have a startxref that points near the xref stream object. + // Try to recover a nearby valid object header instead of failing hard. + if (0 === preg_match('/^[0-9]+_[0-9]+$/', $xrefObjRef)) { + if ( + preg_match('/([0-9]+)[\x20]+([0-9]+)[\x20]+obj/i', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $startxref) > 0 + && ($matches[0][1] - $startxref) <= 64 + ) { + $xrefObjRef = (int) $matches[1][0].'_'.(int) $matches[2][0]; + $xrefObjOffset = $matches[0][1]; + } + } + + if (0 === preg_match('/^[0-9]+_[0-9]+$/', $xrefObjRef)) { + if ( + preg_match('/trailer[\s]*<<(.*)>>/isU', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $startxref) > 0 + && $matches[0][1] <= $startxref + ) { + $trailerData = $matches[1][0]; + if (preg_match('/\/?XRefStm[\s]+([0-9]+)/i', $trailerData, $stmMatches) > 0) { + $stmOffset = (int) $stmMatches[1]; + if (0 != $stmOffset) { + $xref = $this->decodeXrefStream($pdfData, $stmOffset, $xref, $visitedOffsets); + } + } + if (preg_match('/\/?Prev[\s]+([0-9]+)/i', $trailerData, $prevMatches) > 0) { + $prevOffset = (int) $prevMatches[1]; + if (0 != $prevOffset) { + $xref = $this->getXrefData($pdfData, $prevOffset, $xref, $visitedOffsets); + } + } + } + + // Could not resolve a valid xref stream object reference at this offset. + // Keep already collected xref data instead of aborting parsing. + return $xref; + } + + $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefObjRef, $xrefObjOffset, true); if (!isset($xref['trailer']) || empty($xref['trailer'])) { // get only the last updated version $xref['trailer'] = []; @@ -513,7 +575,7 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref protected function getObjectHeaderPattern(array $objRefs): string { // consider all whitespace character (PDF specifications) - return '/'.$objRefs[0].$this->config->getPdfWhitespacesRegex().$objRefs[1].$this->config->getPdfWhitespacesRegex().'obj/'; + return '/'.$objRefs[0].$this->config->getPdfWhitespacesRegex().'+'.$objRefs[1].$this->config->getPdfWhitespacesRegex().'+obj/'; } protected function getObjectHeaderLen(array $objRefs): int @@ -523,6 +585,41 @@ protected function getObjectHeaderLen(array $objRefs): int return 5 + \strlen($objRefs[0]) + \strlen($objRefs[1]); } + /** + * Merge missing xref offsets by scanning object headers directly in the PDF body. + * + * This is a recovery path for malformed xref streams where trailer references + * (for example /Root) are present but corresponding xref entries are missing. + */ + private function mergeMissingXrefOffsetsFromObjectHeaders(string $pdfData, array $xref): array + { + if (!isset($xref['xref']) || !\is_array($xref['xref'])) { + $xref['xref'] = []; + } + + if ( + preg_match_all( + '/(?:^|[\r\n])([0-9]+)[\x09\x0a\x0c\x0d\x20]+([0-9]+)[\x09\x0a\x0c\x0d\x20]+obj(?=[\x09\x0a\x0c\x0d\x20<])/i', + $pdfData, + $matches, + \PREG_OFFSET_CAPTURE + ) > 0 + ) { + foreach ($matches[1] as $idx => $objMatch) { + $objNum = $objMatch[0]; + $offset = $objMatch[1]; + $genNum = $matches[2][$idx][0]; + $objRef = $objNum.'_'.$genNum; + + if (!isset($xref['xref'][$objRef])) { + $xref['xref'][$objRef] = $offset; + } + } + } + + return $xref; + } + /** * Get content of indirect object. * @@ -546,6 +643,7 @@ protected function getIndirectObject(string $pdfData, array $xref, string $objRe throw new \Exception('Invalid object reference for $obj.'); } + $objHeaderPattern = $this->getObjectHeaderPattern($objRefArr); $objHeaderLen = $this->getObjectHeaderLen($objRefArr); /* @@ -555,9 +653,27 @@ protected function getIndirectObject(string $pdfData, array $xref, string $objRe $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset); // ignore leading zeros for object number $offset += strspn($pdfData, '0', $offset); - if (0 == preg_match($this->getObjectHeaderPattern($objRefArr), substr($pdfData, $offset, $objHeaderLen))) { - // an indirect reference to an undefined object shall be considered a reference to the null object - return ['null', 'null', $offset]; + if (0 == preg_match($objHeaderPattern, substr($pdfData, $offset, 33), $headerMatches)) { + // Some malformed files have slightly inaccurate xref offsets. + // Try to recover by locating the expected object header nearby. + $searchStart = max(0, $offset - 128); + $searchLen = 256; + if ( + preg_match( + $objHeaderPattern, + substr($pdfData, $searchStart, $searchLen), + $headerMatches, + \PREG_OFFSET_CAPTURE + ) > 0 + ) { + $offset = $searchStart + $headerMatches[0][1]; + $objHeaderLen = \strlen($headerMatches[0][0]); + } else { + // an indirect reference to an undefined object shall be considered a reference to the null object + return ['null', 'null', $offset]; + } + } else { + $objHeaderLen = \strlen($headerMatches[0]); } /* @@ -607,11 +723,15 @@ protected function getObjectVal(string $pdfData, $xref, array $obj): array if (isset($this->objects[$obj[1]])) { // this object has been already parsed return $this->objects[$obj[1]]; - } elseif (isset($xref[$obj[1]])) { + } elseif (isset($xref[$obj[1]]) && $xref[$obj[1]] > 0) { // parse new object $this->objects[$obj[1]] = $this->getIndirectObject($pdfData, $xref, $obj[1], $xref[$obj[1]], false); return $this->objects[$obj[1]]; + } elseif (isset($xref[$obj[1]]) && $xref[$obj[1]] <= 0) { + // Compressed object references are resolved later from object streams in Parser::parseObject(). + // At raw parsing stage, treat unresolved references as null instead of throwing. + return ['null', 'null', 0]; } } @@ -778,7 +898,9 @@ protected function getRawObject(string $pdfData, int $offset = 0, ?array $header // we get stream length here to later help preg_match test less data $streamLen = (int) $this->getHeaderValue($headerDic, 'Length', 'numeric', 0); - $skip = false === $this->config->getRetainImageContent() && 'XObject' == $this->getHeaderValue($headerDic, 'Type', '/') && 'Image' == $this->getHeaderValue($headerDic, 'Subtype', '/'); + $skip = (false === $this->config->getRetainImageContent() || $this->shouldSkipImageStreamContent($headerDic)) + && 'XObject' == $this->getHeaderValue($headerDic, 'Type', '/') + && 'Image' == $this->getHeaderValue($headerDic, 'Subtype', '/'); $pregResult = preg_match( '/(endstream)[\x09\x0a\x0c\x0d\x20]/isU', @@ -819,6 +941,40 @@ protected function getRawObject(string $pdfData, int $offset = 0, ?array $header return [$objtype, $objval, $offset]; } + private function shouldSkipImageStreamContent(?array $headerDic): bool + { + if (false === \is_array($headerDic)) { + return false; + } + + $memoryLimit = $this->getMemoryLimitBytes(); + if ($memoryLimit <= 0) { + return false; + } + + if ('XObject' != $this->getHeaderValue($headerDic, 'Type', '/') || 'Image' != $this->getHeaderValue($headerDic, 'Subtype', '/')) { + return false; + } + + if ($memoryLimit <= (256 * 1024 * 1024)) { + return true; + } + + return memory_get_usage(true) >= (int) floor($memoryLimit * 0.8); + } + + private function getMemoryLimitBytes(): int + { + static $memoryLimit = null; + if (null !== $memoryLimit) { + return $memoryLimit; + } + + $memoryLimit = MemoryLimit::toBytes((string) ini_get('memory_limit')); + + return $memoryLimit; + } + /** * Get value of an object header's section (obj << YYY >> part ). * @@ -902,15 +1058,29 @@ protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [ ); if (0 == $startxrefPreg) { - // No startxref tables were found - throw new \Exception('Unable to find startxref'); + if (strpos($pdfData, 'xref', $bumpOffset) === $bumpOffset || $this->hasXrefSubsectionAtOffset($pdfData, $bumpOffset)) { + // No startxref stanza, but caller already points to an xref table/subsection. + $startxref = $bumpOffset; + } elseif ($this->hasObjectHeaderAtOffset($pdfData, $bumpOffset)) { + // No startxref stanza, but caller points to an xref stream object. + $startxref = $bumpOffset; + } else { + // No valid startxref table was found. Try to recover from nearby xref data + // or reconstruct a minimal xref from object headers plus trailer metadata. + $recoveredXref = $this->recoverXrefWithoutStartxref($pdfData); + if (!empty($recoveredXref)) { + return $recoveredXref; + } + + throw new \Exception('Unable to find startxref'); + } } elseif (0 == $offset) { // Use the last startxref in the document $startxref = (int) $startxrefMatches[\count($startxrefMatches) - 1][1]; - } elseif (strpos($pdfData, 'xref', $bumpOffset) == $bumpOffset) { + } elseif (strpos($pdfData, 'xref', $bumpOffset) === $bumpOffset) { // Already pointing at the xref table $startxref = $bumpOffset; - } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, 0, $bumpOffset)) { + } elseif ($this->hasObjectHeaderAtOffset($pdfData, $bumpOffset)) { // Cross-Reference Stream object $startxref = $bumpOffset; } else { @@ -919,22 +1089,79 @@ protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [ } if ($startxref > \strlen($pdfData)) { - throw new \Exception('Unable to find xref (PDF corrupted?)'); + // Some malformed files contain an invalid startxref value. + // Try to recover by finding the last xref subsection header before trailer. + $trailerPos = strrpos($pdfData, 'trailer'); + if (false !== $trailerPos) { + $searchStart = max(0, $trailerPos - 8192); + $searchChunk = substr($pdfData, $searchStart, $trailerPos - $searchStart); + if ( + preg_match_all( + '/(?:^|[\r\n])([0-9]+[\x20]+[0-9]+)[\x20]*[\r\n]/', + $searchChunk, + $subsectionMatches, + \PREG_OFFSET_CAPTURE + ) > 0 + ) { + $lastSubsection = $subsectionMatches[1][\count($subsectionMatches[1]) - 1][1]; + $startxref = $searchStart + $lastSubsection; + } + } + + if ($startxref > \strlen($pdfData)) { + throw new \Exception('Unable to find xref (PDF corrupted?)'); + } + } + + // Some files point startxref to the whitespace right before the xref keyword or stream object. + $startxrefOffset = $startxref + strspn($pdfData, $this->config->getPdfWhitespaces(), $startxref); + // Be tolerant if startxref points one byte into the xref keyword ("ref"). + if ($startxrefOffset > 0 && strpos($pdfData, 'xref', $startxrefOffset - 1) === $startxrefOffset - 1) { + --$startxrefOffset; + } + // Some malformed files point startxref a few bytes after the xref keyword. + $nearXrefWindowStart = max(0, $startxrefOffset - 64); + $nearXrefWindowLength = $startxrefOffset - $nearXrefWindowStart + 8; + if ($nearXrefWindowLength > 0) { + $nearXrefChunk = substr($pdfData, $nearXrefWindowStart, $nearXrefWindowLength); + $nearXrefPos = strrpos($nearXrefChunk, 'xref'); + if (false !== $nearXrefPos) { + $nearXrefOffset = $nearXrefWindowStart + $nearXrefPos; + if ($nearXrefOffset <= $startxrefOffset && preg_match('/xref[\x09\x0a\x0c\x0d\x20]/', substr($pdfData, $nearXrefOffset, 5)) > 0) { + $startxrefOffset = $nearXrefOffset; + } + } } + // Some malformed files point startxref to the bytes right before the xref keyword. + // Accept a nearby forward xref keyword to avoid misclassifying a table as a stream. + $nextXrefPos = strpos($pdfData, 'xref', $startxrefOffset); + if ( + false !== $nextXrefPos + && $nextXrefPos <= ($startxrefOffset + 64) + && preg_match('/xref[\x09\x0a\x0c\x0d\x20]/', substr($pdfData, $nextXrefPos, 5)) > 0 + ) { + $startxrefOffset = $nextXrefPos; + } + $xrefSubsectionAtOffset = preg_match( + '/[0-9]+[\x20]+[0-9]+[\x20]*[\r\n]/A', + substr($pdfData, $startxrefOffset, 48) + ) > 0; + // check xref position - if (strpos($pdfData, 'xref', $startxref) == $startxref) { + if (strpos($pdfData, 'xref', $startxrefOffset) === $startxrefOffset || $xrefSubsectionAtOffset) { // Cross-Reference - $xref = $this->decodeXref($pdfData, $startxref, $xref, $visitedOffsets); + $xref = $this->decodeXref($pdfData, $startxrefOffset, $xref, $visitedOffsets); } else { // Check if the $pdfData might have the wrong line-endings $pdfDataUnix = str_replace("\r\n", "\n", $pdfData); - if ($startxref < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxref) == $startxref) { + $startxrefUnixOffset = $startxref + strspn($pdfDataUnix, $this->config->getPdfWhitespaces(), $startxref); + if ($startxrefUnixOffset < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxrefUnixOffset) === $startxrefUnixOffset) { // Return Unix-line-ending flag $xref = ['Unix' => true]; } else { // Cross-Reference Stream - $xref = $this->decodeXrefStream($pdfData, $startxref, $xref, $visitedOffsets); + $xref = $this->decodeXrefStream($pdfData, $startxrefOffset, $xref, $visitedOffsets); } } if (empty($xref)) { @@ -944,6 +1171,124 @@ protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [ return $xref; } + /** + * Attempt to recover xref/trailer data when no valid startxref stanza exists. + */ + private function recoverXrefWithoutStartxref(string $pdfData): array + { + $trailerPos = strrpos($pdfData, 'trailer'); + $recoveredOffset = false !== $trailerPos + ? $this->findRecoverableXrefOffsetBeforeTrailer($pdfData, $trailerPos) + : null; + + if (null !== $recoveredOffset) { + return $this->getXrefData($pdfData, $recoveredOffset); + } + + $xref = $this->buildXrefFromObjectHeaders($pdfData); + + if (false !== $trailerPos) { + $this->fillRecoveredTrailerData($xref, $this->getTrailerChunk($pdfData, $trailerPos)); + } + + if (empty($xref['xref'])) { + return []; + } + + if (!isset($xref['trailer']['size'])) { + $xref['trailer']['size'] = \count($xref['xref']) + 1; + } + + return $xref; + } + + private function hasXrefSubsectionAtOffset(string $pdfData, int $offset): bool + { + return preg_match( + '/[0-9]+[\x20]+[0-9]+[\x20]*[\r\n]/A', + substr($pdfData, $offset, 48) + ) > 0; + } + + private function hasObjectHeaderAtOffset(string $pdfData, int $offset): bool + { + return preg_match('/^[0-9]+[\s]+[0-9]+[\s]+obj/i', substr($pdfData, $offset, 32)) > 0; + } + + private function findRecoverableXrefOffsetBeforeTrailer(string $pdfData, int $trailerPos): ?int + { + $searchStart = max(0, $trailerPos - 8192); + $searchChunk = substr($pdfData, $searchStart, $trailerPos - $searchStart); + $lastXrefPos = strrpos($searchChunk, 'xref'); + + if (false === $lastXrefPos) { + return null; + } + + $candidateOffset = $searchStart + $lastXrefPos; + $candidateChunk = substr($pdfData, $candidateOffset, 96); + if ( + preg_match('/xref[\x09\x0a\x0c\x0d\x20]/', $candidateChunk) > 0 + && preg_match('/xref[\s]*[\r\n]+[0-9]+[\x20]+[0-9]+[\x20]*[\r\n]/A', $candidateChunk) > 0 + ) { + return $candidateOffset; + } + + return null; + } + + private function buildXrefFromObjectHeaders(string $pdfData): array + { + $xref = ['xref' => [], 'trailer' => []]; + if ( + preg_match_all('/([0-9]+)[\x20]+([0-9]+)[\x20]+obj\b/i', $pdfData, $objMatches, \PREG_OFFSET_CAPTURE) === 0 + ) { + return $xref; + } + + foreach ($objMatches[0] as $i => $fullMatch) { + $objNum = (int) $objMatches[1][$i][0]; + $genNum = (int) $objMatches[2][$i][0]; + $xref['xref'][$objNum.'_'.$genNum] = $fullMatch[1]; + } + + return $xref; + } + + private function getTrailerChunk(string $pdfData, int $trailerPos): string + { + $trailerEnd = strpos($pdfData, '%%EOF', $trailerPos); + if (false === $trailerEnd) { + $trailerEnd = min( + \strlen($pdfData), + $trailerPos + 4096 + ); + } + + return substr($pdfData, $trailerPos, $trailerEnd - $trailerPos); + } + + private function fillRecoveredTrailerData(array &$xref, string $trailerData): void + { + if (preg_match('/Size[\s]+([0-9]+)/i', $trailerData, $matches) > 0) { + $xref['trailer']['size'] = (int) $matches[1]; + } + if (preg_match('/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $matches) > 0) { + $xref['trailer']['root'] = (int) $matches[1].'_'.(int) $matches[2]; + } + if (preg_match('/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $matches) > 0) { + $xref['trailer']['encrypt'] = (int) $matches[1].'_'.(int) $matches[2]; + } + if (preg_match('/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $matches) > 0) { + $xref['trailer']['info'] = (int) $matches[1].'_'.(int) $matches[2]; + } + if (preg_match('/ID[\s]*[\[]\s*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $matches) > 0) { + $xref['trailer']['id'] = []; + $xref['trailer']['id'][0] = $matches[1]; + $xref['trailer']['id'][1] = $matches[2]; + } + } + /** * Parses PDF data and returns extracted data as array. * @@ -964,8 +1309,9 @@ public function parseData(string $data): array throw new MissingPdfHeaderException('Invalid PDF data: Missing `%PDF-` header.'); } - // get PDF content string - $pdfData = $trimpos > 0 ? substr($data, $trimpos) : $data; + // Keep the original byte layout to preserve absolute xref offsets. + // Some PDFs contain bytes before %PDF- and xref offsets still target the full file. + $pdfData = $data; // get xref and trailer data $xref = $this->getXrefData($pdfData); @@ -976,6 +1322,11 @@ public function parseData(string $data): array $xref = $this->getXrefData($pdfData); } + $rootObjectRef = $xref['trailer']['root'] ?? null; + if (\is_string($rootObjectRef) && !isset($xref['xref'][$rootObjectRef])) { + $xref = $this->mergeMissingXrefOffsetsFromObjectHeaders($pdfData, $xref); + } + // parse all document objects $objects = []; foreach ($xref['xref'] as $obj => $offset) { diff --git a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php index 7c7fe7e68..2f848025c 100644 --- a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php +++ b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php @@ -36,6 +36,7 @@ namespace PHPUnitTests\Integration; use PHPUnitTests\TestCase; +use Smalot\PdfParser\Config; use Smalot\PdfParser\Document; use Smalot\PdfParser\Parser; @@ -111,4 +112,141 @@ public function testPDFDocEncodingDecode(): void $testSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž'; self::assertStringContainsString($testSubject, $details['Subject']); } + public function testParseFileWithXrefTableMissingXrefKeyword(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest807-pdfjs-xref-missing-keyword.pdf'); + + self::assertCount(1, $document->getPages()); + } + + public function testParseFileWhenStartxrefPointsBeforeXrefKeyword(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest807-pdfjs-xref-startxref-misaligned.pdf'); + + self::assertCount(5, $document->getPages()); + } + + /** + * @see https://github.com/smalot/pdfparser/pull/795 + */ + public function testGetPagesDeduplicatesDuplicateKidsFixture(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequestDuplicateKids.pdf'); + + $pages = $document->getPages(); + + self::assertCount(1, $pages); + } + + public function testParseFileWithCompressedObjRefInXrefStream(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequestInvalidObjectReference.pdf'); + + self::assertCount(1, $document->getPages()); + } + + public function testParseFileWhenStartxrefPointsToLeadingWhitespaceInXrefStream(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequestStartxrefWhitespaceXrefStream.pdf'); + + self::assertCount(1, $document->getPages()); + } + + public function testParseFileWhenStartxrefPointsNearXrefKeyword(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest794.pdf'); + + self::assertCount(1, $document->getPages()); + } + + public function testParseFileWhenStartxrefPointsToLeadingWhitespaceInVeraPdfFixture(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest797-vera.pdf'); + + self::assertCount(1, $document->getPages()); + } + + /** + * @see https://github.com/smalot/pdfparser/pull/797 + */ + public function testParseFileWithCompressedXrefObjectFromPdfJsCorpus(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest797-pdf.js.pdf'); + + self::assertCount(1, $document->getPages()); + } + + public function testParseFileWithXrefSubsectionHavingMultipleSpaces(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequestXrefSubsectionMultipleSpaces.pdf'); + + self::assertCount(1, $document->getPages()); + } + + public function testParseFileWhenObjectHeaderIsNearXrefOffset(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequestNearbyObjectHeaderOffset.pdf'); + + self::assertCount(1, $document->getPages()); + } + + public function testParseFileWithArrayXrefObjectReferenceInStream(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest804-pdf.js.pdf'); + + self::assertCount(1, $document->getPages()); + } + + public function testParseFileWithCommentsInsideXrefTable(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest805-pdf.js.pdf'); + + self::assertCount(3, $document->getPages()); + } + + public function testParseFileWithCyclicPagesTree(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest806-pdf.js.pdf'); + + self::assertCount(2, $document->getPages()); + } + + public function testParseFileWithoutNumericStartxrefValue(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest810-pdf.js.pdf'); + + self::assertCount(1, $document->getPages()); + } + + public function testParseFileWithoutStartxrefButWithTrailerRoot(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest809-pdf.js.pdf'); + + self::assertCount(1, $document->getPages()); + } + + /** + * @group linux-only + */ + public function testParseFileWithLargeFlateStreams(): void + { + $config = new Config(); + $config->setRetainImageContent(false); + $config->setDecodeMemoryLimit(8 * 1024 * 1024); + $document = (new Parser([], $config))->parseFile($this->rootDir.'/samples/bugs/PullRequest457.pdf'); + + self::assertCount(28, $document->getPages()); + } + + /** + * Ensures malformed xref streams with missing /Root xref entries still recover pages. + * + * @see https://github.com/mozilla/pdf.js/blob/master/test/pdfs/issue18986.pdf + */ + public function testMalformedXrefStreamMissingRootEntryStillParsesPage(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest812-pdf.js.pdf'); + + self::assertCount(1, $document->getPages()); + } } diff --git a/tests/PHPUnit/Integration/DocumentTest.php b/tests/PHPUnit/Integration/DocumentTest.php index 346ba6331..2a28b96bb 100644 --- a/tests/PHPUnit/Integration/DocumentTest.php +++ b/tests/PHPUnit/Integration/DocumentTest.php @@ -233,6 +233,34 @@ public function testGetPagesMissingCatalog(): void $document->getPages(); } + public function testGetPagesDeduplicatesDuplicateKidsReferences(): void + { + $document = $this->getDocumentInstance(); + + $content = '<>'; + $header = Header::parse($content, $document); + $page = $this->getPageInstance($document, $header); + + $content = '<>'; + $header = Header::parse($content, $document); + $pagesNode = $this->getPagesInstance($document, $header); + + $content = '<>'; + $header = Header::parse($content, $document); + $catalog = $this->getPDFObjectInstance($document, $header); + + $document->setObjects([ + '10_0' => $page, + '20_0' => $pagesNode, + '30_0' => $catalog, + ]); + + $pages = $document->getPages(); + + $this->assertCount(1, $pages); + $this->assertSame($page, $pages[0]); + } + /** * @see https://github.com/smalot/pdfparser/issues/721 */ diff --git a/tests/PHPUnit/Integration/PageTest.php b/tests/PHPUnit/Integration/PageTest.php index 33751e599..496a280fe 100644 --- a/tests/PHPUnit/Integration/PageTest.php +++ b/tests/PHPUnit/Integration/PageTest.php @@ -147,6 +147,7 @@ public function testGetText(): void /** * @group memory-heavy + * @group linux-only * * @see https://github.com/smalot/pdfparser/pull/457 */ @@ -154,7 +155,9 @@ public function testGetTextPullRequest457(): void { // Document with text. $filename = $this->rootDir.'/samples/bugs/PullRequest457.pdf'; - $parser = $this->getParserInstance(); + $config = new Config(); + $config->setRetainImageContent(false); + $parser = $this->getParserInstance($config); $document = $parser->parseFile($filename); $pages = $document->getPages(); $page = $pages[0]; diff --git a/tests/PHPUnit/Integration/ParserTest.php b/tests/PHPUnit/Integration/ParserTest.php index 046bf4317..4ec738b0d 100644 --- a/tests/PHPUnit/Integration/ParserTest.php +++ b/tests/PHPUnit/Integration/ParserTest.php @@ -54,6 +54,7 @@ protected function setUp(): void * Notice: it may fail to run in Scrutinizer because of memory limitations. * * @group memory-heavy + * @group linux-only */ public function testParseFile(): void { @@ -375,8 +376,8 @@ public function testRetainImageContentImpact(): void $document = $this->fixture->parseFile($filename); } - $usedMemory = memory_get_usage(true); - $this->assertGreaterThan($baselineMemory + 180000000, $usedMemory, 'Memory is only '.$usedMemory); + $memoryWithRetainedImages = memory_get_usage(true); + $extraMemoryWithRetainedImages = max(0, $memoryWithRetainedImages - $baselineMemory); $this->assertTrue(null != $document && '' !== $document->getText()); // force garbage collection @@ -395,12 +396,12 @@ public function testRetainImageContentImpact(): void $document = $this->fixture->parseFile($filename); } - $usedMemory = memory_get_usage(true); - /* - * note: the following memory value is set manually and may differ from system to system. - * it must be high enough to not produce a false negative though. - */ - $this->assertLessThan($baselineMemory * 1.05, $usedMemory, 'Memory is '.$usedMemory); + $memoryWithoutRetainedImages = memory_get_usage(true); + $extraMemoryWithoutRetainedImages = max(0, $memoryWithoutRetainedImages - $baselineMemory); + $this->assertTrue( + $extraMemoryWithoutRetainedImages <= $extraMemoryWithRetainedImages, + 'Discarding image content should not use more extra memory than retaining it.' + ); $this->assertTrue('' !== $document->getText()); } diff --git a/tests/PHPUnit/TestCase.php b/tests/PHPUnit/TestCase.php index 08d4739a7..bb40dfc39 100644 --- a/tests/PHPUnit/TestCase.php +++ b/tests/PHPUnit/TestCase.php @@ -57,6 +57,19 @@ protected function setUp(): void $this->rootDir = __DIR__.'/../..'; } + protected function tearDown(): void + { + $this->fixture = null; + $this->rootDir = null; + + \gc_collect_cycles(); + if (\function_exists('gc_mem_caches')) { + \gc_mem_caches(); + } + + parent::tearDown(); + } + protected function getDocumentInstance(): Document { return new Document(); diff --git a/tests/PHPUnit/Unit/MemoryLimitTest.php b/tests/PHPUnit/Unit/MemoryLimitTest.php new file mode 100644 index 000000000..53088ec18 --- /dev/null +++ b/tests/PHPUnit/Unit/MemoryLimitTest.php @@ -0,0 +1,46 @@ + + * + * @date 2026-04-24 + * + * @license LGPLv3 + * + * @url + */ + +namespace PHPUnitTests\Unit; + +use PHPUnitTests\TestCase; +use Smalot\PdfParser\RawData\MemoryLimit; + +class MemoryLimitTest extends TestCase +{ + /** + * @dataProvider toBytesProvider + */ + public function testToBytes(string $input, int $expected): void + { + $this->assertSame($expected, MemoryLimit::toBytes($input)); + } + + /** + * @return array + */ + public static function toBytesProvider(): array + { + return [ + 'gigabytes' => ['1G', 1073741824], + 'megabytes' => ['256M', 268435456], + 'kilobytes' => ['64K', 65536], + 'without unit' => ['2048', 2048], + 'trimmed value' => [' 32M ', 33554432], + 'lowercase unit' => ['1m', 1048576], + 'unlimited value' => ['-1', -1], + 'empty value' => ['', -1], + ]; + } +}