diff --git a/samples/bugs/rawdata/PullRequestNearbyObjectHeaderOffset.pdf b/samples/bugs/rawdata/PullRequestNearbyObjectHeaderOffset.pdf new file mode 100644 index 000000000..950fb8f57 Binary files /dev/null and b/samples/bugs/rawdata/PullRequestNearbyObjectHeaderOffset.pdf differ diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index 6b1b7ea5a..7133b62ba 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -534,7 +534,7 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref protected function getObjectHeaderPattern(array $objRefs): string { // consider all whitespace character (PDF specifications) - return '/'.$objRefs[0].$this->config->getPdfWhitespacesRegex().$objRefs[1].$this->config->getPdfWhitespacesRegex().'obj/'; + return '/'.$objRefs[0].$this->config->getPdfWhitespacesRegex().'+'.$objRefs[1].$this->config->getPdfWhitespacesRegex().'+obj/'; } protected function getObjectHeaderLen(array $objRefs): int @@ -567,6 +567,7 @@ protected function getIndirectObject(string $pdfData, array $xref, string $objRe throw new \Exception('Invalid object reference for $obj.'); } + $objHeaderPattern = $this->getObjectHeaderPattern($objRefArr); $objHeaderLen = $this->getObjectHeaderLen($objRefArr); /* @@ -576,9 +577,27 @@ protected function getIndirectObject(string $pdfData, array $xref, string $objRe $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset); // ignore leading zeros for object number $offset += strspn($pdfData, '0', $offset); - if (0 == preg_match($this->getObjectHeaderPattern($objRefArr), substr($pdfData, $offset, $objHeaderLen))) { - // an indirect reference to an undefined object shall be considered a reference to the null object - return ['null', 'null', $offset]; + if (0 == preg_match($objHeaderPattern, substr($pdfData, $offset, 33), $headerMatches)) { + // Some malformed files have slightly inaccurate xref offsets. + // Try to recover by locating the expected object header nearby. + $searchStart = max(0, $offset - 64); + $searchLen = 192; + if ( + preg_match( + $objHeaderPattern, + substr($pdfData, $searchStart, $searchLen), + $headerMatches, + \PREG_OFFSET_CAPTURE + ) > 0 + ) { + $offset = $searchStart + $headerMatches[0][1]; + $objHeaderLen = \strlen($headerMatches[0][0]); + } else { + // an indirect reference to an undefined object shall be considered a reference to the null object + return ['null', 'null', $offset]; + } + } else { + $objHeaderLen = \strlen($headerMatches[0]); } /* diff --git a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php index c13759770..ac3a52c75 100644 --- a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php +++ b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php @@ -334,6 +334,9 @@ public static function provideRawDataRegressionFixtures(): iterable yield 'pr799 xref subsection with multiple spaces' => [ 'rawdata/PullRequestXrefSubsectionMultipleSpaces.pdf', ]; + yield 'pr800 object header with multiple spaces (nearby xref offset)' => [ + 'rawdata/PullRequestNearbyObjectHeaderOffset.pdf', + ]; } /**