From eea6e429dd86b4cd0d23808348548c7173e78698 Mon Sep 17 00:00:00 2001 From: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> Date: Fri, 24 Apr 2026 00:37:23 -0300 Subject: [PATCH] fix: support multi-space object headers Signed-off-by: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> --- .../PullRequestNearbyObjectHeaderOffset.pdf | Bin 0 -> 2540 bytes .../PdfParser/RawData/RawDataParser.php | 27 +++++++++++++++--- .../Integration/RawData/RawDataParserTest.php | 3 ++ 3 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 samples/bugs/rawdata/PullRequestNearbyObjectHeaderOffset.pdf diff --git a/samples/bugs/rawdata/PullRequestNearbyObjectHeaderOffset.pdf b/samples/bugs/rawdata/PullRequestNearbyObjectHeaderOffset.pdf new file mode 100644 index 0000000000000000000000000000000000000000..950fb8f57140569b5abc17603fda6c0b86085e29 GIT binary patch literal 2540 zcmbtWU2fwx5N^>o0lfp^MSui_Oj5QKNpQ4??HF4$-ULaC76|%4OSGe1TNFsTQF4~v zsSmwDdyx($S(Y8O*rIwcs87R>-wfx+C%4m?P5VRZlQkMyc$Tg; zmk7_+O!Fy^MZzt7&9jw$LcSG6meP{5jaB^i%~$YzZSgFrAy^p}I%xAfyyP6=IoB*< znpyaMyU-={^Za^G><2CC(fcB2@n^2lg0Ir-qSr%da?$(Aos)S!;ZNy@SIO^RKR(2- zpW`6uy^kzwRD{JQ-@q}b*sL=Z78gARa(D=91LIy~p^;3M;oWpr+rZmJ?@8-C#Q6F7 zx&J)q3%SBH2m(wT>^L@9*y<(Itgtin&QMewQ?6o}<~kJ_GMsE7w%Q1-4$qS%Gm=}} zEKN5hyZMGQh0~;z1ca4!m@}y(I>PN&IIsC0i)<<6hGj92#*JAUHR`aD(~|ceBsY39 z)h}b$kHyBw!L&bUi4AY1NVYMT(ZK$|)76t!rw{Wp1ZjnKpAhYz_etMHr>z(Om92$X zD?@x%y0GW^KjPu)yTunQ2k~Sl9_+#iq^}n80d#ceufKkg{A0HAvKo+2p@1TX5Z@+_ z?f4IrgoK3D{|GI`EcDF45A$nChK*mQF9JnhHcxg%uiuR%>%NnxY z)^bNF-?9~Sgj*(grjY}k&?A9&H*k?Zbbhg*>XVdp;CEb!t&F)s&~c!p z;EBjI&?yAH$PfZnHxuV=FEIj?g?~sBg&qgS9*&xbEy$s*os=AGP4$|iQ(oje);vKQ zt`zWu=w|T=5R`2<3ut>ROVEmET(YvXYcvxYDGv1sH=3Zu3sPye0=nm?RC8NFv*P`; zCMIqblUvDuMM1T$)RC6D;NQeauC{9(B4{G(ZeIV87k}ghcyFTL4xZm}NSY~aS_op@ z%;#Oql^gq8Iqx9WB2QyPXg96bMO%5Xr}a8$<2BnVjy#*%{u}CDTr2-8>3cf2gARHW zzC@ZUxX~@#UWif!eOE}kJi*q4+9vFFD?G!tcGrwZ3EOLrt{JS~M1FX1bUE*CwtAgr zCUbYWS`*dzQW90`y$c7Uxf-Pt7^7>kGU*0&u&clwJF3Or^~2;UBwtzt(V<1^hdRh}-GPgi-_{rn zJlJL)lr}XR^6ws+{6lhKShrta^-l=l~0GRcNt7B2hTM8|$;wYXATM literal 0 HcmV?d00001 diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index 6b1b7ea5a..7133b62ba 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -534,7 +534,7 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref protected function getObjectHeaderPattern(array $objRefs): string { // consider all whitespace character (PDF specifications) - return '/'.$objRefs[0].$this->config->getPdfWhitespacesRegex().$objRefs[1].$this->config->getPdfWhitespacesRegex().'obj/'; + return '/'.$objRefs[0].$this->config->getPdfWhitespacesRegex().'+'.$objRefs[1].$this->config->getPdfWhitespacesRegex().'+obj/'; } protected function getObjectHeaderLen(array $objRefs): int @@ -567,6 +567,7 @@ protected function getIndirectObject(string $pdfData, array $xref, string $objRe throw new \Exception('Invalid object reference for $obj.'); } + $objHeaderPattern = $this->getObjectHeaderPattern($objRefArr); $objHeaderLen = $this->getObjectHeaderLen($objRefArr); /* @@ -576,9 +577,27 @@ protected function getIndirectObject(string $pdfData, array $xref, string $objRe $offset += strspn($pdfData, $this->config->getPdfWhitespaces(), $offset); // ignore leading zeros for object number $offset += strspn($pdfData, '0', $offset); - if (0 == preg_match($this->getObjectHeaderPattern($objRefArr), substr($pdfData, $offset, $objHeaderLen))) { - // an indirect reference to an undefined object shall be considered a reference to the null object - return ['null', 'null', $offset]; + if (0 == preg_match($objHeaderPattern, substr($pdfData, $offset, 33), $headerMatches)) { + // Some malformed files have slightly inaccurate xref offsets. + // Try to recover by locating the expected object header nearby. + $searchStart = max(0, $offset - 64); + $searchLen = 192; + if ( + preg_match( + $objHeaderPattern, + substr($pdfData, $searchStart, $searchLen), + $headerMatches, + \PREG_OFFSET_CAPTURE + ) > 0 + ) { + $offset = $searchStart + $headerMatches[0][1]; + $objHeaderLen = \strlen($headerMatches[0][0]); + } else { + // an indirect reference to an undefined object shall be considered a reference to the null object + return ['null', 'null', $offset]; + } + } else { + $objHeaderLen = \strlen($headerMatches[0]); } /* diff --git a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php index c13759770..ac3a52c75 100644 --- a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php +++ b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php @@ -334,6 +334,9 @@ public static function provideRawDataRegressionFixtures(): iterable yield 'pr799 xref subsection with multiple spaces' => [ 'rawdata/PullRequestXrefSubsectionMultipleSpaces.pdf', ]; + yield 'pr800 object header with multiple spaces (nearby xref offset)' => [ + 'rawdata/PullRequestNearbyObjectHeaderOffset.pdf', + ]; } /**