From e5eae4eb9de6f994ddae34552f5071d588ac6f49 Mon Sep 17 00:00:00 2001 From: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> Date: Thu, 23 Apr 2026 23:55:34 -0300 Subject: [PATCH 01/10] fix: preserve absolute xref offsets with pre-header bytes Some PDFs include bytes before the %PDF- header while still using absolute xref offsets from the beginning of the file. The parser trimmed data before %PDF-, which shifted offsets and caused xref lookup failures. This manifested as an Invalid object reference error in the veraPDF corpus header case. Changes: - Keep original byte layout in RawDataParser::parseData - Add stricter trailer key matching for /Size /Root /Encrypt /Info /Prev - Add defensive handling in xref stream resolution when startxref is near, but not exactly at, the xref stream object - Add regression fixture and integration test Regression fixture: - samples/bugs/PullRequestInvalidObjectReference.pdf Test: - DocumentIssueFocusTest::testParseFileWithCompressedObjRefInXrefStream Signed-off-by: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> --- .../PullRequestInvalidObjectReference.pdf | Bin 0 -> 9393 bytes .../PdfParser/RawData/RawDataParser.php | 44 ++++++++++++++---- .../Integration/DocumentIssueFocusTest.php | 7 +++ 3 files changed, 42 insertions(+), 9 deletions(-) create mode 100644 samples/bugs/PullRequestInvalidObjectReference.pdf diff --git a/samples/bugs/PullRequestInvalidObjectReference.pdf b/samples/bugs/PullRequestInvalidObjectReference.pdf new file mode 100644 index 0000000000000000000000000000000000000000..9d15f2474e107c6029772e2ef2107e7d59458dac GIT binary patch literal 9393 zcma)i2|U!__r9boC9O)KLAETj7;BcsGPdkWLN#UvlUXz~vbL)1icmC`>^qfxjqH*& zAt7Zi`(E^W4{80p{@?#F#{1lR&pqedbI$8N42W2 z=D;`R=wYcS44?ocz5cAM%!Vam==$k8{@iC%JsTK_BU7j#nGFqUBu63@1d?F`pF;hV z{?bF=byE+Z0We67M8x2zI1&*gBD(3r#>j@LY+!XPg}NazB$I4#cq~YEV=ogBSku(x z0uT`dbko0|>a;O{pEw&FmJAY6bEHs71VFtp))q&hl3nys1T09zj!Jb<0E3;JoFvgC zJc%qxBHIF!w23w(kO;+CQx&AGp(X>BmywZxNkT>0ez`#p*QOgAVP6j;8+gNgz#3f~ z1_-ho-B7FF7SiMT=fcg9vw=+(l?j7z7$spdX+4OZgBsQjr{zkCA73C~JqzJ6*_@M%oSoE7G8>`oM{!O|WQi8)z2Zn%Na!^p)@XC}#!YlD_T5SqsJ4n?LhsR*a{J*AQ)lpQe5L)vmk(npCtvfIwfHzdcm->vC|vhM%8V$Kzf7wS-LnX(1t`d z&x}exGm0aQiUr9?KqX)jR0vc8ummrG0{Y2Df0U=Ug3Sg2lK#;jp^y!+A&Lx?!H-t` ztIyCI-R7P@8UjpO77o@2NzwJN5cD55PEWB~ypPLKLupX;q? zvzcgl(?VTLO`JGuu9NB_?}uc)M6+Am$l88i8jlL(VKQRTX4$e=RjYFwLVeqQ4fWe8 zA+Qbt6=|RmA(??r-_>z-taa-=WNbt(oAYfz#6daD2z}dxw#lQDi}H(3laufJN){t# z8KQ;bQ*z!wb};yNL^tX9RdF8JZym?Vu$2wOxQ7Aek}fH6f{(dBs9HDcTo3Cv-^laK z_i-isr|MroroMYiyd7m{ILRf)EE1eUrk%K)ks>>CYJrR95+5RT*J$7PG}Gl=_|r>~ zCByDR^4*ne{%3AJ7qyL0jfG6(Xl8G@LgWOJ<(%Fmhs?`esOpNZPcr)Wt!IC#QT?SY zCr5>f5Tf9kE)Ls5WntZ0vy2S2)Hf$AS9TomvgF)p;)|X^-bW&EQ$In` z@zmh$9uPI5%;3=jkK)e7KENH>@~CW^uBI; z@TwpZ@ofi*pfYTx#CF4(GqZ6eFSk{K!7r2P#mtb!SK1jpw{q`!exV`soi}I*zw*cg*25vHxze?>e^A0Iu>jKOddBC9{(` zOLB%`CT|DBU@L=hHhbUA-Z_boAZFNGZ#_+{D~l!{l-pv@q=)-Hzzkj_$CeVW zHO-kbGDb5B^=}co$U(b#VmIUU-Ha6oxEFIBi09J{#;pCd2>E6Pr6gvFXddcK-Yi}` z>y)MguRCkTO^b_rPkDG=+^)DS6U4%g*y**y9I@-Z*r#pF zYStoqeLRhyh$k@|ziE1(6Q(NB%$CGF=6UxCPcw03vDs*ZiMp$h>FPE;1V{8_^g{?^ z8;TdtTvT3~btvGbNk6wUYuYXQexdw*=b1UmW2cmtAnuIH92Y!K+Ffbz2Uy~Er+uYe9`c1 zu^K_m!CkjSsM7 zP>@4JKRYDf%NJ*BIveGxmcpld{OpI&LeeIWa>D!{@yHb67anP&3SsC+x?y^+5JP-J z8bbn{cLuA2vM_09I#|X<#<$ytHe!7{NAc`M4@*3+23(C&8_2&rAkU^RJiPU_=xZy< z3dbRxeDVa_#G%EVPT`uLSe=c}Z_D&twCH6bRnySCc9FKd7nQBoyqAw1{ZKR=|Lyp- zqu)W_6~4#ciO}%7(`xNsb3QZ+Pp@HKK4;Afd`Er9on3(2@y@l|rc37CwTo`$S2|wzO19|R=2cAuzZY;T zy_$XX&gzZT#03uS{9S>&E_rV8Y~o(?2}48A9+0^mQEkIb;CaC9A8>+OnEQe}D~%kX zK-(qv{&IqXZ$tx)nP#BSlrNVbSCF0m*|A_$=kkG~&?4d$rSX+ldi|l3x%GseFQqSx zK?mTh3v9$9sh-##`yS2rnXbE)DtSYAEtE>CLW z-gA>ilSfnwA;`1BJz}((4`1U}s7rod6&D<*CFb7uyLBw)T)2NBr0-Fm#2(FH@v+@J z`>qa8j)E7DPL~~i!J@@tw?l646d~Fy8s=cpALWscz#&p^>2F^ZkUMtvXtBx?!b?|6 zyVi|Qwo>u(-F&leE+TwznQV!7F@bwCc9x7)SLw!Cif8Hs3Fv{tk}ZoN>Y7%=bqrsmrIuc!73nG4DD zdR7WnJJ#q`QU)j=JwKM|JLnh+e%5=Wy;BFTuNbMWrK8<_pQ?6AI7HV;*EUEk=%HYg z{*y#Z;?6|(2V-F=21=pZ69p3qr;19N9iLqJ;H;qaFoATc>;YeGQr(i85L8Nla%>iC z9+MWD_Pnd-^T21NE(3}+Wra)}EF3;eF&Ki*`pjGzR%*D{FxeDk`KidRD`s|E$F{Ls z0(asniYp@SJoo*0kaSMbYQJgwY3fz)i`p-PD7xtn&q%DsJu(*;Q#MrN&)&r`2aFMDy9}z%_HY3GoZmoU4v%^MVl%GjwIm_nmp3Y=ex7=`-K) zMlxB+y8nZU$w9w~>X7Q>!rH>X!r^(BdC$Y0?A>>F_(%HNAD)yJgs&!W5kh67@R#v8 zQrJ1p=Xg!A_H*5?&l(HTpA%eN24<|jv@g^zs5mY;&N;c4@n5fcs$cYaFv7`9D5K{c zwj+MB@ab{)Hw$k@F3pd5OK-8yO>=plKC%{5lWChZ?0Md0@}Rwjy=TW-eg^)%`UQu+ z-1*A+?)kT}>V(cWO%tTCvLYzq^t=y-=ioWykfu*x5Z-Z zQGVF5u=`=(!tKJ}JXU?2N#mw@(-tEz5v`FYBQv6SqI{!PqH)okPc)z8#t6lP#%_ygGdG=*{rBUVUD&4BuYGn1x z8bnQ5tx|1qokCqfygG@v2E9qA`jna)duVr56Hpw>SH_JB{wkWogwyLz2x2d;P zw`;dIywQKt(s8<@^R4CE{!UEiNEg0qy8BA^*LOF1w)ObD-~B$cm%BHnPqZ(yU#7q0 zgZhV8111CSKB7O4f1-T)Hh6o8eJFJJ=y37~Y@~P;IodpCF*Z2vF#h%P-7kB-&?ZDC zvL{bYzM48aH8@S4{yyV3%Qc%g2cN5$H=OTXAS`@c^jYFsN?Mj%u30f#8UE_@jrrT7 zRngVLHQlwI?}YE)8Fw-)-k$yXA2(h)dfmKzs*yCfXgd@c1?VOJW11ludj$mD2<9&b z*Mqrf+7E01mjT8%ulJYPRZZEY0T=+hO)>1?jToBRu#0G6@pux*j6}v`L_wyP*fmVq zjsD~K*CYOO2Y}5{155-#p>Q^^DvE+#KOC^Bo{p)usu+@t#p;oWBvld~vw6Vu32a~u zA^=QqL|YKp3`ax|DY#9+Z-j>_*%3=m3N~O4hH3{8Co({W{qx24j|2qy|CaqPjlYfm zqmw=cy#J7I&fk;+pFrl*ecmJoNIal}&V29=d%10yzuZM7urZ;~}K%rOm``$i$?^$mF7-x8KnHCe!{s z2fc5_u!tj1ZZkdUk_OV88!624NgehQsOx{m=wkrgR&2Ym};O z_}Nz%{7rmqgWoyLxOw}99Ln?vfjIH$f{m4t_v%Vh@ak9hhj%CY@m{2K^5@=GdU+mo zq)R3@;)~7XQ;x31&&N(Qo|uJ4)_G1_EZd!bB|Mv49Hdfza7x1+dOcL7$!wa({iw9)}!lH2L%WB4c;IdE*tTrPMlUTR5>&L=6P-&!Hv+@b#3Bo zRH68MLHqN=9oLlb$l|4dM78J8R@Few0HuaW<^j2Yfr;;AQK< zr7LU7eNAPCETM;WWY3=ClZe=%dr&krWVO}U_+ZM{_IvHCmpllugd#%O7fj7+GLx>^ z7|*r?%9FKO%2 zq`kA1meX0zT8bhQ2N%7=tnVi6ck(__m2T~I=otBqWKfmHfGT3>{Hzrvv7oW%5)s-5 z>x0XNy%}<+WXrd5qjt%z>WPI%l3Y`8ACmPn&nrJ$s7>jw=tq~Q^lvTiD*x)SxJ~Sq z6Z0fnz|LK}IJZlAaFz2S%-(xCPQE2%zB`i_KT%isG)JXl&G0MXOJMiQoSWST?ajE? zMvq80ACfy_$3kPJ?P_P%X9{56wJmphE)!?vqJNVQAuH@j(puS4mrL4d=q0R_059?K z%Et>MdUs5peQ2*?NbEKe&tVR(Y`L@FfJgMEWQ=~W)?q;-nANF=&GGGAc7dwN!FQjd z?|abNLo7cf5UE^N`n+TMqwOBBB;olOv0B}*V2u*d$18>x#xz%P41>)fpJH!P4_N6H z1Z%!+&Y|*H8F+|95u+K8$MF{iZg1NSoNQp!-4=(!FyfOvR5AZJBX~G5777m~CR&fE z+)J_BqNJ}oV(xnLOxy8uYWAZ|bFdQCaU2f->P8)1{bgE(Ts?YYWN|wPvFX_!xo!I0 z@fop2Nx{`^#~imdMn6oV38i|D@D{2afJcR<>QVFjyX5rZ+}=dBOF7sJo{3OhoQ0aI zO*X$YRc&jh;6*^ikqNE%`O}9r?={EH?bOADuf!A@Tcwt!Wf+^ay~}-{{dnFiFZn_W zDcJ&VOYk3a4NuN6vTuKDCpmFPevU~AY0zrz>ciP$oKk(xplvq$R%C3H^In7a6U*@@ z`z7H!L}(Qcrh5w2$X-k2EmRi&tW-yD+Hk{Tp+oMs{r052$Q*o&mep=L{KEA_&MA_$)6hsn zh|&$gd@PnV`&IAj`^~9>)yHZPcthg1;pu_`DGE=aQp1ybO(9oP1F@d@bKb%_C8CGj znmYtE#H_Me2qccj%MPN@%ZJAUUKZjEE5eN~=i2HIzMWT?uRAqy%C-^1;)moqAzeu@ep>2YAGYg+ipH z>ltO)?JiZti$~{sGvlc(;ny-=6+B5YD63#(OQ!@Cb@9?-B-Jx2cbMDFoFW;sNd>`5 z&NN^#5YLDrv1+GuXV=m889|${kAc1SDh01Vg{7Q4`P%LdjXtbVhCPY9v{p$J@C#;t zc;Q3N0`oExe+>6|Wc_9V=sRcQJ_^2K-FDS5KFNTg+D^BR%B)5)A^Z8Ak%-cD_ z=}XyVAdK_Al59!t{3I?r?o!h_22M#$w32ddFE)zpj9jkzn37T*@=8qN^V__nq++dV zZ5g2>FRnDm1wVg*%$Gp=wleE$u;7%duejcg7ZQpSwsijzT{AoL>iWr|wbwm8DYWs< z32`;uFWpyP?A2}97jL%M{PJuSOo~}ji+az$DS7xyNNnhe+l0Q#!Q{ug!mbY_w#y%K zL%^#=3rBPXmc0_xtQl3_uz^ok7Vud?`77rRye{F$mts0grCFT}v2(it=2h?u)#!SW z&Ny`}_*&ss@^~^ciy89Hz@e!&hMGDB^8ig<+HUrft#WnBegOw+b`L%s88#6>HVTz3 zam6!#d^VkMvx{9mcW;gX?S*BB)K-Jp!OtBATiKbrrEgs|c2hxqQtM{HM1o&4Htf_r zk!@q~eCz2m{pQ@w%aCRViN;&R3m&_ySJ6h);x3>3toPUWacZ}92L1Q&vpXSabE=Oj z4Nzj@Gk5CHUAr~v8pZ57UQI;^SG2yi3wZDqbGziL1yQ@BxXze#L6=jd|8B!wgY?B` zam1FMEdqx{isM;`hYpw|cHEu-va(u#J{!1C{jjgBlY?)4Jf77~UnlC(VSH~r{m*a6B&z>$e z>=Cr=9fCSH&EyOok$@K-IMP@|=5{+geT91fIuJtTdV=h<4-EWL@da)Cy;*ti&LZDH zg9qv;vcqSosr!Uo&o^T5_m^pPEYR>KAHG;+l{9Z@k3ny3c;oR8qQ0SOGP@_Sp2=FF z+oIiu7CAu91~E8_6so5 zwt|yK?jI~VR>BZ`?oRLVS=hzuy(_|poCP{sP*8L6z3-~5KlO|!pN;Vfz`vUD$f>#J z{OwWYZedrx?`N+W-V_iH3|z*>oka_mKj7nVZaylp$8Imb)q~K}s^NSAtr1C?FN^%+ zd?p>-_J5SH|9*mRxbXZp{#9Pj69~5E`sv=5kviB8b5r79YSqgU1bsy$V5^ zdAZU~=+VkgjJ~K#n^L}~HaDn?Pp&l6=Q(D6C>@b0=l7YRy$y=gR1QOatTal)R6XJe zOXd*}p!p?Uqy=Z&6i;8oST6eKp=XjpGc~4Jqp}_#< z+@tIMm+URAYm0&gEeCe`CSs#)jjnU@cNX|nJ@*aNy^i!(L|VHjxyn)oI3S_4DxJTbgLto7X`dbF1j>Q!_@@#V1^!h#7?S< ziorb9Kc=`mmOs{gWXOnC2bD7)aeL|kAtK{vLWUYnWSn)F9XZWTYxU`hBq&|b_xqHDkGc9fV(jI}*>bIY* z-E$<7w3TVXk$WW5d&P?i{X0dxu&Id*zKY~Vt^UBN(T7{f(=TKnxtOb1!VXO5rbcYoPPw?wt69#L{KB>TW+3R?lF z^c&{5=bW}^Irmi{#R6IGS-G2z|BxO;OEa5NJd&(N*Y8T`SGFyW%>(5{3(M%0kHeG zOdg<+{*p<_15ouZ85{tmf61V52*4oyIS(og1vsuhWpEe-0C#`Mq~!lu3x~`6E0duE z*uU1w{_C4GMCKpgq@nPCY>|e+|M6EE4*w@s($cWs_GM?Ik^zv9B?JF2qR0*aZJaB1 z{Q$tmB!G2VC*c6GwmQf{QC&t2DG!rXS3}Cd;BYuZMovRjMjjz84}~M;AhHl;kRk$x zkd>B_N6M!xgp4X2DhHF2mY0F4D_gMx(3&0Kod7m#o{ua5 literal 0 HcmV?d00001 diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index ec8d01e53..025d651a7 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -198,16 +198,16 @@ protected function decodeXref(string $pdfData, int $startxref, array $xref = [], // get only the last updated version $xref['trailer'] = []; // parse trailer_data - if (preg_match('/Size[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { + if (preg_match('/\/Size[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { $xref['trailer']['size'] = (int) $matches[1]; } - if (preg_match('/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { + if (preg_match('/\/Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { $xref['trailer']['root'] = (int) $matches[1].'_'.(int) $matches[2]; } - if (preg_match('/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { + if (preg_match('/\/Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { $xref['trailer']['encrypt'] = (int) $matches[1].'_'.(int) $matches[2]; } - if (preg_match('/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { + if (preg_match('/\/Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailer_data, $matches) > 0) { $xref['trailer']['info'] = (int) $matches[1].'_'.(int) $matches[2]; } if (preg_match('/ID[\s]*[\[][\s]*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailer_data, $matches) > 0) { @@ -216,7 +216,7 @@ protected function decodeXref(string $pdfData, int $startxref, array $xref = [], $xref['trailer']['id'][1] = $matches[2]; } } - if (preg_match('/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { + if (preg_match('/\/Prev[\s]+([0-9]+)/i', $trailer_data, $matches) > 0) { $offset = (int) $matches[1]; if (0 != $offset) { // get previous xref @@ -246,7 +246,28 @@ protected function decodeXrefStream(string $pdfData, int $startxref, array $xref { // try to read Cross-Reference Stream $xrefobj = $this->getRawObject($pdfData, $startxref); - $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefobj[1], $startxref, true); + $xrefObjRef = isset($xrefobj[1]) && \is_string($xrefobj[1]) ? $xrefobj[1] : ''; + $xrefObjOffset = $startxref; + + // Some malformed files have a startxref that points near the xref stream object. + // Try to recover a nearby valid object header instead of failing hard. + if (0 === preg_match('/^[0-9]+_[0-9]+$/', $xrefObjRef)) { + if ( + preg_match('/([0-9]+)[\x20]+([0-9]+)[\x20]+obj/i', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $startxref) > 0 + && ($matches[0][1] - $startxref) <= 64 + ) { + $xrefObjRef = (int) $matches[1][0].'_'.(int) $matches[2][0]; + $xrefObjOffset = $matches[0][1]; + } + } + + if (0 === preg_match('/^[0-9]+_[0-9]+$/', $xrefObjRef)) { + // Could not resolve a valid xref stream object reference at this offset. + // Keep already collected xref data instead of aborting parsing. + return $xref; + } + + $xrefcrs = $this->getIndirectObject($pdfData, $xref, $xrefObjRef, $xrefObjOffset, true); if (!isset($xref['trailer']) || empty($xref['trailer'])) { // get only the last updated version $xref['trailer'] = []; @@ -607,11 +628,15 @@ protected function getObjectVal(string $pdfData, $xref, array $obj): array if (isset($this->objects[$obj[1]])) { // this object has been already parsed return $this->objects[$obj[1]]; - } elseif (isset($xref[$obj[1]])) { + } elseif (isset($xref[$obj[1]]) && $xref[$obj[1]] > 0) { // parse new object $this->objects[$obj[1]] = $this->getIndirectObject($pdfData, $xref, $obj[1], $xref[$obj[1]], false); return $this->objects[$obj[1]]; + } elseif (isset($xref[$obj[1]]) && $xref[$obj[1]] <= 0) { + // Compressed object references are resolved later from object streams in Parser::parseObject(). + // At raw parsing stage, treat unresolved references as null instead of throwing. + return ['null', 'null', 0]; } } @@ -964,8 +989,9 @@ public function parseData(string $data): array throw new MissingPdfHeaderException('Invalid PDF data: Missing `%PDF-` header.'); } - // get PDF content string - $pdfData = $trimpos > 0 ? substr($data, $trimpos) : $data; + // Keep the original byte layout to preserve absolute xref offsets. + // Some PDFs contain bytes before %PDF- and xref offsets still target the full file. + $pdfData = $data; // get xref and trailer data $xref = $this->getXrefData($pdfData); diff --git a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php index 7c7fe7e68..e192a9170 100644 --- a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php +++ b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php @@ -111,4 +111,11 @@ public function testPDFDocEncodingDecode(): void $testSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž'; self::assertStringContainsString($testSubject, $details['Subject']); } + + public function testParseFileWithCompressedObjRefInXrefStream(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequestInvalidObjectReference.pdf'); + + self::assertSame(1, count($document->getPages())); + } } From 917ad5d7ea24781521c44c10efa6cd2f5a4a497e Mon Sep 17 00:00:00 2001 From: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> Date: Fri, 24 Apr 2026 00:04:37 -0300 Subject: [PATCH 02/10] test: use assertCount for page count assertion Signed-off-by: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> --- tests/PHPUnit/Integration/DocumentIssueFocusTest.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php index e192a9170..54a9dfbd2 100644 --- a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php +++ b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php @@ -116,6 +116,6 @@ public function testParseFileWithCompressedObjRefInXrefStream(): void { $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequestInvalidObjectReference.pdf'); - self::assertSame(1, count($document->getPages())); + self::assertCount(1, $document->getPages()); } } From b8ec7b322fe781da52228eb081d0123d51f87ba7 Mon Sep 17 00:00:00 2001 From: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> Date: Sat, 25 Apr 2026 18:34:02 -0300 Subject: [PATCH 03/10] test: move PR796 regression to RawDataParserTest --- .../PHPUnit/Integration/DocumentIssueFocusTest.php | 7 ------- .../Integration/RawData/RawDataParserTest.php | 13 +++++++++++++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php index 54a9dfbd2..7c7fe7e68 100644 --- a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php +++ b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php @@ -111,11 +111,4 @@ public function testPDFDocEncodingDecode(): void $testSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž'; self::assertStringContainsString($testSubject, $details['Subject']); } - - public function testParseFileWithCompressedObjRefInXrefStream(): void - { - $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequestInvalidObjectReference.pdf'); - - self::assertCount(1, $document->getPages()); - } } diff --git a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php index 515734c71..4784595f4 100644 --- a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php +++ b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php @@ -37,6 +37,7 @@ use PHPUnitTests\TestCase; use Smalot\PdfParser\Config; +use Smalot\PdfParser\Parser; use Smalot\PdfParser\RawData\RawDataParser; class RawDataParserHelper extends RawDataParser @@ -315,4 +316,16 @@ public function testGetXrefDataTracksVisitedOffsets(): void $this->assertIsArray($result); $this->assertEmpty($result); } + + /** + * Ensure parser resolves compressed object references from xref streams. + * + * @see https://github.com/smalot/pdfparser/pull/796 + */ + public function testParseFileWithCompressedObjRefInXrefStream(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequestInvalidObjectReference.pdf'); + + self::assertCount(1, $document->getPages()); + } } From 66b4daf64890299032ad537886a468622bdd068f Mon Sep 17 00:00:00 2001 From: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> Date: Thu, 23 Apr 2026 23:59:46 -0300 Subject: [PATCH 04/10] fix: allow startxref offset to include leading whitespace Some PDFs set startxref to the whitespace immediately before the xref keyword instead of the first letter of xref. The parser required an exact match and incorrectly switched to xref stream decoding, which then failed with Invalid object reference. Changes: - Skip PDF whitespace before checking startxref position - Use adjusted offset when decoding classic xref - Apply same whitespace tolerance for Unix line-ending detection - Tighten trailer key regexes to match /Size /Root /Encrypt /Info /Prev - Add regression fixture and integration test Regression fixture: - samples/bugs/PullRequestXrefWhitespaceStart.pdf Test: - DocumentIssueFocusTest::testParseFileWhenStartxrefPointsToLeadingWhitespace Signed-off-by: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> --- samples/bugs/PullRequestXrefWhitespaceStart.pdf | Bin 0 -> 6499 bytes src/Smalot/PdfParser/RawData/RawDataParser.php | 10 +++++++--- .../Integration/DocumentIssueFocusTest.php | 7 +++++++ 3 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 samples/bugs/PullRequestXrefWhitespaceStart.pdf diff --git a/samples/bugs/PullRequestXrefWhitespaceStart.pdf b/samples/bugs/PullRequestXrefWhitespaceStart.pdf new file mode 100644 index 0000000000000000000000000000000000000000..718557609b254ab33923a92168582a720f6a4cf2 GIT binary patch literal 6499 zcmeHLU5p!772YI($|!Ebk4g|A-EK)YA+hiL*j|t8b-T9Xu<7n*<8?~VX3Ngb-1Sa6 zo|(*yv)-TxQ2P)B50MILB^sfY3Y8Z`rM^@}>LPyBR@oF);-TV!JRlw*5=Q5-wnkgzIc5W72CuYZ6 zN1{+Dg1GBBMCoR@$|HSSQJ*9pS|uIboSOLi>u*dTU7MQtT49c#v+HDC|LP@&EMA&# ztCyZsWo_b-YNpaJ_q%o%;)42J!*t92sc{8DLdx(=oAG!xgHXlM*2=9$GZ+MKQ{(HN zXO}VF*x1NzNIA>tU_q8;%!^nQvoIp-ZknFb&zkOoOh=#vmQZzco341EO;aOfcF!@G zDow@2AYGbbSjY)@9MKslt5(mO8V}TgLQO5J;J#-$be8^y&2^Dn7_aL{GPAL!#DoJ+ZEa0C-iM9jg7=ZrbOjY% z?=r{f5Nx`bgu}pfKQgpuQC%lpWp_!Jjd?IG2FYp@$J^Xp&<;i=L}@YDU5sXCLZqn^ zL)%>jcA23;T5w^|AVi79Df?cy@ex57Yf2~C8ikiFxve-Z@n_v4uGMo?lCTpb$zU}D zQ`LIU^{g)P4Oj5+fY@Y*m6%GGxUd1qnbu4-wl!3yHK$r#8r80%uG3nIMVzpZklxjf z?T|ISUsc_Ne9(FuNk0kFi${Hl_RNrmsMpZ{i{*{2PndgU>SMJ|U;U{kJ4PA~vLYIJ zQ89Wfw$+B>k(8Q2=N;)%K|{xKHmh`}Ii_Ny)J6Swq)i)CwW@1%35qqO6vui-g#vsI zOVS9-ejhDPL!G`Eze1GlU@xbKjn?hArZ$$ZFDh5VJJOeml|0|rP$3KsmaZMu1k=s2 zLxydOzG6d(=Tjvfik@Ar`?bAoS;o|OSi9SB2BTU}H#FkVA{PuJG$p3SMP4Xoc`?fi zZ9ysva#<9P@#9c4jmf0Ho`VYm7yrYxhak#TGilPwE)yOUBy&k$!fl3Kb&yL11i7~EP$5? zSO8d?n%JPQ0iJx^5dt_0&m#1qdjO0Mhw<`gyxgAdkuoY~i&-%%!N11~SzB@4ta^kM z3l94@Pv9vq3gJ>A0D1z&r_j2fH~;M_rCP+u~$x?Jbyvlx9=x=u3l?B z(Hi?~=lQ+b_r6S?dhh6?w_d!8I_R7CUEO~6UggJoUU>%HdKg{&)bBs@=m%f8`Y(_F z(RcS4_dcE7yZY=)kAM3Xv2Wm)kG(zn&f(L%$ny*K>|3W!p1J4e$8s~<*~kBJ{jnSP zD{tbH$7bgC$lR^!mDl%u@$&mmEnoZ6<(c>S2gbN>-SamWmCk+sw>Qvl4xraweR==e znkGJU{ls&p-`(C?S-!Eg^~-$^+&{6s^@~3|{hjH*K2)3i_`&-=d}06orN4}gU3+i7 znW3>wMwx=vaGmQ=VQN+%Ezy7#c=k_efMcnP0792A#aXxw)s>M07Po*M0Q&ODgAh;( zdR!yMAUGTz+AgY8@Yz$V=ZWgEM;-}`YgFMwsf4C0BkYHu83}1*E(enrgnThfTb6C1 zM{PfERYU2yWRR)=`-8L#ILJKsgaqH0K@PGMO{pS%8)i>51lYg@N>oPeb%FwB1&sk4 z(%?v^1k1+?D6HKNv(Yxl4Td_zHUMO}$OA=8j&T=ctwD?KMzTYbUU0}oB&2y43qUyO zfLBO-YmEJ$)$%gD0-U zj3R@R!xlP%jpFDxh_Jr!7X%F|RHVP~!rtMF_18u(M}iOf3FcY0t|DI8Ax1#~^s~%Q zxmcDZXriw#$})$zJ;N&_T37w5iqg63r$Hi-{n8pUF`}01rt8te0;xMpI@sC%q*Gaz z;n-eGCPlbUV_6X=w?a`KklX)_+j4ipZ2?8GQ~v+qws*PTfS}&xes{Sa!*h(z0~lcU z4j(P|qgRg|;-qNRUHV6{{cY&q4&3si;(rK5taOMw#qaNe8!gpM(3u^tO2Vrwb-*C$ zomp}`FdSVqh}r9|5(mSLVVwF_RJY8o9;&4Zj-CgTWsEz;WCwhRi}Rt_VHEM9A*7t) zGF%^SYBC()i4uHAA5<(9qynNn!H0q}`ax#YCgcGar`kk{tWs@K9?soVTb_plJ=rD} zB>=yvw#njE7GxpcEb)_7RFP{nxmXj61-TB(T5n2?Qk|C@lh7^6b+J}m&HzNsz#TkfpC9jn OTqconfig->getPdfWhitespaces(), $startxref); + // check xref position - if (strpos($pdfData, 'xref', $startxref) == $startxref) { + if (strpos($pdfData, 'xref', $startxrefOffset) == $startxrefOffset) { // Cross-Reference - $xref = $this->decodeXref($pdfData, $startxref, $xref, $visitedOffsets); + $xref = $this->decodeXref($pdfData, $startxrefOffset, $xref, $visitedOffsets); } else { // Check if the $pdfData might have the wrong line-endings $pdfDataUnix = str_replace("\r\n", "\n", $pdfData); - if ($startxref < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxref) == $startxref) { + $startxrefUnixOffset = $startxref + strspn($pdfDataUnix, $this->config->getPdfWhitespaces(), $startxref); + if ($startxrefUnixOffset < \strlen($pdfDataUnix) && strpos($pdfDataUnix, 'xref', $startxrefUnixOffset) == $startxrefUnixOffset) { // Return Unix-line-ending flag $xref = ['Unix' => true]; } else { diff --git a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php index 7c7fe7e68..bd85d8cbe 100644 --- a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php +++ b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php @@ -111,4 +111,11 @@ public function testPDFDocEncodingDecode(): void $testSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž'; self::assertStringContainsString($testSubject, $details['Subject']); } + + public function testParseFileWhenStartxrefPointsToLeadingWhitespace(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequestXrefWhitespaceStart.pdf'); + + self::assertSame(1, count($document->getPages())); + } } From edbacca746685d8f21b9b1d528e0497091fcd33d Mon Sep 17 00:00:00 2001 From: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> Date: Fri, 24 Apr 2026 11:59:17 -0300 Subject: [PATCH 05/10] test: add pdf.js compressed xref regression Signed-off-by: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> --- samples/bugs/PullRequest797.pdf | Bin 0 -> 8286 bytes .../Integration/DocumentIssueFocusTest.php | 10 ++++++++++ 2 files changed, 10 insertions(+) create mode 100644 samples/bugs/PullRequest797.pdf diff --git a/samples/bugs/PullRequest797.pdf b/samples/bugs/PullRequest797.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f3e25216d231f4b18c14e33b00bfc7877650c30c GIT binary patch literal 8286 zcmZ{qcU;p+(D0Qip$P~GD2Cn%od8m#9layHgb+FjkkEVYh9XGsMUbvYlMaG(REi*= z2m%TLrS~UzchC9WJ@0-#ncbb=?CkE$>|fuo=_o4m3GxdA*eBC%91(Cu2nq(|RFo7F5EK%?3PCXeu&|&h*KfW> zfAg*E=z_v}1}VE>*D1oF2wNBcqydB5qa1)>iK|6O6m~CHfKLwf@B&eY014iwLycsj z7ws2ZG2~%dN%$`s$uY7aMmSbv{1O_kkXiBh%qga+$6=4&!@>z~cHv~wWYt-(c4RUd z(ECoV57V)uf(%zac{OTlonGfAr(wAY_+NYc75b}?{x|ejiSmG94KDfmhem1Az2kNQ9>c6q{o4Uy6THd^P`jA8d@F`Y;sG z41|p_5M%)J!6r)86l;J>~8MTx8pyH?avbUC)*yL zu&WQhSH**0jYkK9&4eQy!6%P!vHf!=MHmw5;pldiZ^5g(VlSoa=z&BjI6ypr!ot{m zLH?SEfB_&QM_ZHw5{Qk*FH0pjHm8nodmu;?0@XvfLg0UEunz%Y2zR_nIZ*KGef(bj z62D#HuW}F*78m`yHUxnZ!a@R9=lE~={9ZHFCPY4F{f&7o&&1S?ve|~yQLY~vzDTFD zAsW~~?m0ee#+HuuZ@~pNEJpLo^))Rv>fJT`j3qSkL5_LjgW?1yC%Vj=iHUvk^G5M zCsgJ=@?NV4VU)Lvf5~Wp!rt(;^;(8Ud-%@}RtvqO2;M)D0aMO*U_4syG)C*T!QFIj zI9VBWgwJVzIb`YHDsHgDPZ1v}6qBW=LwD0J6TO#-Kkn_OD17Tqh#yja>)L*~%BxE> zixi_`QRI3%p~cOlYb*xW0Xyreh4Nx739p&4W(|D=`0QAMI;F8#$QrK#wrR$c#M zjPkj(Z&~)^q_>>Jk0U+lzY*diXesrx&cAVpbiUQWkHq9;i|7=K4VACdnUk#w*G<+U z=if={ZB5nTzIfQp?M*h~%P5A!($Za9sp|{?RA{^=WXcH*4or`Ioa!iy2Tq|=O~zg7 zd4+KAFnT7O_5kxPan$`U=WbK47jw6SaKmxlT-b&4qjb-RxX$`Zj`-eb;9n!My4eCu*KsAaa=tWjc*Fz--?! z9ld-1&~Y25BdIii_!%*zwxb!~nBgGO*T@dES3^;!`P99BBqF*Q6WDw?9$cMY^=R|t zorHr6*J4BA#P+vmhj8Ao&cMw>Xs}e^aOYMXZSmBgAy_g~#Axt5gCEp=*BHs)AYD+z4%c1MU*O+GaSTf!$&NO}wFi(uD^tmXeL2Mqr zwk5nKdw=eUs~ATn#j2InJYo1(Rw?c4(KY(P0?DM9s>KZzNN?yvT6`&K3x?L$t#Rt} zb1Wh{n~u?26f!(-;P7qT*IKlPw)j7LU>ub={@jpN62~2+H0gDJv+=B**InW`{(jcv z7!e*8>GM#Xu2+0j3KlLvCV|R`q4qByKP(;}3)|DQ+pmqg*$;eOqaDx5K~wG8%g8ZC zUZ%Mt(AV?dNTK~SE%Uu0-QMu)Qp^X*wCX?0-zJ&-pw zuOOXzAGKgCIYa23lzMqwk1@s6$qf~#uzHjW+&vbQR`;V|8p`30 zN)46$Wp>m_VI&m>8=daFb4dXS<4E{YX5_s<7E&=B?$q6us`=K)Mknfi@$_SHdZq_* zX?k!z^>0Bd$MAN(nG*JtEFNcn0}|UCTD9=Ii-$hXQyy+CEO2#HV4TlQ2_!o~B>8wW>*h7hIW{rg&2~&{1aDWQVQK7F>hBck;4Cme$`?5=~{Tcr(8< zf^iLPXcal%iiSJ7(L2OBmdOdk1#}*swXKjG>(>FALqF9LJ;lws#_<$Cu}gE1eMP^4 z>-aNjl{-U?+RAm?1MHx1p-wQ}68|u4Q$PS|x2Tad%BJ>*Zia$$TW|QZ*AG_HqOirt z)aA0Sy)zsm$4uekG@N!i-f(ecnGJsB;Ygan@Q3)zh7V*#lX2Z6b{;3?X}r2dvM9ZR z!bFXI8$QL{ZRK5nd?iIoeivUl>oaA3V`vXQd*3rPnAcwKqCRZKc_!M}99I!H-*$lD z6LQ|2w>cHlA{E(*s+*QMc|XNPlgWIYF0=Wvr9{K07dOaCbIW_x#(Yfnl6F}}l5Qn@ z7P;lpmN4O3YNwKGwH94r%_H!F?nYwaRNV(TdOMX=*)~Jp>J%NjvNBU|J8sWbgDE0) zIVb>EEa`5+S&P3%Ino}vG+2rxZf>z$>E)oEbgiknc0@Cr0vP%DX7PDD)SeGyMzdD0 zy9c)=_7(&r=TW8-DD_d=7HdS)_-FG_>_g^(w5Zm7xNTcvEP=^{niWj9?&tgLZ!fs} zr-7=5kISl;TlPM_A5oL>qDnKD4@fsE`wE9QhZ#}AqPl-5#F^c&+1R;wj3Zw1!EnRZ z(R4L!!R>tMOA8l6>A(_`)lLXJSnyf&3(GuhontTQwCgEHBpJGuy5W@d#hhG)`lI#f zVpdm8 z&%SixOz)x5viMa7X_+fE+EEtyeh#vTm^UhQpjuZpYlSWkM!g+SX^7M#U?;qE?n$dk zYPyNn8T5TrT%uCpW}Mik@yqZJ;VwGiHk*^oQYF$TA`7+C4X5G(W1GDvY_-~!&}}V@ z)(5-!{_US1zXqxtCRtj!L>n$O8Iu}PKa8KUQC*&y+gbv7(+#;Eco-SZ+KJC@8j5=P zH&^!P=;}r;*Tbs&vmW2F3k`)`M|h{LHORSssg`|7Io&v&1@M=6no*Nc4dn6o9-^eb z>qn|vT(i2B$xp+&F1$M!F}BHHqy8jKy)u(KJFU^oiF-(wG@SM`=`D~I4U_|r3$`pG z1}SfEwjEtyc%8Iq_OgO0(2cD^>BFCGQxO?NAF5OuB&rgI3tQfp$bW@Nn2?#a3*NGI z~EPxClJNm)%Iu`VGStgm&4!4| z7it;j3NvZS5^OA{sttzyPWzeau$?RhjQO4DG10p%PDq@@C7ql8_eR4Qs&nO^tQ)sk92cl|25W?POUXhc15O zJOY?2U2*+8k&FvQci-*0f?Sh$v=on>AWC6sg3f?oEM7vb3fB!kB%aZ`KDy6j@XOa_ z_FlSJ!J@?W9H@19XWRVt{OtL#Zl2yCR|}KZ_JyWhT7n)MIVNbuY_FIh@FJ0cVj$SF z`pE_#+io52Ko@HlJHIr`=Z^XGJfFB3K_UZUw%39N1ow)xL)dquWe@8PKda%CM$pTr zZtXwx>+h@mtVhY+KC$+VH|PZ(>*iiPkgf6Ew3654!@LMUVo}4`aE?v{WW2SjP}11D zwSP|EujlrJZ*L7_#{k_Rk;_<^iGQ}yTd>+>hH}}>T&}!E+IGuZ7vuaY7$Yy;8q8=j z=MTyLzBr+V=6zi@2MzB!40lmqG156+*eHOMfowPSSbk*LhQEv~kYnkhL|zjPFxbyn zB~jvDq4(c?%W)8lnist_Jg5?7lSbouo;zci$yuCNgK$|eL6oYbYdO1?a`tDlY09>U z8mvyA);JzJ7pX=5WS<&psD7+btlW0uWbAES`LJJ)S1ILtBbV0Hz#pAWs~B4&e=D<} z``J9`-B`&Leb+Tv;F6+y@ePA8|1)`BNZ9RrT;vdXkge)Slm||0 zWmmt%OOKEcHQ0VC@42>YY|eXNgW;#=n^;os*4QEiPnkRW zv`m=UOh`w}$NGlTl`uq8Pm?*>zlaaC&6}fIQi#;j`sA`S{W7y#Gp?O3ZjBX>I~yw1RGslg_m#fG z)T{=kl`qm0VbQsg$^_ffEjqpiRyow;O4NgBbg^j@{>Rld#!8<-hC?X-yxHgKt|0U6 z>bm2Pxq0ERD*9u$c)oUF>eIS~(16)_9g4$G`=QuUA((Y*9bME7-F7g&c6{H?Z} z>VSRIFx6zEg+EhCE6bpOijlR@xjasDYyqY>Bq}7CM6J>^m{h2b6Hw4a5bFBe6`lBk zI?9)HuQ1$!NbM3zjBJO_HoykUiri_nPWWrS_9Lmd8>uw$=9buX`wZFXhES- z0p%3_L7OW*Ocb=YwuAg5D%g_mGI=B=xy_Q;s5gA4dy=HZ!(Dr~5xw6$8nCNhc*WFzSt))ePsWEFZAJ+eF?-TyWQebto#bM)Ct0dlNzc=FYSAVJNhPYHoqKk% z#butF_FC%4)B6-q1zgo3Q}M^*vk6(3XH$yW?ZT@aB#qUtS{<9g54w}SCyHj3KF3$j z2f+@f8`IeRtaneTO zO}DM~YPe)%V@U6dY0fT>K5=WD87vzXp+S&RHJ)mkq~US#{(Y`$9i`V%Yh?)Y2l~NL zZZc;a0Q)t+vj(v~^TY%ny9>h?g==GZeX)KNuA9hvF$1@@@7MIS+t6V={QGbWvtzY` z@_dr^9&9GFzwV0@<)PA+=Zr6Y8r#(RaLAk;s%+Y8Ta<1a``8RoM71KR@NJgNS^x7l zy+?x$)b?rRnXt+?WL7wu9`gLf-%IG`9PHr6ejjI8^?9oMbqJA6&;3Z*W^QeAYmY5R z=Gab=9TbUXPA~4#oy7oPNAxUiv04Jyxe!wk|1C0sv^M z4QBCk+sLH7k#H3&W!6jS6uw5a8=rOOLNohpDqF0Z@S$sS@Ga8@dAe`5TC3q=nxhw` zz3NNjc^?h7wl^N!Kk{cZ=&kO%%NiZztxr(mo4{ooG`z_7{EcNAddQw`h2*_IP&qN2 zY#LChBl=F=dWsSEIA+X_5%+%443P07Twxr5DlK-7co9ZWMDj7A^FZ*Bw9-0d|*AZh0l4V^^SljI~{G=Cb`s4c-f{Cbr zCwRHmRI@Ra)yg`A{0eESrgdbAI9=et{-KY(Hp;~z0O^?`nzF4J4Bj6%HH%a3?khz0 zk%y94zC2r?)Tv|W(`L`WP&>+(gfnY0o08!tm)+Ft5r_%i?Ayyw5stBgXq1@L2bZf1 zD}Hd6JL3UiI+aSO`Yz^`OgY9HX(w+^4ddx_`781Ip#*nAj!36jf8J-_h}jrmYv0L; zzL^{Git_MY`W|yADPyVwfVE3P^$(3p-M19)?x{=a$q~vvK`A0{5~V!K&`Lomo+(C^ zuwX#gQG;R{c7#Ypm4xK2C!_irD2QasYSVY17xSz+BEVRG)~xFUHu;Lrtwn!CMC9Wi z_q9<5(HJDD5y=t*2K}&dXpjgiKd$Ig2950IMcjL%CrKnsY;N#8g6`dnYR07bdkXB_ z-|SmMWUrmeeQ86&PVzs#qK#V>67kD1$`G4V6Eg17Od`zm$lN# zMRTswNix_jIY{ym-6Y_|=VNf+539IS^z7ws^Cd8=DERInRdSG7j|R@8AS0}lZU;h? zg-DVFQssIxSVveVgtUl+%hXv%GT>NU*C0wgWAaGDxF^B4CRqTQCrxQLvRe?5i6`{c zgh-fZWrUd04z+0)#0`0gi@Vzz@qvtY#GlemYCYmw zm9d*85p;)3Opw=S+G-hFa>E^*5arvmGpr+C1vZzgSYHyS_1KlRn@emXWD#qxB0gN# zgAout$DM`2v2v%u3oA2Vg4m-z3GBUI)j-HLVtf#Rb$a=rXY(5ShoT6&Q66!XjJsXz zR%{TjPnU7Y0>wX8pQvWsm184U&3GQ7Dx=Flw?IKkO7i+B{LIOKbTut)^`}x~Bx8pk z3T6DsnT4g8N{}gQxR&^Eb=L|8TAue#0GzERe~vZgq-1<@`&o3fZ?bz18t_FPrMEsl z{>^N^cEmk&>n+s2I^<|3>J)G~Lvnf=5vfYa%ES=!CS#@_RQ!qN@j zJb35(2=J2xzXnr`{tXt{9{oP~-f&svgQFQmHq{4sv@IgdgHLYpk+W~%jM?84j_-js(|Hb>$ zT8+-1-zvmqvo4N@YcjbN{@HZ^4v zaspzZ@}G=i+){6H@S{`|Z&967IYggP^PO_$ghV}U&pxlCWhK1!X5We!ZtQfFI&c1bOA>!y0i876F>aJ)=P;4n()a6yB}g-Eq62#8N@dWHJnAGwJ*CIV4@3IT4bFo#HOxdfvkP05>zc#GMB5S1I~CgQ+#_Xh=x+0JK06k)WQ zyl3zFz9>vih>5Jq8B;#H;HaOYuJ`IgL3Ehrcrnp$tQ`bsCir*WEe7{K)2Fbl zXq-W0eamQtIc@h&658kG2Cg?o7o%T!omG7BKC-{`_}PV6^uz4h+b!M?k!Bs7i3|%I z94nMzRJc7L?!sL`yTwN_;Fgq6qD06Z6{)Pt4d(qx0?Bi?ZTkUZi2KR2JX@$&YjX!YSSWGWnWAhuN%0Zi zn`sMPh7~EyYrj4iD2C{5z!Q<7_a_zY`b#-;_YI{CrKiG3cr~U9IxO^;6r|+ zX%~Hdu7A$YJ3GvTp8Z%F?--O7-aUJFRuOF(5G%XiAJ2i74&G~T3OXHrj(Bx26yGQx z?BH|6qkj#U8z!d?`_i5*x`@=_p+c*WOXjze$=bhT?8NMncN&*FrY*%DtqgOlToM$} z2rd4XVEw{0|0O{GfSf< zpoAb+2ug@tF)0^_JrXD^goTPInCn9zmd>bR38y3U0o>jNb~Sx~gkJG8un-m*T^;3W z!Y3pyj%8*LHx-zpy~AH0jsKbm{00qw5u(4Z^nYQWzp0_nzp$ajq&gy8faKaI|DZ_p zLgDhHycrRb=0apIBmF%2jo5d}^h|)yU7c2&4|8AOuB+c!gV?WON|9pn2Ndj6P&{ZZ z8CY9g=LW826_pavT&2&wA6AKP<9XQ*zu$4+%*iZp#FIP7>_L+7xa94Ne4-j{Crs>a zC3fi;mmc4XKa>R*#V5tT#7|^*J>)*mzC$6TwcsV$?`qz}^&Qba^Nm7{k+zAaLiNjL zX%BD#UnzCX6{^8Au@PKY)Bz(=wt(H#G_=c zj!887;mI3)n!wIrjbyR^qj>&C&t_uSpo@x%0l{Dips0WVP)Jk&%lm|XjltOdYW&Lv z`|Ya#`zMUG7ZL|sT=~9AnU4p|4j=#&6b1niVBMY?STKQ6T|}3s|JMsqr=wN zzjcB_SXTHiJ8T{PM<)cvlJI}oiHQ7HCn|#Fu>Z0X6UH*&f9bFk``getPages())); } + + /** + * @see https://github.com/smalot/pdfparser/pull/797 + */ + public function testParseFileWithCompressedXrefObjectFromPdfJsCorpus(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest797.pdf'); + + self::assertCount(1, $document->getPages()); + } } From cc85357479fac7a8dbe0aa5eec76e321fadf3552 Mon Sep 17 00:00:00 2001 From: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> Date: Fri, 24 Apr 2026 12:05:48 -0300 Subject: [PATCH 06/10] test: clarify pull request fixture provenance Signed-off-by: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> --- ...PullRequest797.pdf => PullRequest797-pdf.js.pdf} | Bin ...fWhitespaceStart.pdf => PullRequest797-vera.pdf} | Bin .../PHPUnit/Integration/DocumentIssueFocusTest.php | 7 +++---- 3 files changed, 3 insertions(+), 4 deletions(-) rename samples/bugs/{PullRequest797.pdf => PullRequest797-pdf.js.pdf} (100%) rename samples/bugs/{PullRequestXrefWhitespaceStart.pdf => PullRequest797-vera.pdf} (100%) diff --git a/samples/bugs/PullRequest797.pdf b/samples/bugs/PullRequest797-pdf.js.pdf similarity index 100% rename from samples/bugs/PullRequest797.pdf rename to samples/bugs/PullRequest797-pdf.js.pdf diff --git a/samples/bugs/PullRequestXrefWhitespaceStart.pdf b/samples/bugs/PullRequest797-vera.pdf similarity index 100% rename from samples/bugs/PullRequestXrefWhitespaceStart.pdf rename to samples/bugs/PullRequest797-vera.pdf diff --git a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php index 6610a5dc4..69e07e0fb 100644 --- a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php +++ b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php @@ -36,7 +36,6 @@ namespace PHPUnitTests\Integration; use PHPUnitTests\TestCase; -use Smalot\PdfParser\Document; use Smalot\PdfParser\Parser; /** @@ -112,9 +111,9 @@ public function testPDFDocEncodingDecode(): void self::assertStringContainsString($testSubject, $details['Subject']); } - public function testParseFileWhenStartxrefPointsToLeadingWhitespace(): void + public function testParseFileWhenStartxrefPointsToLeadingWhitespaceInVeraPdfFixture(): void { - $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequestXrefWhitespaceStart.pdf'); + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest797-vera.pdf'); self::assertSame(1, count($document->getPages())); } @@ -124,7 +123,7 @@ public function testParseFileWhenStartxrefPointsToLeadingWhitespace(): void */ public function testParseFileWithCompressedXrefObjectFromPdfJsCorpus(): void { - $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest797.pdf'); + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest797-pdf.js.pdf'); self::assertCount(1, $document->getPages()); } From cbd0bbfc3bb8b26d2eb09b069e60c24925c99d63 Mon Sep 17 00:00:00 2001 From: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> Date: Sat, 25 Apr 2026 20:39:21 -0300 Subject: [PATCH 07/10] test(rawdata): keep PR796/797 regressions in RawDataParserTest only --- .../Integration/DocumentIssueFocusTest.php | 18 +----------------- .../Integration/RawData/RawDataParserTest.php | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php index 69e07e0fb..7c7fe7e68 100644 --- a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php +++ b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php @@ -36,6 +36,7 @@ namespace PHPUnitTests\Integration; use PHPUnitTests\TestCase; +use Smalot\PdfParser\Document; use Smalot\PdfParser\Parser; /** @@ -110,21 +111,4 @@ public function testPDFDocEncodingDecode(): void $testSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž'; self::assertStringContainsString($testSubject, $details['Subject']); } - - public function testParseFileWhenStartxrefPointsToLeadingWhitespaceInVeraPdfFixture(): void - { - $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest797-vera.pdf'); - - self::assertSame(1, count($document->getPages())); - } - - /** - * @see https://github.com/smalot/pdfparser/pull/797 - */ - public function testParseFileWithCompressedXrefObjectFromPdfJsCorpus(): void - { - $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest797-pdf.js.pdf'); - - self::assertCount(1, $document->getPages()); - } } diff --git a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php index 4784595f4..eec54df06 100644 --- a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php +++ b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php @@ -328,4 +328,21 @@ public function testParseFileWithCompressedObjRefInXrefStream(): void self::assertCount(1, $document->getPages()); } + + public function testParseFileWhenStartxrefPointsToLeadingWhitespaceInVeraPdfFixture(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest797-vera.pdf'); + + self::assertCount(1, $document->getPages()); + } + + /** + * @see https://github.com/smalot/pdfparser/pull/797 + */ + public function testParseFileWithCompressedXrefObjectFromPdfJsCorpus(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest797-pdf.js.pdf'); + + self::assertCount(1, $document->getPages()); + } } From 1f71566f19481ddeb9e3c7cf7621abed9fe2fd40 Mon Sep 17 00:00:00 2001 From: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> Date: Sat, 25 Apr 2026 21:35:26 -0300 Subject: [PATCH 08/10] test(rawdata): add fixture source @see links for PR796 --- tests/PHPUnit/Integration/RawData/RawDataParserTest.php | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php index eec54df06..9475e6613 100644 --- a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php +++ b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php @@ -321,6 +321,7 @@ public function testGetXrefDataTracksVisitedOffsets(): void * Ensure parser resolves compressed object references from xref streams. * * @see https://github.com/smalot/pdfparser/pull/796 + * @see https://github.com/veraPDF/veraPDF-corpus/blob/staging/PDF_A-1b/6.1%20File%20structure/6.1.2%20File%20header/veraPDF%20test%20suite%206-1-2-t01-fail-a.pdf */ public function testParseFileWithCompressedObjRefInXrefStream(): void { @@ -329,6 +330,9 @@ public function testParseFileWithCompressedObjRefInXrefStream(): void self::assertCount(1, $document->getPages()); } + /** + * @see https://github.com/veraPDF/veraPDF-corpus/blob/staging/PDF_A-1b/6.1%20File%20structure/6.1.2%20File%20header/veraPDF%20test%20suite%206-1-2-t01-fail-a.pdf + */ public function testParseFileWhenStartxrefPointsToLeadingWhitespaceInVeraPdfFixture(): void { $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest797-vera.pdf'); @@ -338,6 +342,7 @@ public function testParseFileWhenStartxrefPointsToLeadingWhitespaceInVeraPdfFixt /** * @see https://github.com/smalot/pdfparser/pull/797 + * @see https://github.com/mozilla/pdf.js/blob/master/test/pdfs/issue9252.pdf */ public function testParseFileWithCompressedXrefObjectFromPdfJsCorpus(): void { From 0cb2995f5a63b30b3ca31289cfea0ea212485620 Mon Sep 17 00:00:00 2001 From: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> Date: Sat, 25 Apr 2026 21:39:44 -0300 Subject: [PATCH 09/10] style(test): fix @see indentation in RawDataParserTest --- tests/PHPUnit/Integration/RawData/RawDataParserTest.php | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php index 9475e6613..cfcab1e3b 100644 --- a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php +++ b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php @@ -321,7 +321,7 @@ public function testGetXrefDataTracksVisitedOffsets(): void * Ensure parser resolves compressed object references from xref streams. * * @see https://github.com/smalot/pdfparser/pull/796 - * @see https://github.com/veraPDF/veraPDF-corpus/blob/staging/PDF_A-1b/6.1%20File%20structure/6.1.2%20File%20header/veraPDF%20test%20suite%206-1-2-t01-fail-a.pdf + * @see https://github.com/veraPDF/veraPDF-corpus/blob/staging/PDF_A-1b/6.1%20File%20structure/6.1.2%20File%20header/veraPDF%20test%20suite%206-1-2-t01-fail-a.pdf */ public function testParseFileWithCompressedObjRefInXrefStream(): void { @@ -342,7 +342,7 @@ public function testParseFileWhenStartxrefPointsToLeadingWhitespaceInVeraPdfFixt /** * @see https://github.com/smalot/pdfparser/pull/797 - * @see https://github.com/mozilla/pdf.js/blob/master/test/pdfs/issue9252.pdf + * @see https://github.com/mozilla/pdf.js/blob/master/test/pdfs/issue9252.pdf */ public function testParseFileWithCompressedXrefObjectFromPdfJsCorpus(): void { From 6e3695c410f819cbaa7ddae1d25fb5703801b734 Mon Sep 17 00:00:00 2001 From: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> Date: Mon, 27 Apr 2026 12:58:59 -0300 Subject: [PATCH 10/10] fix(rawdata): recover xref_command_missing in PR796 stack --- .../PullRequest815-xref-command-missing.pdf | Bin 0 -> 631 bytes .../PdfParser/RawData/RawDataParser.php | 110 +++++++++++++++++- .../Integration/RawData/RawDataParserTest.php | 10 ++ 3 files changed, 116 insertions(+), 4 deletions(-) create mode 100644 samples/bugs/PullRequest815-xref-command-missing.pdf diff --git a/samples/bugs/PullRequest815-xref-command-missing.pdf b/samples/bugs/PullRequest815-xref-command-missing.pdf new file mode 100644 index 0000000000000000000000000000000000000000..2795a146cca52620ee72d1d33f0ba44c97ed46a3 GIT binary patch literal 631 zcmZWm%TB{E5WMeK?4=SMisQ5n9I8t6kqaQIG!lo3gPXX)RcoWzkqSS_fj@vB!FEX~ zVEJH;XLe?Hdy84xrx%yd`}ll&2Z|Bu^%h|?0>11CqZy3nWu+`A%0b9)nP2 zVEX?~awAKFtAUq1wwh1|$W%AVB0ZIQUMF4fS~$YDT_%if>caVlD5-W|#}~z6NaP~8 zOYTG|`9vRZ6*)MLB4p7oYwxVAD{YXq!Zy3F)CHJuBZ3dfw3wkm_0)*#f6sde_e6R;bE8;qb~cjI9OV!z2glqhme;lG5 0; + + if (strpos($pdfData, 'xref', $bumpOffset) === $bumpOffset || $xrefSubsectionAtOffset) { + // No startxref stanza, but caller already points to an xref table/subsection. + $startxref = $bumpOffset; + } elseif (preg_match('/^[0-9]+[\s]+[0-9]+[\s]+obj/i', substr($pdfData, $bumpOffset, 32)) > 0) { + // No startxref stanza, but caller points to an xref stream object. + $startxref = $bumpOffset; + } else { + // No valid startxref table was found. Try to recover from nearby xref data + // or reconstruct a minimal xref from object headers plus trailer metadata. + $recoveredXref = $this->recoverXrefWithoutStartxref($pdfData); + if (!empty($recoveredXref)) { + return $recoveredXref; + } + + throw new \Exception('Unable to find startxref'); + } } elseif (0 == $offset) { // Use the last startxref in the document $startxref = (int) $startxrefMatches[\count($startxrefMatches) - 1][1]; - } elseif (strpos($pdfData, 'xref', $bumpOffset) == $bumpOffset) { + } elseif (strpos($pdfData, 'xref', $bumpOffset) === $bumpOffset) { // Already pointing at the xref table $startxref = $bumpOffset; - } elseif (preg_match('/([0-9]+[\s][0-9]+[\s]obj)/i', $pdfData, $matches, 0, $bumpOffset)) { + } elseif (preg_match('/^[0-9]+[\s]+[0-9]+[\s]+obj/i', substr($pdfData, $bumpOffset, 32)) > 0) { // Cross-Reference Stream object $startxref = $bumpOffset; } else { @@ -967,12 +986,95 @@ protected function getXrefData(string $pdfData, int $offset = 0, array $xref = [ } } if (empty($xref)) { + $recoveredXref = $this->recoverXrefWithoutStartxref($pdfData); + if (!empty($recoveredXref)) { + return $recoveredXref; + } + throw new \Exception('Unable to find xref'); } return $xref; } + /** + * Attempt to recover xref/trailer data when no valid startxref stanza exists. + */ + private function recoverXrefWithoutStartxref(string $pdfData): array + { + $trailerPos = strrpos($pdfData, 'trailer'); + $recoveredOffset = null; + + if (false !== $trailerPos) { + $searchStart = max(0, $trailerPos - 8192); + $searchChunk = substr($pdfData, $searchStart, $trailerPos - $searchStart); + $lastXrefPos = strrpos($searchChunk, 'xref'); + if (false !== $lastXrefPos) { + $candidateOffset = $searchStart + $lastXrefPos; + if ( + preg_match('/xref[\x09\x0a\x0c\x0d\x20]/', substr($pdfData, $candidateOffset, 5)) > 0 + && preg_match('/xref[\s]*[\r\n]+[0-9]+[\x20]+[0-9]+[\x20]*[\r\n]/A', substr($pdfData, $candidateOffset, 96)) > 0 + ) { + $recoveredOffset = $candidateOffset; + } + } + } + + if (null !== $recoveredOffset) { + return $this->getXrefData($pdfData, $recoveredOffset); + } + + $xref = ['xref' => [], 'trailer' => []]; + if ( + preg_match_all('/([0-9]+)[\x20]+([0-9]+)[\x20]+obj\b/i', $pdfData, $objMatches, \PREG_OFFSET_CAPTURE) > 0 + ) { + foreach ($objMatches[0] as $i => $fullMatch) { + $objNum = (int) $objMatches[1][$i][0]; + $genNum = (int) $objMatches[2][$i][0]; + $xref['xref'][$objNum.'_'.$genNum] = $fullMatch[1]; + } + + if (false !== $trailerPos) { + $trailerEnd = strpos($pdfData, '%%EOF', $trailerPos); + if (false === $trailerEnd) { + $trailerEnd = min( + \strlen($pdfData), + $trailerPos + 4096 + ); + } + $trailerData = substr($pdfData, $trailerPos, $trailerEnd - $trailerPos); + + if (preg_match('/\/?Size[\s]+([0-9]+)/i', $trailerData, $matches) > 0) { + $xref['trailer']['size'] = (int) $matches[1]; + } + if (preg_match('/\/?Root[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $matches) > 0) { + $xref['trailer']['root'] = (int) $matches[1].'_'.(int) $matches[2]; + } + if (preg_match('/\/?Encrypt[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $matches) > 0) { + $xref['trailer']['encrypt'] = (int) $matches[1].'_'.(int) $matches[2]; + } + if (preg_match('/\/?Info[\s]+([0-9]+)[\s]+([0-9]+)[\s]+R/i', $trailerData, $matches) > 0) { + $xref['trailer']['info'] = (int) $matches[1].'_'.(int) $matches[2]; + } + if (preg_match('/ID[\s]*[\[]\s*[<]([^>]*)[>][\s]*[<]([^>]*)[>]/i', $trailerData, $matches) > 0) { + $xref['trailer']['id'] = []; + $xref['trailer']['id'][0] = $matches[1]; + $xref['trailer']['id'][1] = $matches[2]; + } + } + } + + if (empty($xref['xref'])) { + return []; + } + + if (!isset($xref['trailer']['size'])) { + $xref['trailer']['size'] = \count($xref['xref']) + 1; + } + + return $xref; + } + /** * Parses PDF data and returns extracted data as array. * diff --git a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php index cfcab1e3b..7f94fd7a6 100644 --- a/tests/PHPUnit/Integration/RawData/RawDataParserTest.php +++ b/tests/PHPUnit/Integration/RawData/RawDataParserTest.php @@ -350,4 +350,14 @@ public function testParseFileWithCompressedXrefObjectFromPdfJsCorpus(): void self::assertCount(1, $document->getPages()); } + + /** + * @see https://github.com/mozilla/pdf.js/blob/master/test/pdfs/xref_command_missing.pdf + */ + public function testParseFileWhenXrefCommandIsMissingInPdfJsFixture(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequest815-xref-command-missing.pdf'); + + self::assertCount(1, $document->getPages()); + } }