From 073e5d6642b1cf4becb21c696fe563ae6aa8d53d Mon Sep 17 00:00:00 2001 From: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> Date: Fri, 24 Apr 2026 00:15:17 -0300 Subject: [PATCH] fix: accept multi-space xref subsection entries Signed-off-by: Vitor Mattos <1079143+vitormattos@users.noreply.github.com> --- .../PullRequestXrefSubsectionMultipleSpaces.pdf | Bin 0 -> 2541 bytes src/Smalot/PdfParser/RawData/RawDataParser.php | 2 +- .../Integration/DocumentIssueFocusTest.php | 7 +++++++ 3 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 samples/bugs/PullRequestXrefSubsectionMultipleSpaces.pdf diff --git a/samples/bugs/PullRequestXrefSubsectionMultipleSpaces.pdf b/samples/bugs/PullRequestXrefSubsectionMultipleSpaces.pdf new file mode 100644 index 0000000000000000000000000000000000000000..508c197476937691574c8152d80bc4bdf6dbec27 GIT binary patch literal 2541 zcmbtWU2fwx5N^>o0lmY(ivS4RI{W*d&wp8r0fl`2ht=;}WRkA5 zkU%CYu0GZ|6B+lQ=xgn zHMhv!X09vK@6+o8t{*gz`)|vF#~+1;xmc#zWvc~ga@qRLo6>195|8P-7b)&u-rvVB zALAfty&YIqzYNQDvBoe^u2-1~%gYu=HX_7#Vrv|>B;@Jo zsr}Sx=Wd1+_rje)2!BB5fL7COx{FqnLvZ?qv=2QQKZH+NcB56`lLcrQPEV$GI)+fzRjJIEp^K6mJHP2!(xZ1llt%e;jbQ<*j1LZy6 zOtq`pwc~tk=n&TKG{}axGEX+Kkb{o>PNd66tPfP_vgUS!u|6 zE#=orzU51t2e({`OoM|$pIKxi^WrKm;eq0}7kJ>i&Ib!cpQQYReI``Ck+D#K^9BbB z8ReNqI0e-pni<=6l$Zd!!BkHY6JJ^6mo!oEVCc;-(kR~`8`jv!43+)PRIdb_i?R^0 z7746{QV0_Q4kLvoqvA)w6Cp!T8fBPfOS2i5Alt0xI1YK1faKXyK$dGz0>z5Qo51`T zI4JbT01Bs;XkYBB;~94NCB>f*)bBM>+8}BczQ$7uwOQ#9aG0!4!}?#slJ}y-I+)y> zRWx2lG13eZ#YqDEG;r?}aGk{Abd1vxuFi`z2FgyOc2A+!IUG=p(-CUC;u|HPYctz@ zg}ke2oqSFFfMyQi2o2{H8G;rH4|0=g?1NO{q36=B0@#`|+mzgH)W&Tx_slIRar-u( zYmV=)s&2OIBUe}6%|@@%%(U;WVYfKjPN{I}_d^2BySbLY2rtrAzBGM?TG)wj2M$}R zH-*TCb#K=?!7Dx0C>FEpnm^-(1>~;k+?}SE0w)UI*1syTLo+I_uWqQW^=4a@QYaIbq3HMIpe_f9XgtLkvE=n>DAb|>M?iX)1KE0 Yt_B~>OsR%fbu12JzIApszL{A60{WxZ%m4rY literal 0 HcmV?d00001 diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index ec8d01e5..0df21551 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -169,7 +169,7 @@ protected function decodeXref(string $pdfData, int $startxref, array $xref = [], // initialize object number $obj_num = 0; // search for cross-reference entries or subsection - while (preg_match('/([0-9]+)[\x20]([0-9]+)[\x20]?([nf]?)(\r\n|[\x20]?[\r\n])/', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0) { + while (preg_match('/([0-9]+)[\x20]+([0-9]+)[\x20]*([nf]?)(\r\n|[\x20]*[\r\n])/', $pdfData, $matches, \PREG_OFFSET_CAPTURE, $offset) > 0) { if ($matches[0][1] != $offset) { // we are on another section break; diff --git a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php index 7c7fe7e6..3d5d3de5 100644 --- a/tests/PHPUnit/Integration/DocumentIssueFocusTest.php +++ b/tests/PHPUnit/Integration/DocumentIssueFocusTest.php @@ -111,4 +111,11 @@ public function testPDFDocEncodingDecode(): void $testSubject = '•†‡…—–ƒ⁄‹›−‰„“”‘’‚™ŁŒŠŸŽıłœšž'; self::assertStringContainsString($testSubject, $details['Subject']); } + + public function testParseFileWithXrefSubsectionHavingMultipleSpaces(): void + { + $document = (new Parser())->parseFile($this->rootDir.'/samples/bugs/PullRequestXrefSubsectionMultipleSpaces.pdf'); + + self::assertCount(1, $document->getPages()); + } }