Skip to content

Commit b8bdac8

Browse files
authored
Merge pull request #42 from nextras/fix/chunk-boundary-parsing
Fix chunked stream parsing across chunk boundaries
2 parents 4d1ef32 + 4aac4f2 commit b8bdac8

7 files changed

Lines changed: 560 additions & 19 deletions

src/MySqlMultiQueryParser.php

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ private function getQueryPattern(string $delimiter): string
3232
~
3333
(?:
3434
\\s
35-
| /\\* (?: [^*]++ | \\*(?!/) )*+ \\*/
35+
| /\\* (*PRUNE) (?: [^*]++ | \\*(?!/) )*+ \\*/
3636
| --[^\\n]*+(?:\\n|\\z)
3737
)*+
3838
@@ -47,9 +47,9 @@ private function getQueryPattern(string $delimiter): string
4747
(?<query>
4848
(?:
4949
[^$delimiterFirstBytePattern'\"/$-]++
50-
| ' (?: \\\\. | [^'] )*+ '
51-
| \" (?: \\\\. | [^\"] )*+ \"
52-
| /\\* (?: [^*]++ | \\*(?!/) )*+ \\*/
50+
| ' (*PRUNE) (?: \\\\. | [^'] )*+ '
51+
| \" (*PRUNE) (?: \\\\. | [^\"] )*+ \"
52+
| /\\* (*PRUNE) (?: [^*]++ | \\*(?!/) )*+ \\*/
5353
| --[^\\n]*+(?:\\n|\\z)
5454
| (?!$delimiterPattern) .
5555
)*+

src/PatternIterator.php

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,24 @@
1212

1313

1414
/**
15+
* Applies a regex pattern to a chunked string stream, yielding matches sequentially.
16+
*
17+
* Safety mechanism: when a match consumes all remaining data in the buffer and the stream
18+
* has more chunks, the match is held back (not yielded) until more data is loaded. This
19+
* prevents yielding incomplete matches at chunk boundaries.
20+
*
21+
* Pattern design constraint: patterns with opening/closing delimiter constructs (such as
22+
* string literals `'...'`, block comments `/*...* /`, or dollar-quoted strings `$$...$$`)
23+
* must include `(*PRUNE)` after the opening delimiter, e.g. `' (*PRUNE) [^']* '`.
24+
* Without this, when a chunk boundary falls inside such a construct, the closing delimiter
25+
* is absent from the buffer, the construct fails to match, and the regex falls back to a
26+
* generic single-character alternative (e.g. `(?!;) .`). This exposes characters inside the
27+
* construct (like semicolons inside a string) as false delimiters, producing an incorrect
28+
* match that terminates in the middle of the buffer — where the safety mechanism cannot
29+
* detect the problem. The `(*PRUNE)` verb ensures that once the opening delimiter matches,
30+
* the regex engine commits to the construct — if the closing delimiter is missing (because
31+
* it is in a later chunk), the overall match fails, causing the iterator to load more data.
32+
*
1533
* @implements IteratorAggregate<int, array<mixed>>
1634
*/
1735
class PatternIterator implements IteratorAggregate

src/PostgreSqlMultiQueryParser.php

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,20 +27,20 @@ private function getQueryPattern(): string
2727
return /** @lang PhpRegExp */ '~
2828
(?:
2929
\\s
30-
| /\\* (?: [^*]++ | \\*(?!/) )*+ \\*/
30+
| /\\* (*PRUNE) (?: [^*]++ | \\*(?!/) )*+ \\*/
3131
| -- [^\\n]*+
3232
)*+
3333
3434
(?:
3535
(?:
3636
(?<query>
3737
(?:
38-
[^;\'"/$-]++
39-
| \' (?: [^\'] )*+ \'
40-
| [eE]\' (?: \\\\. | [^\'] )*+ \'
41-
| " (?: [^"] )*+ "
42-
| /\\* (?: [^*]++ | \\*(?!/) )*+ \\*/
43-
| (\\$(?:[a-zA-Z_\\x80-\\xFF][\\w\\x80-\\xFF]*+)?\\$) (?: [^$]++ | (?!\\g{-1})\\$ )*+ \\g{-1}
38+
(?:[^;\'"/$eE-]|[eE](?!\'))++
39+
| \' (*PRUNE) (?: [^\'] )*+ \'
40+
| [eE]\' (*PRUNE) (?: \\\\. | [^\'] )*+ \'
41+
| " (*PRUNE) (?: [^"] )*+ "
42+
| /\\* (*PRUNE) (?: [^*]++ | \\*(?!/) )*+ \\*/
43+
| (\\$(?:[a-zA-Z_\\x80-\\xFF][\\w\\x80-\\xFF]*+)?\\$) (*PRUNE) (?: [^$]++ | (?!\\g{-1})\\$ )*+ \\g{-1}
4444
| -- [^\\n]*+
4545
| (?!;) .
4646
)*+

src/SqlServerMultiQueryParser.php

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,15 @@ private function getQueryPattern(): string
2424
$simpleQuery = /** @lang PhpRegExp */ '~
2525
(?:
2626
\\s
27-
| /\\* (?: [^*]++ | \\*(?!/) )*+ \\*/
27+
| /\\* (*PRUNE) (?: [^*]++ | \\*(?!/) )*+ \\*/
2828
| -- [^\\n]*+
2929
)*+
3030
(?<simplequery>
3131
(?:
3232
[^;\'"[/-]++
33-
| \' (?: [^\'] )*+ \'
34-
| " (?: [^"] )*+ "
35-
| /\\* (?: [^*]++ | \\*(?!/) )*+ \\*/
33+
| \' (*PRUNE) (?: [^\'] )*+ \'
34+
| " (*PRUNE) (?: [^"] )*+ "
35+
| /\\* (*PRUNE) (?: [^*]++ | \\*(?!/) )*+ \\*/
3636
| -- [^\\n]*+
3737
| (?!;) .
3838
)++
@@ -42,7 +42,7 @@ private function getQueryPattern(): string
4242
return /** @lang PhpRegExp */ '~
4343
(?:
4444
\\s
45-
| /\\* (?: [^*]++ | \\*(?!/) )*+ \\*/
45+
| /\\* (*PRUNE) (?: [^*]++ | \\*(?!/) )*+ \\*/
4646
| -- [^\\n]*+
4747
)*+
4848
@@ -51,9 +51,9 @@ private function getQueryPattern(): string
5151
(?<query>
5252
(?:
5353
[^B;\'"[/-]++
54-
| \' (?: [^\'] )*+ \'
55-
| " (?: [^"] )*+ "
56-
| /\\* (?: [^*]++ | \\*(?!/) )*+ \\*/
54+
| \' (*PRUNE) (?: [^\'] )*+ \'
55+
| " (*PRUNE) (?: [^"] )*+ "
56+
| /\\* (*PRUNE) (?: [^*]++ | \\*(?!/) )*+ \\*/
5757
| BEGIN (?: \s*END\s*| ' . substr($simpleQuery, 1, -2) . ')*
5858
| -- [^\\n]*+
5959
| (?!;) .

tests/cases/MySqlMultiQueryParserTest.phpt

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
namespace Nextras\MultiQueryParser;
88

99
use LogicException;
10+
use Nextras\MultiQueryParser\Exception\RuntimeException;
1011
use Tester\Assert;
1112
use Tester\TestCase;
1213

@@ -189,6 +190,165 @@ class MySqlMultiQueryParserTest extends TestCase
189190
],
190191
];
191192
}
193+
194+
195+
/**
196+
* @dataProvider provideEdgeCasesData
197+
* @param list<string> $expectedQueries
198+
*/
199+
public function testEdgeCases(string $content, array $expectedQueries): void
200+
{
201+
$parser = new MySqlMultiQueryParser();
202+
$queries = iterator_to_array($parser->parseString($content));
203+
Assert::same($expectedQueries, $queries);
204+
}
205+
206+
207+
public function testParseFileThrowsOnNonExistentFile(): void
208+
{
209+
$parser = new MySqlMultiQueryParser();
210+
Assert::exception(function () use ($parser) {
211+
$parser->parseFile(__DIR__ . '/data/nonexistent.sql');
212+
}, RuntimeException::class);
213+
}
214+
215+
216+
/**
217+
* @return list<array{string, list<string>}>
218+
*/
219+
protected function provideEdgeCasesData(): array
220+
{
221+
return [
222+
// Empty / whitespace-only input
223+
['', []],
224+
[" \n\t\n ", []],
225+
226+
// Semicolons inside string literals are not delimiters
227+
["SELECT 'a;b';", ["SELECT 'a;b'"]],
228+
['SELECT "a;b";', ['SELECT "a;b"']],
229+
["SELECT 'a;b', 'c;d';", ["SELECT 'a;b', 'c;d'"]],
230+
["SELECT ';;;';", ["SELECT ';;;'"]],
231+
["SELECT '';", ["SELECT ''"]],
232+
233+
// Backslash escaping in strings
234+
["SELECT 'it\\'s';", ["SELECT 'it\\'s'"]],
235+
['SELECT "col\\"name";', ['SELECT "col\\"name"']],
236+
["SELECT '\\;';", ["SELECT '\\;'"]],
237+
238+
// Semicolons inside comments are not delimiters
239+
["SELECT /* ; */ 1;", ["SELECT /* ; */ 1"]],
240+
["SELECT /* ; ; ; */ 1;", ["SELECT /* ; ; ; */ 1"]],
241+
242+
// Line comment between queries (semicolon before comment)
243+
["SELECT 1; -- has ; in comment\nSELECT 2;", ["SELECT 1", "SELECT 2"]],
244+
245+
// Line comment inside a query captures everything until next real delimiter
246+
["SELECT 1 -- comment with ;\nSELECT 2;", ["SELECT 1 -- comment with ;\nSELECT 2"]],
247+
248+
// Queries without trailing semicolon (terminated by end of input)
249+
["SELECT 1", ["SELECT 1"]],
250+
["SELECT 1; SELECT 2", ["SELECT 1", "SELECT 2"]],
251+
252+
// Forward slash and dash that do not start a comment
253+
["SELECT 5/3;", ["SELECT 5/3"]],
254+
["SELECT 5-3;", ["SELECT 5-3"]],
255+
["SELECT 1 / 2 + 3 - 4;", ["SELECT 1 / 2 + 3 - 4"]],
256+
257+
// Comment positioning
258+
["/* prefix */ SELECT 1;", ["SELECT 1"]],
259+
["-- prefix\nSELECT 1;", ["SELECT 1"]],
260+
["SELECT 1; /* between */ SELECT 2;", ["SELECT 1", "SELECT 2"]],
261+
["SELECT 1; -- between\nSELECT 2;", ["SELECT 1", "SELECT 2"]],
262+
263+
// Only comments (no queries)
264+
["/* only a comment */", []],
265+
["-- only a comment", []],
266+
["-- line 1\n-- line 2\n", []],
267+
["/* c1 */ /* c2 */", []],
268+
269+
// Block comment edge cases
270+
["SELECT /* contains * star */ 1;", ["SELECT /* contains * star */ 1"]],
271+
["SELECT /* contains /* slash-star */ 1;", ["SELECT /* contains /* slash-star */ 1"]],
272+
273+
// DELIMITER is case-insensitive
274+
[
275+
"delimiter //\nSELECT 1//\nDELIMITER ;",
276+
["SELECT 1"],
277+
],
278+
[
279+
"Delimiter //\nSELECT 1//\nDELIMITER ;",
280+
["SELECT 1"],
281+
],
282+
283+
// CRLF line endings
284+
["SELECT 1;\r\nSELECT 2;\r\n", ["SELECT 1", "SELECT 2"]],
285+
286+
// Consecutive string literals
287+
["SELECT 'a' 'b';", ["SELECT 'a' 'b'"]],
288+
289+
// Whitespace variations
290+
["SELECT\t1;", ["SELECT\t1"]],
291+
["\n\nSELECT 1;\n\n", ["SELECT 1"]],
292+
];
293+
}
294+
295+
296+
/**
297+
* @dataProvider provideChunkBoundaryData
298+
* @param list<string> $chunks
299+
* @param list<string> $expectedQueries
300+
*/
301+
public function testChunkBoundary(array $chunks, array $expectedQueries): void
302+
{
303+
$parser = new MySqlMultiQueryParser();
304+
$queries = iterator_to_array($parser->parseStringStream(new \ArrayIterator($chunks)));
305+
Assert::same($expectedQueries, $queries);
306+
}
307+
308+
309+
/**
310+
* @return list<array{list<string>, list<string>}>
311+
*/
312+
protected function provideChunkBoundaryData(): array
313+
{
314+
// The bug triggers when a chunk boundary falls inside a string/comment
315+
// that contains a semicolon, AND there is content after the semicolon
316+
// still within the same chunk. The `;` falsely acts as a delimiter and
317+
// the remaining content prevents the match from reaching end-of-buffer
318+
// (which would trigger PatternIterator's safety mechanism).
319+
return [
320+
// Single-quoted string: chunk has content after the false `;`
321+
[
322+
["SELECT 'a;b", "c';"],
323+
["SELECT 'a;bc'"],
324+
],
325+
// Double-quoted string: same issue
326+
[
327+
['SELECT "a;b', 'c";'],
328+
['SELECT "a;bc"'],
329+
],
330+
// Block comment: chunk has content after the false `;`
331+
[
332+
["SELECT /* a;b", "c */ 1;"],
333+
["SELECT /* a;bc */ 1"],
334+
],
335+
// Multiple queries — string spans chunk, content after `;`
336+
[
337+
["SELECT 'x;y", "z'; SELECT 2;"],
338+
["SELECT 'x;yz'", "SELECT 2"],
339+
],
340+
// Block comment in leading whitespace spanning chunks
341+
[
342+
["/* x;y", "z */ SELECT 1;"],
343+
["SELECT 1"],
344+
],
345+
// String with backslash escaping spanning chunks
346+
[
347+
["SELECT 'a;b\\'", "c';"],
348+
["SELECT 'a;b\\'c'"],
349+
],
350+
];
351+
}
192352
}
193353

194354

0 commit comments

Comments
 (0)