@@ -46,10 +46,10 @@ class CharSequenceUtf8Test {
4646
// Passes only if the call below throws IllegalArgumentException (declared via `expected`).
// This change raises the rejected byte size from 2 to 5.
// NOTE(review): presumably 6 is now the smallest accepted size (room for one surrogate
// pair at 3 bytes per surrogate) — confirm against chunkOnModifiedUtf8ByteSize.
4747 @Test(expected = IllegalArgumentException ::class )
4848 fun testChunkTooSmall () {
49- " \u0000 " .chunkOnModifiedUtf8ByteSize(2 )
49+ " \u0000 " .chunkOnModifiedUtf8ByteSize(5 )
5050 }
5151
// Byte size passed to chunkOnModifiedUtf8ByteSize (and to checkModifiedUtf8ByteArraySize)
// by the tests below that do not use a literal; this change raises it from 5 to 6.
52- private val chunkSize = 5
52+ private val chunkSize = 6
5353
5454 @Test fun testOneNullByte () {
5555 val source = " \u0000 "
@@ -61,31 +61,33 @@ class CharSequenceUtf8Test {
6161 }
6262
// Chunks a string of three NUL characters with the shared chunkSize.
// With the old size the test expected two chunks (lengths 2 and 1); it now expects one chunk of length 3.
// NOTE(review): the spaces just inside the quotes of the literal look like artifacts of this
// diff rendering — the expected length of 3 only matches a source without them.
6363 @Test fun testChunk () {
64+ // 3 null chars, 2 bytes each = 6 bytes total, fits exactly in one chunk
6465 val source = " \u0000\u0000\u0000 "
6566 val testResult = source.chunkOnModifiedUtf8ByteSize(chunkSize)
66- assertEquals(testResult.size, 2 )
67- assertEquals(testResult[0 ].length, 2 )
68- assertEquals(testResult[1 ].length, 1 )
// The replacement assertions also switch to (expected, actual) argument order.
67+ assertEquals(1 , testResult.size)
68+ assertEquals(3 , testResult[0 ].length)
// presumably checks that the chunks reassemble to `source` — helper body not visible here
6969 assertTrue(compareSourceAndChunked(source, testResult))
// presumably checks that every chunk encodes to at most chunkSize bytes — verify against the helper
7070 assertTrue(testResult.checkModifiedUtf8ByteArraySize(chunkSize))
7171 }
7272
// Chunks one NUL character followed by "123" and expects a single chunk holding all 4 chars.
// Renamed from testExactly5 to testExactly6 in this change; the input itself is unchanged.
73- @Test fun testExactly5 () {
73+ @Test fun testExactly6 () {
74+ // null (2 bytes) + "123" (3 bytes) = 5 bytes, fits in one chunk
// NOTE(review): the new name says "Exactly6" but, per the byte count above, the input is
// 5 bytes — one byte under chunkSize. The exact-fit case is exercised by test6BytesExact;
// consider renaming this test or extending the input.
7475 val source = " \u0000 123"
7576 val testResult = source.chunkOnModifiedUtf8ByteSize(chunkSize)
76- assertEquals(testResult.size, 1 )
77- assertEquals(testResult[0 ].length, 4 )
// Same expected values as before; only the argument order changes to (expected, actual).
77+ assertEquals(1 , testResult.size)
78+ assertEquals(4 , testResult[0 ].length)
7879 assertTrue(compareSourceAndChunked(source, testResult))
7980 assertTrue(testResult.checkModifiedUtf8ByteArraySize(chunkSize))
7981 }
8182
// Chunks "1234" followed by one NUL character with the shared chunkSize.
// Formerly test5CharsOverflow, which expected a split into lengths 4 and 1;
// it now expects a single chunk of length 5.
82- @Test fun test5CharsOverflow () {
83+ @Test fun test6BytesExact () {
84+ // "1234" (4 bytes) + null (2 bytes) = 6 bytes, fits exactly
8385 val source = " 1234\u0000 "
8486 val testResult = source.chunkOnModifiedUtf8ByteSize(chunkSize)
85- assertEquals(testResult.size, 2 )
86- assertEquals(testResult[0 ].length, 4 )
87- assertEquals(testResult[1 ].length, 1 )
87+ assertEquals(1 , testResult.size)
88+ assertEquals(5 , testResult[0 ].length)
8889 assertTrue(compareSourceAndChunked(source, testResult))
// Newly added in this change: the old version of this test had no byte-size assertion.
90+ assertTrue(testResult.checkModifiedUtf8ByteArraySize(chunkSize))
8991 }
9092
9193 @Test fun test3ByteCharFit () {
@@ -97,27 +99,28 @@ class CharSequenceUtf8Test {
9799 assertTrue(testResult.checkModifiedUtf8ByteArraySize(chunkSize))
98100 }
99101
// Chunks "1", the euro sign and one NUL character with the shared chunkSize.
// Formerly test3ByteCharOverflow, which expected a split into lengths 2 and 1;
// it now expects a single chunk of length 3.
100- @Test fun test3ByteCharOverflow () {
102+ @Test fun test3ByteCharFitsExactly () {
103+ // "1" (1 byte) + € (3 bytes) + null (2 bytes) = 6 bytes, fits exactly
101104 val source = " 1€\u0000 "
102105 val testResult = source.chunkOnModifiedUtf8ByteSize(chunkSize)
103- assertEquals(testResult.size, 2 )
104- assertEquals(testResult[0 ].length, 2 )
105- assertEquals(testResult[1 ].length, 1 )
106+ assertEquals(1 , testResult.size)
107+ assertEquals(3 , testResult[0 ].length)
106108 assertTrue(compareSourceAndChunked(source, testResult))
107109 assertTrue(testResult.checkModifiedUtf8ByteArraySize(chunkSize))
108110 }
109111
// Chunks the ten ASCII digits with the shared chunkSize and expects two chunks.
110112 @Test fun testNormalAscii () {
113+ // 10 ASCII chars = 10 bytes, with chunkSize=6: first 6 chars, then 4 chars
111114 val source = " 0123456789"
112115 val testResult = source.chunkOnModifiedUtf8ByteSize(chunkSize)
113- assertEquals(testResult.size, 2 )
114- assertEquals(testResult[0 ].length, chunkSize )
115- assertEquals(testResult[1 ].length, chunkSize )
// The old assertions expected both chunks to have length chunkSize (5 + 5 = 10 chars).
// The new ones hard-code 6 and 4, so they must be revisited if chunkSize changes again.
116+ assertEquals(2 , testResult.size)
117+ assertEquals(6 , testResult[0 ].length)
118+ assertEquals(4 , testResult[1 ].length)
116119 assertTrue(compareSourceAndChunked(source, testResult))
117120 assertTrue(testResult.checkModifiedUtf8ByteArraySize(chunkSize))
118121 }
119122
120- @Test fun testLargeRandomUTf8StringChunk5 () {
123+ @Test fun testLargeRandomUTf8StringChunk6 () {
121124 val testResult = random1000Utf8Chars.chunkOnModifiedUtf8ByteSize(chunkSize)
122125 assertTrue(compareSourceAndChunked(random1000Utf8Chars, testResult))
123126 assertTrue(testResult.checkModifiedUtf8ByteArraySize(chunkSize))
@@ -135,6 +138,72 @@ class CharSequenceUtf8Test {
135138 assertTrue(testResult.checkModifiedUtf8ByteArraySize(100 ))
136139 }
137140
// New test: a surrogate pair that does not fit next to the preceding character must be
// moved to its own chunk as a unit instead of being split between two chunks.
// Uses the literal size 6 rather than the shared chunkSize property.
// NOTE(review): the spaces inside the string literals below look like artifacts of this
// diff rendering; the expected chunk count of 3 only matches literals without them.
141+ @Test fun testSurrogatePairKeptTogether () {
142+ // Emoji 🎉 (U+1F389) is represented as surrogate pair \uD83C\uDF89
143+ // Each surrogate is 3 bytes in Modified UTF-8, so the pair is 6 bytes
144+ val source = " a\uD83C\uDF89 b" // "a🎉b"
145+ val testResult = source.chunkOnModifiedUtf8ByteSize(6 )
146+ // With chunkSize=6: "a" (1 byte) + surrogate pair (6 bytes) = 7 bytes > 6
147+ // So we expect: ["a", "🎉", "b"] - surrogate pair must stay together
148+ assertEquals(3 , testResult.size)
149+ assertEquals(" a" , testResult[0 ])
150+ assertEquals(" \uD83C\uDF89 " , testResult[1 ]) // The emoji as a unit
151+ assertEquals(" b" , testResult[2 ])
152+ assertTrue(compareSourceAndChunked(source, testResult))
153+ // Verify each chunk is valid Unicode (no lone surrogates)
// Walks every char of every chunk: a high surrogate must be directly followed by a low
// surrogate inside the same chunk, and a low surrogate directly preceded by a high one.
// The bounds assertion runs before the neighbour is indexed, so a lone surrogate at a
// chunk edge fails with the descriptive message rather than an index error.
154+ testResult.forEach { chunk ->
155+ chunk.toString().toCharArray().forEachIndexed { index, char ->
156+ if (char.isHighSurrogate()) {
157+ assertTrue(" High surrogate at end of chunk" , index + 1 < chunk.length)
158+ assertTrue(" High surrogate not followed by low surrogate" , chunk[index + 1 ].isLowSurrogate())
159+ }
160+ if (char.isLowSurrogate()) {
161+ assertTrue(" Low surrogate at start of chunk" , index > 0 )
162+ assertTrue(" Low surrogate not preceded by high surrogate" , chunk[index - 1 ].isHighSurrogate())
163+ }
164+ }
165+ }
166+ }
167+
// New test: two adjacent surrogate pairs chunked with the literal size 6 must come back
// as two chunks, one complete pair each.
// Unlike the older tests, this one does not call checkModifiedUtf8ByteArraySize.
168+ @Test fun testMultipleSurrogatePairs () {
169+ // Multiple emoji: 🎉🎊 (two surrogate pairs = 12 bytes)
170+ val source = " \uD83C\uDF89\uD83C\uDF8A "
171+ val testResult = source.chunkOnModifiedUtf8ByteSize(6 )
172+ // Each pair is 6 bytes, so with chunkSize=6 we get 2 chunks
173+ assertEquals(2 , testResult.size)
174+ assertEquals(" \uD83C\uDF89 " , testResult[0 ])
175+ assertEquals(" \uD83C\uDF8A " , testResult[1 ])
176+ assertTrue(compareSourceAndChunked(source, testResult))
177+ }
178+
// New test: two ASCII chars followed by a surrogate pair, chunked with the literal size 8,
// must come back as a single chunk equal to the whole source.
179+ @Test fun testSurrogatePairFitsWithOtherChars () {
180+ // "ab🎉" = 1 + 1 + 6 = 8 bytes, with chunkSize=8 should fit in one chunk
181+ val source = " ab\uD83C\uDF89 "
182+ val testResult = source.chunkOnModifiedUtf8ByteSize(8 )
183+ assertEquals(1 , testResult.size)
184+ assertEquals(source, testResult[0 ])
185+ assertTrue(compareSourceAndChunked(source, testResult))
186+ }
187+
// New test: three ASCII chars followed by a surrogate pair, chunked with the literal size 8.
// The pair would push the first chunk past the limit, so it must start a chunk of its own.
188+ @Test fun testSurrogatePairOverflow () {
189+ // "abc🎉" = 3 + 6 = 9 bytes, with chunkSize=8: "abc" then "🎉"
190+ val source = " abc\uD83C\uDF89 "
191+ val testResult = source.chunkOnModifiedUtf8ByteSize(8 )
192+ assertEquals(2 , testResult.size)
193+ assertEquals(" abc" , testResult[0 ])
194+ assertEquals(" \uD83C\uDF89 " , testResult[1 ])
195+ assertTrue(compareSourceAndChunked(source, testResult))
196+ }
197+
// New test: a surrogate pair with one ASCII char on each side, chunked with the literal
// size 8, must come back as a single chunk equal to the whole source.
// Same input as testSurrogatePairKeptTogether, but with a limit large enough to avoid a split.
198+ @Test fun testSurrogatePairInMiddle () {
199+ // "a🎉b" = 1 + 6 + 1 = 8 bytes, with chunkSize=8 should fit in one chunk
200+ val source = " a\uD83C\uDF89 b"
201+ val testResult = source.chunkOnModifiedUtf8ByteSize(8 )
202+ assertEquals(1 , testResult.size)
203+ assertEquals(source, testResult[0 ])
204+ assertTrue(compareSourceAndChunked(source, testResult))
205+ }
206+
138207 fun compareSourceAndChunked (
139208 source : CharSequence ,
140209 chunked : List <CharSequence >,
0 commit comments