Skip to content

Commit 5c68ff0

Browse files
rossbacher and Andreas Rossbacher authored
fix chunking algo to work with surrogate pairs (#389)
Co-authored-by: Andreas Rossbacher <andreas.rossbacher@airbnb.com>
1 parent f5d1bb9 commit 5c68ff0

2 files changed

Lines changed: 115 additions & 26 deletions

File tree

deeplinkdispatch-base/src/main/java/com/airbnb/deeplinkdispatch/base/Utils.kt

Lines changed: 26 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,8 @@ import java.io.InputStream
1313
* - U+0001 to U+007F are encoded as 1 byte
1414
* - U+0080 to U+07FF are encoded as 2 bytes
1515
* - U+0800 to U+FFFF are encoded as 3 bytes
16+
*
17+
* Note: Surrogate pairs (for characters > U+FFFF) are encoded as two 3-byte sequences (6 bytes total).
1618
*/
1719
private fun Char.modifiedUtf8ByteSize(): Int {
1820
val codePoint = this.code
@@ -25,31 +27,49 @@ private fun Char.modifiedUtf8ByteSize(): Int {
2527
}
2628

2729
/**
28-
* Chunk a CharSequence based on how long it's Modified UTF-8
30+
* Chunk a CharSequence based on how long its Modified UTF-8
2931
* (https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8) ByteArray representation would be.
3032
*
3133
* This implementation uses O(n) time complexity by calculating byte sizes incrementally
3234
* instead of creating substrings for each character.
35+
*
36+
* Surrogate pairs (characters outside the BMP) are kept together to ensure valid Unicode strings.
3337
*/
3438
fun CharSequence.chunkOnModifiedUtf8ByteSize(chunkSize: Int): List<CharSequence> {
35-
require(chunkSize >= 3) {
36-
"UTF-8 chars can be up to 3 bytes wide. Minumum chunk size is 3 bytes."
39+
require(chunkSize >= 6) {
40+
"Surrogate pairs require 6 bytes in Modified UTF-8. Minimum chunk size is 6 bytes."
3741
}
3842
val result = mutableListOf<CharSequence>()
3943
var nextChunkStart = 0
4044
var currentChunkByteSize = 0
45+
var i = 0
46+
47+
while (i < this.length) {
48+
val char = this[i]
49+
val charByteSize: Int
50+
val charsToAdvance: Int
4151

42-
for (i in 0 until this.length) {
43-
val charByteSize = this[i].modifiedUtf8ByteSize()
52+
// Check for surrogate pair - keep them together to ensure valid Unicode
53+
if (char.isHighSurrogate() && i + 1 < this.length && this[i + 1].isLowSurrogate()) {
54+
// Surrogate pair: 6 bytes total (3 + 3)
55+
charByteSize = 6
56+
charsToAdvance = 2
57+
} else {
58+
charByteSize = char.modifiedUtf8ByteSize()
59+
charsToAdvance = 1
60+
}
4461

45-
// See if this char would still fit into the chunk. If not, create chunk and start next one.
62+
// See if this char (or surrogate pair) would still fit into the chunk.
63+
// If not, create chunk and start next one.
4664
if (currentChunkByteSize + charByteSize > chunkSize) {
4765
result.add(this.subSequence(nextChunkStart, i))
4866
nextChunkStart = i
4967
currentChunkByteSize = charByteSize
5068
} else {
5169
currentChunkByteSize += charByteSize
5270
}
71+
72+
i += charsToAdvance
5373
}
5474
// If there was a chunk that we started but did not add yet, add the rest.
5575
if (nextChunkStart != length) {

deeplinkdispatch-base/src/test/java/com/airbnb/deeplinkdispatch/CharSequenceUtf8Test.kt

Lines changed: 89 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -46,10 +46,10 @@ class CharSequenceUtf8Test {
4646

4747
@Test(expected = IllegalArgumentException::class)
4848
fun testChunkTooSmall() {
49-
"\u0000".chunkOnModifiedUtf8ByteSize(2)
49+
"\u0000".chunkOnModifiedUtf8ByteSize(5)
5050
}
5151

52-
private val chunkSize = 5
52+
private val chunkSize = 6
5353

5454
@Test fun testOneNullByte() {
5555
val source = "\u0000"
@@ -61,31 +61,33 @@ class CharSequenceUtf8Test {
6161
}
6262

6363
@Test fun testChunk() {
64+
// 3 null chars, 2 bytes each = 6 bytes total, fits exactly in one chunk
6465
val source = "\u0000\u0000\u0000"
6566
val testResult = source.chunkOnModifiedUtf8ByteSize(chunkSize)
66-
assertEquals(testResult.size, 2)
67-
assertEquals(testResult[0].length, 2)
68-
assertEquals(testResult[1].length, 1)
67+
assertEquals(1, testResult.size)
68+
assertEquals(3, testResult[0].length)
6969
assertTrue(compareSourceAndChunked(source, testResult))
7070
assertTrue(testResult.checkModifiedUtf8ByteArraySize(chunkSize))
7171
}
7272

73-
@Test fun testExactly5() {
73+
@Test fun testExactly6() {
74+
// null (2 bytes) + "123" (3 bytes) = 5 bytes, fits in one chunk
7475
val source = "\u0000123"
7576
val testResult = source.chunkOnModifiedUtf8ByteSize(chunkSize)
76-
assertEquals(testResult.size, 1)
77-
assertEquals(testResult[0].length, 4)
77+
assertEquals(1, testResult.size)
78+
assertEquals(4, testResult[0].length)
7879
assertTrue(compareSourceAndChunked(source, testResult))
7980
assertTrue(testResult.checkModifiedUtf8ByteArraySize(chunkSize))
8081
}
8182

82-
@Test fun test5CharsOverflow() {
83+
@Test fun test6BytesExact() {
84+
// "1234" (4 bytes) + null (2 bytes) = 6 bytes, fits exactly
8385
val source = "1234\u0000"
8486
val testResult = source.chunkOnModifiedUtf8ByteSize(chunkSize)
85-
assertEquals(testResult.size, 2)
86-
assertEquals(testResult[0].length, 4)
87-
assertEquals(testResult[1].length, 1)
87+
assertEquals(1, testResult.size)
88+
assertEquals(5, testResult[0].length)
8889
assertTrue(compareSourceAndChunked(source, testResult))
90+
assertTrue(testResult.checkModifiedUtf8ByteArraySize(chunkSize))
8991
}
9092

9193
@Test fun test3ByteCharFit() {
@@ -97,27 +99,28 @@ class CharSequenceUtf8Test {
9799
assertTrue(testResult.checkModifiedUtf8ByteArraySize(chunkSize))
98100
}
99101

100-
@Test fun test3ByteCharOverflow() {
102+
@Test fun test3ByteCharFitsExactly() {
103+
// "1" (1 byte) + € (3 bytes) + null (2 bytes) = 6 bytes, fits exactly
101104
val source = "1€\u0000"
102105
val testResult = source.chunkOnModifiedUtf8ByteSize(chunkSize)
103-
assertEquals(testResult.size, 2)
104-
assertEquals(testResult[0].length, 2)
105-
assertEquals(testResult[1].length, 1)
106+
assertEquals(1, testResult.size)
107+
assertEquals(3, testResult[0].length)
106108
assertTrue(compareSourceAndChunked(source, testResult))
107109
assertTrue(testResult.checkModifiedUtf8ByteArraySize(chunkSize))
108110
}
109111

110112
@Test fun testNormalAscii() {
113+
// 10 ASCII chars = 10 bytes, with chunkSize=6: first 6 chars, then 4 chars
111114
val source = "0123456789"
112115
val testResult = source.chunkOnModifiedUtf8ByteSize(chunkSize)
113-
assertEquals(testResult.size, 2)
114-
assertEquals(testResult[0].length, chunkSize)
115-
assertEquals(testResult[1].length, chunkSize)
116+
assertEquals(2, testResult.size)
117+
assertEquals(6, testResult[0].length)
118+
assertEquals(4, testResult[1].length)
116119
assertTrue(compareSourceAndChunked(source, testResult))
117120
assertTrue(testResult.checkModifiedUtf8ByteArraySize(chunkSize))
118121
}
119122

120-
@Test fun testLargeRandomUTf8StringChunk5() {
123+
@Test fun testLargeRandomUTf8StringChunk6() {
121124
val testResult = random1000Utf8Chars.chunkOnModifiedUtf8ByteSize(chunkSize)
122125
assertTrue(compareSourceAndChunked(random1000Utf8Chars, testResult))
123126
assertTrue(testResult.checkModifiedUtf8ByteArraySize(chunkSize))
@@ -135,6 +138,72 @@ class CharSequenceUtf8Test {
135138
assertTrue(testResult.checkModifiedUtf8ByteArraySize(100))
136139
}
137140

141+
@Test fun testSurrogatePairKeptTogether() {
142+
// Emoji 🎉 (U+1F389) is represented as surrogate pair \uD83C\uDF89
143+
// Each surrogate is 3 bytes in Modified UTF-8, so the pair is 6 bytes
144+
val source = "a\uD83C\uDF89b" // "a🎉b"
145+
val testResult = source.chunkOnModifiedUtf8ByteSize(6)
146+
// With chunkSize=6: "a" (1 byte) + surrogate pair (6 bytes) = 7 bytes > 6
147+
// So we expect: ["a", "🎉", "b"] - surrogate pair must stay together
148+
assertEquals(3, testResult.size)
149+
assertEquals("a", testResult[0])
150+
assertEquals("\uD83C\uDF89", testResult[1]) // The emoji as a unit
151+
assertEquals("b", testResult[2])
152+
assertTrue(compareSourceAndChunked(source, testResult))
153+
// Verify each chunk is valid Unicode (no lone surrogates)
154+
testResult.forEach { chunk ->
155+
chunk.toString().toCharArray().forEachIndexed { index, char ->
156+
if (char.isHighSurrogate()) {
157+
assertTrue("High surrogate at end of chunk", index + 1 < chunk.length)
158+
assertTrue("High surrogate not followed by low surrogate", chunk[index + 1].isLowSurrogate())
159+
}
160+
if (char.isLowSurrogate()) {
161+
assertTrue("Low surrogate at start of chunk", index > 0)
162+
assertTrue("Low surrogate not preceded by high surrogate", chunk[index - 1].isHighSurrogate())
163+
}
164+
}
165+
}
166+
}
167+
168+
@Test fun testMultipleSurrogatePairs() {
169+
// Multiple emoji: 🎉🎊 (two surrogate pairs = 12 bytes)
170+
val source = "\uD83C\uDF89\uD83C\uDF8A"
171+
val testResult = source.chunkOnModifiedUtf8ByteSize(6)
172+
// Each pair is 6 bytes, so with chunkSize=6 we get 2 chunks
173+
assertEquals(2, testResult.size)
174+
assertEquals("\uD83C\uDF89", testResult[0])
175+
assertEquals("\uD83C\uDF8A", testResult[1])
176+
assertTrue(compareSourceAndChunked(source, testResult))
177+
}
178+
179+
@Test fun testSurrogatePairFitsWithOtherChars() {
180+
// "ab🎉" = 1 + 1 + 6 = 8 bytes, with chunkSize=8 should fit in one chunk
181+
val source = "ab\uD83C\uDF89"
182+
val testResult = source.chunkOnModifiedUtf8ByteSize(8)
183+
assertEquals(1, testResult.size)
184+
assertEquals(source, testResult[0])
185+
assertTrue(compareSourceAndChunked(source, testResult))
186+
}
187+
188+
@Test fun testSurrogatePairOverflow() {
189+
// "abc🎉" = 3 + 6 = 9 bytes, with chunkSize=8: "abc" then "🎉"
190+
val source = "abc\uD83C\uDF89"
191+
val testResult = source.chunkOnModifiedUtf8ByteSize(8)
192+
assertEquals(2, testResult.size)
193+
assertEquals("abc", testResult[0])
194+
assertEquals("\uD83C\uDF89", testResult[1])
195+
assertTrue(compareSourceAndChunked(source, testResult))
196+
}
197+
198+
@Test fun testSurrogatePairInMiddle() {
199+
// "a🎉b" = 1 + 6 + 1 = 8 bytes, with chunkSize=8 should fit in one chunk
200+
val source = "a\uD83C\uDF89b"
201+
val testResult = source.chunkOnModifiedUtf8ByteSize(8)
202+
assertEquals(1, testResult.size)
203+
assertEquals(source, testResult[0])
204+
assertTrue(compareSourceAndChunked(source, testResult))
205+
}
206+
138207
fun compareSourceAndChunked(
139208
source: CharSequence,
140209
chunked: List<CharSequence>,

0 commit comments

Comments
 (0)