@@ -170,7 +170,7 @@ class GemmaEngine(private val context: Context) {
170170 continuation.resume(" (・_・ヾ I... have no words" )
171171 }
172172 } else {
173- val cleaned = truncateRepetition(response)
173+ val cleaned = truncateRepetition(decodeHexTokens( response) )
174174 Timber .d(" Response complete: ${cleaned.take(50 )} ..." )
175175 if (continuation.isActive) {
176176 continuation.resume(cleaned)
@@ -252,7 +252,7 @@ class GemmaEngine(private val context: Context) {
252252 }
253253
254254 override fun onDone () {
255- onComplete(truncateRepetition(fullResponse))
255+ onComplete(truncateRepetition(decodeHexTokens( fullResponse) ))
256256 }
257257
258258 override fun onError (throwable : Throwable ) {
@@ -406,7 +406,7 @@ class GemmaEngine(private val context: Context) {
406406 if (continuation.isActive) {
407407 continuation.resume(
408408 if (response.isBlank()) " (empty response)"
409- else truncateRepetition(response)
409+ else truncateRepetition(decodeHexTokens( response) )
410410 )
411411 }
412412 }
@@ -435,6 +435,25 @@ class GemmaEngine(private val context: Context) {
435435 }
436436 }
437437
/**
 * Decodes runs of literal byte tokens (e.g. `<0xF0><0x9F><0x98><0x80>`) into the text they encode.
 *
 * Works around LiteRT-LM's detokenizer emitting raw byte placeholders instead of emoji or
 * other non-ASCII characters. Consecutive tokens are decoded together as one UTF-8 byte
 * sequence, so multi-byte characters split across several tokens are reassembled.
 * Bytes that do not form valid UTF-8 (for example a run cut off mid-character) are decoded
 * leniently by the JVM and show up as U+FFFD replacement characters.
 *
 * @param response raw model output, possibly containing `<0xHH>` tokens.
 * @return [response] with every run of byte tokens replaced by its decoded text; all other
 *   text is returned unchanged.
 */
private fun decodeHexTokens(response: String): String {
    // Every token has the fixed shape "<0xHH>": 6 characters, hex digits at offsets 3 and 4.
    val tokenLength = 6
    val hexStart = 3
    val hexEnd = 5
    val byteTokenRun = """(?:<0x[0-9A-Fa-f]{2}>)+""".toRegex()

    return byteTokenRun.replace(response) { match ->
        val run = match.value
        // The regex guarantees run.length is a multiple of tokenLength and that each
        // slice is exactly two hex digits, so toInt(16) cannot fail here.
        val bytes = ByteArray(run.length / tokenLength) { index ->
            val offset = index * tokenLength
            run.substring(offset + hexStart, offset + hexEnd).toInt(16).toByte()
        }
        String(bytes, Charsets.UTF_8)
    }
}
456+
438457 fun cleanup () {
439458 runCatching {
440459 synchronized(sessionLock) {
0 commit comments