From 9652f6eaa9798d5e5fc7a1b181daff3ce9a40604 Mon Sep 17 00:00:00 2001 From: Android PowerUser <88908510+Android-PowerUser@users.noreply.github.com> Date: Mon, 13 Apr 2026 17:03:11 +0200 Subject: [PATCH 1/4] Fix screenshot permission gating by model support --- .../com/google/ai/sample/GenerativeAiViewModelFactory.kt | 1 + .../ai/sample/feature/multimodal/PhotoReasoningScreen.kt | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/app/src/main/kotlin/com/google/ai/sample/GenerativeAiViewModelFactory.kt b/app/src/main/kotlin/com/google/ai/sample/GenerativeAiViewModelFactory.kt index 092aaff..3801cc6 100644 --- a/app/src/main/kotlin/com/google/ai/sample/GenerativeAiViewModelFactory.kt +++ b/app/src/main/kotlin/com/google/ai/sample/GenerativeAiViewModelFactory.kt @@ -56,6 +56,7 @@ enum class ModelOption( ApiProvider.GOOGLE, "https://huggingface.co/na5h13/gemma-3n-E4B-it-litert-lm/resolve/main/gemma-3n-E4B-it-int4.litertlm?download=true", "4.92 GB", + supportsScreenshot = true, isOfflineModel = true, offlineModelFilename = "gemma-3n-e4b-it-int4.litertlm", offlineRequiredFilenames = listOf("gemma-3n-e4b-it-int4.litertlm") diff --git a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningScreen.kt b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningScreen.kt index 3b60b3b..ca4f9d6 100644 --- a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningScreen.kt +++ b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningScreen.kt @@ -406,9 +406,10 @@ fun PhotoReasoningScreen( return@IconButton } - // Check MediaProjection for all models except offline and human-expert - // Human Expert uses its own MediaProjection for WebRTC, not ScreenCaptureService - if (!isMediaProjectionPermissionGranted && !com.google.ai.sample.GenerativeAiViewModelFactory.getCurrentModel().isOfflineModel && modelName != "human-expert") { + // Check MediaProjection only for models that support screenshots and are not human-expert. + // Human Expert uses its own MediaProjection for WebRTC, not ScreenCaptureService. + val currentModel = com.google.ai.sample.GenerativeAiViewModelFactory.getCurrentModel() + if (!isMediaProjectionPermissionGranted && currentModel.supportsScreenshot && modelName != "human-expert") { mainActivity?.requestMediaProjectionPermission { // This block will be executed after permission is granted if (userQuestion.isNotBlank()) { From 3bd9ca813660f74f8084bd44b6c4104feb6df64a Mon Sep 17 00:00:00 2001 From: Android PowerUser <88908510+Android-PowerUser@users.noreply.github.com> Date: Mon, 13 Apr 2026 17:43:53 +0200 Subject: [PATCH 2/4] Fix takeScreenshot flow for screenshot-capable offline models --- .../google/ai/sample/ScreenOperatorAccessibilityService.kt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/src/main/kotlin/com/google/ai/sample/ScreenOperatorAccessibilityService.kt b/app/src/main/kotlin/com/google/ai/sample/ScreenOperatorAccessibilityService.kt index 6c2616b..7f0d0e8 100644 --- a/app/src/main/kotlin/com/google/ai/sample/ScreenOperatorAccessibilityService.kt +++ b/app/src/main/kotlin/com/google/ai/sample/ScreenOperatorAccessibilityService.kt @@ -234,8 +234,8 @@ class ScreenOperatorAccessibilityService : AccessibilityService() { } is Command.TakeScreenshot -> { val currentModel = GenerativeAiViewModelFactory.getCurrentModel() - if (currentModel.isOfflineModel) { - Log.d(TAG, "Command.TakeScreenshot: Model is offline, capturing screen info only.") + if (!currentModel.supportsScreenshot) { + Log.d(TAG, "Command.TakeScreenshot: Model has no screenshot support, capturing screen info only.") this.showToast("Capturing screen info...", false) val screenInfo = captureScreenInformation() val mainActivity = MainActivity.getInstance() From 3ac988f8ba33001fef00dad4576693f5434fd43e Mon Sep 17 00:00:00 2001 From: Android PowerUser <88908510+Android-PowerUser@users.noreply.github.com> Date: Mon, 13 Apr 2026 19:12:07 +0200 Subject: [PATCH 3/4] Reuse last chat input for auto-screenshot follow-ups --- .../multimodal/PhotoReasoningViewModel.kt | 24 ++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt index d94eef3..bbdda80 100644 --- a/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt +++ b/app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt @@ -2161,7 +2161,29 @@ class PhotoReasoningViewModel( saveChatHistory(context) } - private fun createGenericScreenshotPrompt(): String = "" + private fun createGenericScreenshotPrompt(): String { + val lastUserMessage = _chatState.getAllMessages() + .asReversed() + .firstOrNull { it.participant == PhotoParticipant.USER && it.text.isNotBlank() } + ?.text + ?.trim() + + if (!lastUserMessage.isNullOrBlank()) { + return lastUserMessage + } + + val persistedInput = _userInput.value.trim() + if (persistedInput.isNotBlank()) { + return persistedInput + } + + val lastKnownInput = currentUserInput.trim() + if (lastKnownInput.isNotBlank()) { + return lastKnownInput + } + + return "" + } /** * Update the system message From dbe81520294e3761474da6690f5d7ef8a3632900 Mon Sep 17 00:00:00 2001 From: Android PowerUser <88908510+Android-PowerUser@users.noreply.github.com> Date: Mon, 13 Apr 2026 19:40:23 +0200 Subject: [PATCH 4/4] Adjust model menu ordering, warnings, and payment/system text --- .../google/ai/sample/MainActivityDialogs.kt | 9 ++- .../kotlin/com/google/ai/sample/MenuScreen.kt | 71 ++++++++++++++----- .../sample/util/SystemMessagePreferences.kt | 2 +- 3 files changed, 63 insertions(+), 19 deletions(-) diff --git a/app/src/main/kotlin/com/google/ai/sample/MainActivityDialogs.kt b/app/src/main/kotlin/com/google/ai/sample/MainActivityDialogs.kt index 92a9434..90f5ab2 100644 --- a/app/src/main/kotlin/com/google/ai/sample/MainActivityDialogs.kt +++ b/app/src/main/kotlin/com/google/ai/sample/MainActivityDialogs.kt @@ -15,6 +15,7 @@ import androidx.compose.material3.TextButton import androidx.compose.runtime.Composable import androidx.compose.ui.Alignment import androidx.compose.ui.Modifier +import androidx.compose.ui.graphics.Color import androidx.compose.ui.unit.dp import androidx.compose.ui.window.Dialog @@ -62,9 +63,13 @@ internal fun PaymentMethodDialog( Column { Button( onClick = onPayPalClick, - modifier = Modifier.fillMaxWidth().padding(bottom = 8.dp) + // Do not actually disable this button; keep click behavior enabled. + modifier = Modifier.fillMaxWidth().padding(bottom = 8.dp), + colors = androidx.compose.material3.ButtonDefaults.buttonColors( + containerColor = Color.Gray + ) ) { - Text("PayPal (2,60 €/Month)") + Text("PayPal (2,90 €/Month)") } Button( onClick = onGooglePlayClick, diff --git a/app/src/main/kotlin/com/google/ai/sample/MenuScreen.kt b/app/src/main/kotlin/com/google/ai/sample/MenuScreen.kt index 3bf5247..98051ca 100644 --- a/app/src/main/kotlin/com/google/ai/sample/MenuScreen.kt +++ b/app/src/main/kotlin/com/google/ai/sample/MenuScreen.kt @@ -98,6 +98,14 @@ data class MenuItem( val descriptionResId: Int ) +private val STRIKETHROUGH_MODELS = listOf( + ModelOption.GEMMA_3_27B_IT, + ModelOption.MISTRAL_LARGE_3, + ModelOption.GEMINI_FLASH_LIVE_PREVIEW, + ModelOption.GEMINI_FLASH_LITE_PREVIEW, + ModelOption.QWEN3_5_4B_OFFLINE +) + @Composable fun MenuScreen( innerPadding: PaddingValues, @@ -201,12 +209,33 @@ fun MenuScreen( expanded = expanded, onDismissRequest = { expanded = false } ) { - val orderedModels = ModelOption.values().toList() + val allModels = ModelOption.values().toList() + val vercelModels = allModels.filter { + it.apiProvider == ApiProvider.VERCEL && !STRIKETHROUGH_MODELS.contains(it) + } + val normalModels = allModels.filter { + it != ModelOption.MISTRAL_MEDIUM_3_1 && + it.apiProvider != ApiProvider.VERCEL && + !STRIKETHROUGH_MODELS.contains(it) + } + val orderedModels = listOf(ModelOption.MISTRAL_MEDIUM_3_1) + + normalModels + + vercelModels + + STRIKETHROUGH_MODELS orderedModels.forEach { modelOption -> DropdownMenuItem( text = { - Text(modelOption.displayName + (modelOption.size?.let { " - $it" } ?: "")) + // Do not actually disable these models. They must remain selectable for testing/debug purposes. + val itemTextStyle = if (STRIKETHROUGH_MODELS.contains(modelOption)) { + MaterialTheme.typography.bodyLarge.copy(textDecoration = TextDecoration.LineThrough) + } else { + MaterialTheme.typography.bodyLarge + } + Text( + text = modelOption.displayName + (modelOption.size?.let { " - $it" } ?: ""), + style = itemTextStyle + ) }, onClick = { expanded = false @@ -252,6 +281,26 @@ fun MenuScreen( } } } + + val modelHint = when (selectedModel) { + ModelOption.GEMMA_3_27B_IT -> "Google doesn't support screenshots in the API for this model." + ModelOption.GPT_OSS_120B -> "This is a pure text model\nCerebras sometimes discontinues free access in the Free Tier, displaying an \"Error 404: gpt-oss-120b does not exist or you do not have access to it\" message, or changes the rate limits." + ModelOption.MISTRAL_LARGE_3 -> "Mistral AI rejects requests containing non-black images with a 429 Error: Rate limit exceeded response" + ModelOption.GEMINI_3_FLASH -> "Google often rejects requests to this model with a 503 Model is exhausted error" + ModelOption.PUTER_GLM5 -> "This model is expensive and uses up the free quota quickly. Consider GPT 5.4 nano" + ModelOption.GPT_5_1_CODEX_MAX, + ModelOption.GPT_5_1_CODEX_MINI, + ModelOption.GPT_5_NANO -> "Vercel requires a credit card" + else -> "" + } + if (modelHint.isNotBlank()) { + Spacer(modifier = Modifier.height(8.dp)) + Text( + text = modelHint, + style = MaterialTheme.typography.bodyMedium, + color = MaterialTheme.colorScheme.onSurfaceVariant + ) + } } } } @@ -583,13 +632,7 @@ fun MenuScreen( withStyle(boldStyle) { append("API Keys") } append(" are automatically switched if multiple are inserted and one is exhausted.\n") - append("• ") - withStyle(boldStyle) { append("GPT-oss 120b") } - append(" is a pure text model.\n") - append("• ") - - withStyle(boldStyle) { append("Gemma 27B IT") } - append(" cannot handle screenshots in the API.\n") + append("• Models with a line through them do not work properly.\n") append("• GPT models (") withStyle(boldStyle) { append("Vercel") } append(") have a free budget of \$5 per month and a credit card is necessary.\n") @@ -597,15 +640,11 @@ fun MenuScreen( append("GPT-5.1 mini Input: \$0.25/M Output: \$2.00/M\n") append("GPT-5 nano Input: \$0.05/M Output: \$0.40/M\n") append("• When a language model repeats a token, Top K and Top P must be lowered.\n") - append("• There are ") - withStyle(boldStyle) { append("rate limits") } - append(" for free use of ") - withStyle(boldStyle) { append("Gemini models") } - append(". The less powerful the models are, the more you can use them. The limits range from a maximum of 5 to 30 calls per minute. After each screenshot (every 2-3 seconds) the LLM must respond again. More information is available at ") + append("• Google has recently significantly tightened its rate limits and is fluctuating widely with its free quota. Try it for yourself. More information is available at ") - pushStringAnnotation(tag = "URL", annotation = "https://ai.google.dev/gemini-api/docs/rate-limits") + pushStringAnnotation(tag = "URL", annotation = "https://aistudio.google.com/rate-limit") withStyle(style = SpanStyle(color = MaterialTheme.colorScheme.primary, textDecoration = TextDecoration.Underline)) { - append("https://ai.google.dev/gemini-api/docs/rate-limits") + append("https://aistudio.google.com/rate-limit") } pop() } diff --git a/app/src/main/kotlin/com/google/ai/sample/util/SystemMessagePreferences.kt b/app/src/main/kotlin/com/google/ai/sample/util/SystemMessagePreferences.kt index 71b87ce..d22db99 100644 --- a/app/src/main/kotlin/com/google/ai/sample/util/SystemMessagePreferences.kt +++ b/app/src/main/kotlin/com/google/ai/sample/util/SystemMessagePreferences.kt @@ -14,7 +14,7 @@ object SystemMessagePreferences { private const val KEY_FIRST_START_COMPLETED = "first_start_completed" // New flag // Content from pasted_content.txt - private const val DEFAULT_SYSTEM_MESSAGE_ON_FIRST_START = """You are on an App on a Smartphone. Your app is called Screen Operator. You start from this app. Proceed step by step! DON'T USE TOOL CODE! You must operate the screen with exactly following commands: "home()" "back()" "recentApps()" "openApp("sample")" for buttons and words: "click("sample")" "longClick("sample")" "tapAtCoordinates(x, y)" "tapAtCoordinates(x percent of screen%, y percent of screen%)" "scrollDown()" "scrollUp()" "scrollLeft()" "scrollRight()" "scrollDown(x, y, how much pixel to scroll, duration in milliseconds)" "scrollUp(x, y, how much pixel to scroll, duration in milliseconds)" "scrollLeft(x, y, how much pixel to scroll, duration in milliseconds)" "scrollRight(x, y, how much pixel to scroll, duration in milliseconds)" "scrollDown(x percent of screen%, y percent of screen%, how much percent to scroll%, duration in milliseconds)" "scrollUp(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollLeft(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollRight(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" scroll status bar down: "scrollUp(540, 0, 1100, 50)" "takeScreenshot()" To write text, search and click the textfield thereafter: "writeText("sample text")" You need to write the already existing text, if it should continue exist. If the keyboard is displayed, you can press "Enter()". Otherwise, you have to open the keyboard by clicking on the text field. You can see the screen and get additional Informations about them with: "takeScreenshot()" You need this command at the end of every message until you are finish. When you're done don't say "takeScreenshot()" Your task is:""" + private const val DEFAULT_SYSTEM_MESSAGE_ON_FIRST_START = """You are on an App on a Smartphone. Your app is called Screen Operator. You start from this app. Proceed step by step! DON'T USE TOOL CODE! You must operate the screen with exactly following commands: "home()" "back()" "recentApps()" "openApp("sample")" for buttons and words: "click("sample")" "longClick("sample")" "tapAtCoordinates(x, y)" "tapAtCoordinates(x percent of screen%, y percent of screen%)" "scrollDown()" "scrollUp()" "scrollLeft()" "scrollRight()" "scrollDown(x, y, how much pixel to scroll, duration in milliseconds)" "scrollUp(x, y, how much pixel to scroll, duration in milliseconds)" "scrollLeft(x, y, how much pixel to scroll, duration in milliseconds)" "scrollRight(x, y, how much pixel to scroll, duration in milliseconds)" "scrollDown(x percent of screen%, y percent of screen%, how much percent to scroll%, duration in milliseconds)" "scrollUp(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollLeft(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollRight(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" scroll status bar down: "scrollUp(540, 0, 1100, 50)" "takeScreenshot()" To write text, search and click the textfield thereafter: "writeText("sample text")" You need to write the already existing text, if it should continue exist. If the keyboard is displayed, you can press "Enter()". Otherwise, you have to open the keyboard by clicking on the text field. Don't write the commands if you're just planing about it or messaging me. You can see the screen and get additional Informations about them with: "takeScreenshot()" You need this command at the end of every message until you are finish. When you're done don't say "takeScreenshot()" Your task is:""" private fun prefs(context: Context) = context.getSharedPreferences(PREFS_NAME, Context.MODE_PRIVATE) /**