Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ enum class ModelOption(
ApiProvider.GOOGLE,
"https://huggingface.co/na5h13/gemma-3n-E4B-it-litert-lm/resolve/main/gemma-3n-E4B-it-int4.litertlm?download=true",
"4.92 GB",
supportsScreenshot = true,
isOfflineModel = true,
offlineModelFilename = "gemma-3n-e4b-it-int4.litertlm",
offlineRequiredFilenames = listOf("gemma-3n-e4b-it-int4.litertlm")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import androidx.compose.material3.TextButton
import androidx.compose.runtime.Composable
import androidx.compose.ui.Alignment
import androidx.compose.ui.Modifier
import androidx.compose.ui.graphics.Color
import androidx.compose.ui.unit.dp
import androidx.compose.ui.window.Dialog

Expand Down Expand Up @@ -62,9 +63,13 @@ internal fun PaymentMethodDialog(
Column {
Button(
onClick = onPayPalClick,
modifier = Modifier.fillMaxWidth().padding(bottom = 8.dp)
// Do not actually disable this button; keep click behavior enabled.
modifier = Modifier.fillMaxWidth().padding(bottom = 8.dp),
colors = androidx.compose.material3.ButtonDefaults.buttonColors(
containerColor = Color.Gray
)
) {
Text("PayPal (2,60 €/Month)")
Text("PayPal (2,90 €/Month)")
}
Button(
onClick = onGooglePlayClick,
Expand Down
71 changes: 55 additions & 16 deletions app/src/main/kotlin/com/google/ai/sample/MenuScreen.kt
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,14 @@ data class MenuItem(
val descriptionResId: Int
)

/**
 * Models whose dropdown label is drawn with a line-through text decoration.
 *
 * This list only affects how the entries are rendered and where they are
 * placed in the model dropdown; the entries themselves stay clickable.
 */
private val STRIKETHROUGH_MODELS: List<ModelOption> = listOf(
    ModelOption.GEMMA_3_27B_IT,
    ModelOption.MISTRAL_LARGE_3,
    ModelOption.GEMINI_FLASH_LIVE_PREVIEW,
    ModelOption.GEMINI_FLASH_LITE_PREVIEW,
    ModelOption.QWEN3_5_4B_OFFLINE,
)

@Composable
fun MenuScreen(
innerPadding: PaddingValues,
Expand Down Expand Up @@ -201,12 +209,33 @@ fun MenuScreen(
expanded = expanded,
onDismissRequest = { expanded = false }
) {
val orderedModels = ModelOption.values().toList()
val allModels = ModelOption.values().toList()
val vercelModels = allModels.filter {
it.apiProvider == ApiProvider.VERCEL && !STRIKETHROUGH_MODELS.contains(it)
}
val normalModels = allModels.filter {
it != ModelOption.MISTRAL_MEDIUM_3_1 &&
it.apiProvider != ApiProvider.VERCEL &&
!STRIKETHROUGH_MODELS.contains(it)
}
val orderedModels = listOf(ModelOption.MISTRAL_MEDIUM_3_1) +
normalModels +
vercelModels +
STRIKETHROUGH_MODELS

orderedModels.forEach { modelOption ->
DropdownMenuItem(
text = {
Text(modelOption.displayName + (modelOption.size?.let { " - $it" } ?: ""))
// Do not actually disable these models. They must remain selectable for testing/debug purposes.
val itemTextStyle = if (STRIKETHROUGH_MODELS.contains(modelOption)) {
MaterialTheme.typography.bodyLarge.copy(textDecoration = TextDecoration.LineThrough)
} else {
MaterialTheme.typography.bodyLarge
}
Text(
text = modelOption.displayName + (modelOption.size?.let { " - $it" } ?: ""),
style = itemTextStyle
)
},
onClick = {
expanded = false
Expand Down Expand Up @@ -252,6 +281,26 @@ fun MenuScreen(
}
}
}

val modelHint = when (selectedModel) {
ModelOption.GEMMA_3_27B_IT -> "Google doesn't support screenshots in the API for this model."
ModelOption.GPT_OSS_120B -> "This is a pure text model\nCerebras sometimes discontinues free access in the Free Tier, displaying an \"Error 404: gpt-oss-120b does not exist or you do not have access to it\" message, or changes the rate limits."
ModelOption.MISTRAL_LARGE_3 -> "Mistral AI rejects requests containing non-black images with a 429 Error: Rate limit exceeded response"
ModelOption.GEMINI_3_FLASH -> "Google often rejects requests to this model with a 503 Model is exhausted error"
ModelOption.PUTER_GLM5 -> "This model is expensive and uses up the free quota quickly. Consider GPT 5.4 nano"
ModelOption.GPT_5_1_CODEX_MAX,
ModelOption.GPT_5_1_CODEX_MINI,
ModelOption.GPT_5_NANO -> "Vercel requires a credit card"
else -> ""
}
if (modelHint.isNotBlank()) {
Spacer(modifier = Modifier.height(8.dp))
Text(
text = modelHint,
style = MaterialTheme.typography.bodyMedium,
color = MaterialTheme.colorScheme.onSurfaceVariant
)
}
}
}
}
Expand Down Expand Up @@ -583,29 +632,19 @@ fun MenuScreen(
withStyle(boldStyle) { append("API Keys") }
append(" are automatically switched if multiple are inserted and one is exhausted.\n")

append("• ")
withStyle(boldStyle) { append("GPT-oss 120b") }
append(" is a pure text model.\n")
append("• ")

withStyle(boldStyle) { append("Gemma 27B IT") }
append(" cannot handle screenshots in the API.\n")
append("• Models with a line through them do not work properly.\n")
append("• GPT models (")
withStyle(boldStyle) { append("Vercel") }
append(") have a free budget of \$5 per month and a credit card is necessary.\n")
append("GPT-5.1 Input: \$1.25/M Output: \$10.00/M\n")
append("GPT-5.1 mini Input: \$0.25/M Output: \$2.00/M\n")
append("GPT-5 nano Input: \$0.05/M Output: \$0.40/M\n")
append("• When a language model repeats a token, Top K and Top P must be lowered.\n")
append("• There are ")
withStyle(boldStyle) { append("rate limits") }
append(" for free use of ")
withStyle(boldStyle) { append("Gemini models") }
append(". The less powerful the models are, the more you can use them. The limits range from a maximum of 5 to 30 calls per minute. After each screenshot (every 2-3 seconds) the LLM must respond again. More information is available at ")
append("• Google has recently significantly tightened its rate limits and is fluctuating widely with its free quota. Try it for yourself. More information is available at ")

pushStringAnnotation(tag = "URL", annotation = "https://ai.google.dev/gemini-api/docs/rate-limits")
pushStringAnnotation(tag = "URL", annotation = "https://aistudio.google.com/rate-limit")
withStyle(style = SpanStyle(color = MaterialTheme.colorScheme.primary, textDecoration = TextDecoration.Underline)) {
append("https://ai.google.dev/gemini-api/docs/rate-limits")
append("https://aistudio.google.com/rate-limit")
}
pop()
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -234,8 +234,8 @@ class ScreenOperatorAccessibilityService : AccessibilityService() {
}
is Command.TakeScreenshot -> {
val currentModel = GenerativeAiViewModelFactory.getCurrentModel()
if (currentModel.isOfflineModel) {
Log.d(TAG, "Command.TakeScreenshot: Model is offline, capturing screen info only.")
if (!currentModel.supportsScreenshot) {
Log.d(TAG, "Command.TakeScreenshot: Model has no screenshot support, capturing screen info only.")
this.showToast("Capturing screen info...", false)
val screenInfo = captureScreenInformation()
val mainActivity = MainActivity.getInstance()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -406,9 +406,10 @@ fun PhotoReasoningScreen(
return@IconButton
}

// Check MediaProjection for all models except offline and human-expert
// Human Expert uses its own MediaProjection for WebRTC, not ScreenCaptureService
if (!isMediaProjectionPermissionGranted && !com.google.ai.sample.GenerativeAiViewModelFactory.getCurrentModel().isOfflineModel && modelName != "human-expert") {
// Check MediaProjection only for models that support screenshots and are not human-expert.
// Human Expert uses its own MediaProjection for WebRTC, not ScreenCaptureService.
val currentModel = com.google.ai.sample.GenerativeAiViewModelFactory.getCurrentModel()
if (!isMediaProjectionPermissionGranted && currentModel.supportsScreenshot && modelName != "human-expert") {
mainActivity?.requestMediaProjectionPermission {
// This block will be executed after permission is granted
if (userQuestion.isNotBlank()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2161,7 +2161,29 @@ class PhotoReasoningViewModel(
saveChatHistory(context)
}

private fun createGenericScreenshotPrompt(): String = ""
/**
 * Picks the prompt text to send along with a generic screenshot.
 *
 * Candidates are tried in this order, and the first non-blank one wins:
 * 1. the text of the most recent non-blank USER message in the chat state,
 * 2. the current value of `_userInput`,
 * 3. `currentUserInput`.
 *
 * Each candidate is trimmed before it is returned.
 *
 * @return the trimmed prompt text, or an empty string if every candidate is blank.
 */
private fun createGenericScreenshotPrompt(): String {
    // Scan from the newest message backwards for a user message that has text.
    val latestUserText = _chatState.getAllMessages()
        .lastOrNull { message ->
            message.participant == PhotoParticipant.USER && message.text.isNotBlank()
        }
        ?.text
        ?.trim()

    // The elvis chain keeps the fallbacks lazy: a later source is only read
    // when every earlier one turned out to be blank.
    return latestUserText?.takeIf { it.isNotBlank() }
        ?: _userInput.value.trim().takeIf { it.isNotBlank() }
        ?: currentUserInput.trim().takeIf { it.isNotBlank() }
        ?: ""
}

/**
* Update the system message
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ object SystemMessagePreferences {
private const val KEY_FIRST_START_COMPLETED = "first_start_completed" // New flag

// Content from pasted_content.txt
private const val DEFAULT_SYSTEM_MESSAGE_ON_FIRST_START = """You are on an App on a Smartphone. Your app is called Screen Operator. You start from this app. Proceed step by step! DON'T USE TOOL CODE! You must operate the screen with exactly following commands: "home()" "back()" "recentApps()" "openApp("sample")" for buttons and words: "click("sample")" "longClick("sample")" "tapAtCoordinates(x, y)" "tapAtCoordinates(x percent of screen%, y percent of screen%)" "scrollDown()" "scrollUp()" "scrollLeft()" "scrollRight()" "scrollDown(x, y, how much pixel to scroll, duration in milliseconds)" "scrollUp(x, y, how much pixel to scroll, duration in milliseconds)" "scrollLeft(x, y, how much pixel to scroll, duration in milliseconds)" "scrollRight(x, y, how much pixel to scroll, duration in milliseconds)" "scrollDown(x percent of screen%, y percent of screen%, how much percent to scroll%, duration in milliseconds)" "scrollUp(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollLeft(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollRight(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" scroll status bar down: "scrollUp(540, 0, 1100, 50)" "takeScreenshot()" To write text, search and click the textfield thereafter: "writeText("sample text")" You need to write the already existing text, if it should continue exist. If the keyboard is displayed, you can press "Enter()". Otherwise, you have to open the keyboard by clicking on the text field. You can see the screen and get additional Informations about them with: "takeScreenshot()" You need this command at the end of every message until you are finish. When you're done don't say "takeScreenshot()" Your task is:"""
private const val DEFAULT_SYSTEM_MESSAGE_ON_FIRST_START = """You are on an App on a Smartphone. Your app is called Screen Operator. You start from this app. Proceed step by step! DON'T USE TOOL CODE! You must operate the screen with exactly following commands: "home()" "back()" "recentApps()" "openApp("sample")" for buttons and words: "click("sample")" "longClick("sample")" "tapAtCoordinates(x, y)" "tapAtCoordinates(x percent of screen%, y percent of screen%)" "scrollDown()" "scrollUp()" "scrollLeft()" "scrollRight()" "scrollDown(x, y, how much pixel to scroll, duration in milliseconds)" "scrollUp(x, y, how much pixel to scroll, duration in milliseconds)" "scrollLeft(x, y, how much pixel to scroll, duration in milliseconds)" "scrollRight(x, y, how much pixel to scroll, duration in milliseconds)" "scrollDown(x percent of screen%, y percent of screen%, how much percent to scroll%, duration in milliseconds)" "scrollUp(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollLeft(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" "scrollRight(x percent of screen%, y percent of screen%, how much percent to scroll, duration in milliseconds)" scroll status bar down: "scrollUp(540, 0, 1100, 50)" "takeScreenshot()" To write text, search and click the textfield thereafter: "writeText("sample text")" You need to write the already existing text, if it should continue exist. If the keyboard is displayed, you can press "Enter()". Otherwise, you have to open the keyboard by clicking on the text field. Don't write the commands if you're just planing about it or messaging me. You can see the screen and get additional Informations about them with: "takeScreenshot()" You need this command at the end of every message until you are finish. When you're done don't say "takeScreenshot()" Your task is:"""
/**
 * Opens the shared preferences file named by [PREFS_NAME] in
 * [Context.MODE_PRIVATE] using the given [context].
 */
private fun prefs(context: Context) =
    context.getSharedPreferences(PREFS_NAME, Context.MODE_PRIVATE)

/**
Expand Down
Loading