Skip to content

Commit 6229b1f

Browse files
Add adaptive Mistral backoff and raise retry budget
1 parent 9e66fa7 commit 6229b1f

3 files changed

Lines changed: 226 additions & 13 deletions

File tree

app/src/main/kotlin/com/google/ai/sample/feature/multimodal/PhotoReasoningViewModel.kt

Lines changed: 50 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ import com.google.ai.sample.webrtc.WebRTCSender
7171
import com.google.ai.sample.webrtc.SignalingClient
7272
import org.webrtc.IceCandidate
7373
import kotlin.math.max
74+
import kotlin.math.roundToLong
7475

7576
class PhotoReasoningViewModel(
7677
application: Application,
@@ -183,11 +184,11 @@ class PhotoReasoningViewModel(
183184
// to avoid re-executing already-executed commands
184185
private var incrementalCommandCount = 0
185186

186-
// Mistral rate limiting per API key (1.1 seconds between requests with same key)
187+
// Mistral rate limiting per API key (1.5 seconds between requests with same key)
187188
private val mistralNextAllowedRequestAtMsByKey = mutableMapOf<String, Long>()
188189
private var lastMistralTokenTimeMs = 0L
189190
private var lastMistralTokenKey: String? = null
190-
private val MISTRAL_MIN_INTERVAL_MS = 1100L
191+
private val MISTRAL_MIN_INTERVAL_MS = 1500L
191192

192193
// Accumulated full text during streaming for incremental command parsing
193194
private var streamingAccumulatedText = StringBuilder()
@@ -609,6 +610,7 @@ class PhotoReasoningViewModel(
609610
val currentModel = com.google.ai.sample.GenerativeAiViewModelFactory.getCurrentModel()
610611

611612
clearStaleErrorState()
613+
stopExecutionFlag.set(false)
612614

613615
// Check for Human Expert model
614616
if (currentModel == ModelOption.HUMAN_EXPERT) {
@@ -1139,11 +1141,36 @@ private fun reasonWithMistral(
11391141
mistralNextAllowedRequestAtMsByKey[key] = max(existing, nextAllowedAt)
11401142
}
11411143

1144+
fun markKeyCooldown(key: String, referenceTimeMs: Long, extraDelayMs: Long) {
1145+
val normalizedExtraDelay = extraDelayMs.coerceAtLeast(0L)
1146+
val nextAllowedAt = referenceTimeMs + max(MISTRAL_MIN_INTERVAL_MS, normalizedExtraDelay)
1147+
val existing = mistralNextAllowedRequestAtMsByKey[key] ?: 0L
1148+
mistralNextAllowedRequestAtMsByKey[key] = max(existing, nextAllowedAt)
1149+
}
1150+
11421151
fun remainingWaitForKeyMs(key: String, nowMs: Long): Long {
11431152
val nextAllowedAt = mistralNextAllowedRequestAtMsByKey[key] ?: 0L
11441153
return (nextAllowedAt - nowMs).coerceAtLeast(0L)
11451154
}
11461155

1156+
fun parseRetryAfterMs(headerValue: String?): Long? {
1157+
if (headerValue.isNullOrBlank()) return null
1158+
val seconds = headerValue.trim().toDoubleOrNull() ?: return null
1159+
return (seconds * 1000.0).roundToLong().coerceAtLeast(0L)
1160+
}
1161+
1162+
fun parseRateLimitResetDelayMs(response: okhttp3.Response, nowMs: Long): Long? {
1163+
val resetHeader = response.header("x-ratelimit-reset") ?: return null
1164+
val resetEpochSeconds = resetHeader.trim().toLongOrNull() ?: return null
1165+
val resetMs = resetEpochSeconds * 1000L
1166+
return (resetMs - nowMs).coerceAtLeast(0L)
1167+
}
1168+
1169+
fun adaptiveRetryDelayMs(failureCount: Int): Long {
1170+
val cappedExponent = (failureCount - 1).coerceIn(0, 5)
1171+
return 1000L shl cappedExponent // 1s, 2s, 4s, 8s, 16s, 32s
1172+
}
1173+
11471174
fun isRetryableMistralFailure(code: Int): Boolean {
11481175
return code == 429 || code >= 500
11491176
}
@@ -1153,7 +1180,7 @@ private fun reasonWithMistral(
11531180
var consecutiveFailures = 0
11541181
var blockedKeysThisRound = mutableSetOf<String>()
11551182

1156-
val maxAttempts = availableKeys.size * 2 + 3 // Allow cycling through all keys at least twice
1183+
val maxAttempts = availableKeys.size * 4 + 8
11571184
while (response == null && consecutiveFailures < maxAttempts) {
11581185
if (stopExecutionFlag.get()) break
11591186

@@ -1175,7 +1202,10 @@ private fun reasonWithMistral(
11751202
try {
11761203
val attemptResponse = client.newCall(buildRequest(selectedKey)).execute()
11771204
val requestEndMs = System.currentTimeMillis()
1178-
markKeyCooldown(selectedKey, requestEndMs)
1205+
val retryAfterMs = parseRetryAfterMs(attemptResponse.header("Retry-After"))
1206+
val resetDelayMs = parseRateLimitResetDelayMs(attemptResponse, requestEndMs)
1207+
val serverRequestedDelayMs = max(retryAfterMs ?: 0L, resetDelayMs ?: 0L)
1208+
markKeyCooldown(selectedKey, requestEndMs, serverRequestedDelayMs)
11791209

11801210
if (attemptResponse.isSuccessful) {
11811211
response = attemptResponse
@@ -1192,39 +1222,46 @@ private fun reasonWithMistral(
11921222
attemptResponse.close()
11931223
blockedKeysThisRound.add(selectedKey)
11941224
consecutiveFailures++
1225+
val adaptiveDelay = adaptiveRetryDelayMs(consecutiveFailures)
1226+
markKeyCooldown(
1227+
selectedKey,
1228+
requestEndMs,
1229+
max(serverRequestedDelayMs, adaptiveDelay)
1230+
)
11951231
withContext(Dispatchers.Main) {
11961232
replaceAiMessageText(
1197-
"Mistral temporär nicht verfügbar (Versuch $consecutiveFailures/$maxAttempts). Wiederhole...",
1233+
"Mistral temporär nicht verfügbar (Versuch $consecutiveFailures/$maxAttempts). Warte auf Server-Rate-Limit und wiederhole...",
11981234
isPending = true
11991235
)
12001236
}
12011237
} catch (e: IOException) {
12021238
val requestEndMs = System.currentTimeMillis()
1203-
markKeyCooldown(selectedKey, requestEndMs)
1239+
val adaptiveDelay = adaptiveRetryDelayMs(consecutiveFailures + 1)
1240+
markKeyCooldown(selectedKey, requestEndMs, adaptiveDelay)
12041241
blockedKeysThisRound.add(selectedKey)
12051242
consecutiveFailures++
1206-
if (consecutiveFailures >= 5) {
1207-
throw IOException("Mistral request failed after 5 attempts: ${e.message}", e)
1243+
if (consecutiveFailures >= maxAttempts) {
1244+
throw IOException("Mistral request failed after $maxAttempts attempts: ${e.message}", e)
12081245
}
12091246
withContext(Dispatchers.Main) {
12101247
replaceAiMessageText(
1211-
if (consecutiveFailures >= maxAttempts) {
1212-
throw IOException("Mistral request failed after $maxAttempts attempts: ${e.message}", e)
1248+
"Mistral Netzwerkfehler (Versuch $consecutiveFailures/$maxAttempts). Wiederhole...",
1249+
isPending = true
12131250
)
12141251
}
12151252
}
1216-
"Mistral Netzwerkfehler (Versuch $consecutiveFailures/$maxAttempts). Wiederhole...",
1253+
}
12171254

12181255
if (stopExecutionFlag.get()) {
12191256
throw IOException("Mistral request aborted.")
12201257
}
12211258

1222-
val finalResponse = response ?: throw IOException("Mistral request failed after 5 attempts.")
1259+
val finalResponse = response ?: throw IOException("Mistral request failed after $maxAttempts attempts.")
12231260

12241261
if (!finalResponse.isSuccessful) {
12251262
val errBody = finalResponse.body?.string()
12261263
finalResponse.close()
1227-
val finalResponse = response ?: throw IOException("Mistral request failed after $maxAttempts attempts.")
1264+
throw IOException("Mistral Error ${finalResponse.code}: $errBody")
12281265
}
12291266

12301267
val body = finalResponse.body ?: throw IOException("Empty response body from Mistral")

scripts/mistral_cooldown_probe.py

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
#!/usr/bin/env python3
2+
import json
3+
import subprocess
4+
import time
5+
from typing import Tuple, List
6+
7+
MISTRAL_API_KEY = "[REDACTED — a real API key was committed here; it must be rotated and loaded from an environment variable (e.g. os.environ['MISTRAL_API_KEY']) instead of being hardcoded]"
8+
MISTRAL_MODEL = "mistral-large-latest"
9+
MISTRAL_ENDPOINT = "https://api.mistral.ai/v1/chat/completions"
10+
11+
12+
def now_ms() -> int:
13+
return int(time.time() * 1000)
14+
15+
16+
def curl_chat(payload: dict, stream: bool) -> Tuple[int, int, int]:
17+
"""
18+
Returns: (http_code, request_started_ms, last_token_ms_or_response_end_ms)
19+
For non-stream requests, 3rd value is response-end timestamp.
20+
"""
21+
request_started = now_ms()
22+
cmd = [
23+
"curl",
24+
"-sS",
25+
"-X",
26+
"POST",
27+
MISTRAL_ENDPOINT,
28+
"-H",
29+
"Content-Type: application/json",
30+
"-H",
31+
f"Authorization: Bearer {MISTRAL_API_KEY}",
32+
"--data-binary",
33+
json.dumps(payload),
34+
"-w",
35+
"\nHTTP_STATUS:%{http_code}\n",
36+
]
37+
if stream:
38+
cmd.insert(1, "-N")
39+
40+
proc = subprocess.Popen(
41+
cmd,
42+
stdout=subprocess.PIPE,
43+
stderr=subprocess.STDOUT,
44+
text=True,
45+
bufsize=1,
46+
)
47+
48+
last_token_ms = request_started
49+
http_code = 0
50+
assert proc.stdout is not None
51+
for line in proc.stdout:
52+
line = line.rstrip("\n")
53+
if line.startswith("data:"):
54+
data = line[5:].strip()
55+
if data and data != "[DONE]":
56+
last_token_ms = now_ms()
57+
elif line.startswith("HTTP_STATUS:"):
58+
try:
59+
http_code = int(line.split(":", 1)[1].strip())
60+
except ValueError:
61+
http_code = 0
62+
63+
exit_code = proc.wait()
64+
if exit_code != 0:
65+
raise RuntimeError(f"curl failed with exit code {exit_code}")
66+
67+
if not stream:
68+
last_token_ms = now_ms()
69+
return http_code, request_started, last_token_ms
70+
71+
72+
def sleep_until(target_ms: int) -> None:
73+
remaining = target_ms - now_ms()
74+
if remaining > 0:
75+
time.sleep(remaining / 1000.0)
76+
77+
78+
def probe_last_token_mode(delays: List[int]) -> None:
79+
print("=== PROBE: ab_letztem_token ===")
80+
min_success = None
81+
for delay in delays:
82+
stream_payload = {
83+
"model": MISTRAL_MODEL,
84+
"messages": [{"role": "user", "content": "Sag nur OK."}],
85+
"max_tokens": 32,
86+
"stream": True,
87+
}
88+
code, _, last_token = curl_chat(stream_payload, stream=True)
89+
if code != 200:
90+
print(f"baseline_stream_failed http={code}")
91+
continue
92+
93+
sleep_until(last_token + delay)
94+
probe_payload = {
95+
"model": MISTRAL_MODEL,
96+
"messages": [{"role": "user", "content": "OK?"}],
97+
"max_tokens": 1,
98+
"stream": False,
99+
}
100+
probe_code, _, _ = curl_chat(probe_payload, stream=False)
101+
print(f"delay={delay}ms http={probe_code}")
102+
if min_success is None and probe_code == 200:
103+
min_success = delay
104+
print(f"min_success_delay_ms={min_success}")
105+
print()
106+
107+
108+
def probe_request_start_mode(delays: List[int]) -> None:
109+
print("=== PROBE: ab_request_start ===")
110+
min_success = None
111+
for delay in delays:
112+
baseline_payload = {
113+
"model": MISTRAL_MODEL,
114+
"messages": [{"role": "user", "content": "Sag nur OK."}],
115+
"max_tokens": 32,
116+
"stream": True,
117+
}
118+
request_started = now_ms()
119+
baseline_cmd = [
120+
"curl",
121+
"-sS",
122+
"-N",
123+
"-X",
124+
"POST",
125+
MISTRAL_ENDPOINT,
126+
"-H",
127+
"Content-Type: application/json",
128+
"-H",
129+
f"Authorization: Bearer {MISTRAL_API_KEY}",
130+
"--data-binary",
131+
json.dumps(baseline_payload),
132+
"-w",
133+
"\nHTTP_STATUS:%{http_code}\n",
134+
]
135+
baseline_proc = subprocess.Popen(
136+
baseline_cmd,
137+
stdout=subprocess.PIPE,
138+
stderr=subprocess.STDOUT,
139+
text=True,
140+
bufsize=1,
141+
)
142+
143+
sleep_until(request_started + delay)
144+
probe_payload = {
145+
"model": MISTRAL_MODEL,
146+
"messages": [{"role": "user", "content": "OK?"}],
147+
"max_tokens": 1,
148+
"stream": False,
149+
}
150+
probe_code, _, _ = curl_chat(probe_payload, stream=False)
151+
print(f"delay={delay}ms http={probe_code}")
152+
if min_success is None and probe_code == 200:
153+
min_success = delay
154+
155+
baseline_output, _ = baseline_proc.communicate()
156+
baseline_status = 0
157+
for line in baseline_output.splitlines():
158+
if line.startswith("HTTP_STATUS:"):
159+
try:
160+
baseline_status = int(line.split(":", 1)[1].strip())
161+
except ValueError:
162+
baseline_status = 0
163+
if baseline_status != 200:
164+
print(f"baseline_stream_failed http={baseline_status}")
165+
print(f"min_success_delay_ms={min_success}")
166+
print()
167+
168+
169+
if __name__ == "__main__":
170+
step_delays = list(range(100, 3001, 100))
171+
probe_last_token_mode(step_delays)
172+
probe_request_start_mode(step_delays)

scripts/mistral_cooldown_probe.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#!/usr/bin/env bash
2+
set -euo pipefail
3+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
4+
exec python3 "$SCRIPT_DIR/mistral_cooldown_probe.py"

0 commit comments

Comments (0)