Skip to content

Commit 23cd45d

Browse files
author
Mateusz
committed
Detect rate limits in first SSE chunk for composite failover.
OpenAI-compatible streams sometimes return HTTP 200 with an error object that uses string codes or types (e.g. rate_limit_exceeded, usage_limit_reached) instead of a numeric status_code 429. The streaming envelope then stayed at 200, so BackendCompletionFlow skipped terminal-error handling and weighted composite rerolls never ran.

Extend first-chunk status inference to classify those payloads as 429, and add regression tests for the extractor and for integrate_streaming_pipeline.

Made-with: Cursor
1 parent 8631f51 commit 23cd45d

2 files changed

Lines changed: 116 additions & 2 deletions

File tree

src/core/ports/streaming_integration.py

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import json
1111
import logging
1212
from collections.abc import AsyncIterator
13-
from typing import cast
13+
from typing import Any, cast
1414

1515
from src.core.common.exceptions import LLMProxyError, RateLimitExceededError
1616
from src.core.domain.responses import StreamingResponseEnvelope
@@ -45,6 +45,53 @@
4545

4646
logger = logging.getLogger(__name__)
4747

48+
# String / type tokens that indicate a rate-limited first SSE frame when HTTP status
49+
# was already 200 (some OpenAI-compatible gateways stream errors as SSE only).
50+
_RATE_LIMIT_ERROR_CODE_TOKENS: frozenset[str] = frozenset(
51+
{
52+
"rate_limit_exceeded",
53+
"too_many_requests",
54+
"requests_per_minute_exceeded",
55+
"rpm_limit_exceeded",
56+
"tpm_limit_exceeded",
57+
"tenant_rate_limited",
58+
"usage_limit_reached",
59+
}
60+
)
61+
62+
63+
def _error_payload_implies_rate_limit(payload: dict[str, Any]) -> bool:
64+
"""Return True when a JSON object (often ``error``) describes a rate limit."""
65+
err_type = payload.get("type")
66+
if isinstance(err_type, str) and err_type.strip():
67+
lowered = err_type.strip().lower()
68+
if lowered in _RATE_LIMIT_ERROR_CODE_TOKENS:
69+
return True
70+
if "rate" in lowered and "limit" in lowered:
71+
return True
72+
73+
code = payload.get("code")
74+
if isinstance(code, int) and code == 429:
75+
return True
76+
if isinstance(code, float) and code.is_integer() and int(code) == 429:
77+
return True
78+
if isinstance(code, str) and code.strip():
79+
lowered = code.strip().lower()
80+
if lowered in _RATE_LIMIT_ERROR_CODE_TOKENS or lowered == "429":
81+
return True
82+
83+
sc = payload.get("status_code")
84+
if isinstance(sc, int) and sc == 429:
85+
return True
86+
if isinstance(sc, float) and sc.is_integer() and int(sc) == 429:
87+
return True
88+
if isinstance(sc, str):
89+
stripped = sc.strip()
90+
if stripped.isdigit() and int(stripped) == 429:
91+
return True
92+
93+
return False
94+
4895

4996
def _try_extract_http_status_from_first_sse_chunk(first_chunk: bytes) -> int | None:
5097
"""Best-effort: extract HTTP-like status from an SSE error chunk.
@@ -56,6 +103,7 @@ def _try_extract_http_status_from_first_sse_chunk(first_chunk: bytes) -> int | N
56103
Expected formats:
57104
- data: {"choices": [...], "error": {"status_code": 404, ...}}
58105
- data: {"choices": [{"finish_reason": "error"}], ...}
106+
- data: {"error": {"code": "rate_limit_exceeded", ...}} (no numeric status)
59107
"""
60108
try:
61109
text = first_chunk.decode("utf-8", errors="ignore")
@@ -91,6 +139,14 @@ def _try_extract_http_status_from_first_sse_chunk(first_chunk: bytes) -> int | N
91139
return code
92140
if isinstance(code, float) and code.is_integer():
93141
return int(code)
142+
if _error_payload_implies_rate_limit(err):
143+
return 429
144+
elif isinstance(err, str) and "rate" in err.lower() and "limit" in err.lower():
145+
return 429
146+
147+
if _error_payload_implies_rate_limit(obj):
148+
return 429
149+
94150
# Fallback: if it looks like an OpenAI error chunk, treat as 500.
95151
choices = obj.get("choices")
96152
if isinstance(choices, list) and choices:

tests/unit/core/ports/test_streaming_error_propagation.py

Lines changed: 59 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,10 @@
1717
StreamingErrorMapper,
1818
handle_streaming_error,
1919
)
20-
from src.core.ports.streaming_integration import integrate_streaming_pipeline
20+
from src.core.ports.streaming_integration import (
21+
_try_extract_http_status_from_first_sse_chunk,
22+
integrate_streaming_pipeline,
23+
)
2124

2225

2326
class TestStreamingContentErrorChunks:
@@ -299,6 +302,61 @@ async def failing_raw_stream():
299302
headers = cast(dict[str, Any], detail["headers"])
300303
assert headers["retry-after"] == "7"
301304

305+
def test_extract_status_from_first_sse_string_rate_limit_code(self) -> None:
    """String error.code (no numeric status) must classify as HTTP 429 for failover."""
    frame = (
        'data: {"error":{"type":"rate_limit_exceeded",'
        '"code":"rate_limit_exceeded","message":"RPM"}}\n\n'
    ).encode()
    assert _try_extract_http_status_from_first_sse_chunk(frame) == 429
312+
313+
def test_extract_status_from_first_sse_usage_limit_reached_type(self) -> None:
    """Codex-style usage_limit_reached must map to 429 for downstream recovery."""
    frame = (
        'data: {"error":{"type":"usage_limit_reached",'
        '"message":"The usage limit has been reached"}}\n\n'
    ).encode()
    assert _try_extract_http_status_from_first_sse_chunk(frame) == 429
320+
321+
def test_extract_status_from_first_sse_string_status_code_429(self) -> None:
    """A digit-string status_code ("429") in the first SSE frame maps to 429."""
    raw = b'data: {"error":{"status_code":"429","message":"slow down"}}\n\n'
    assert _try_extract_http_status_from_first_sse_chunk(raw) == 429
324+
325+
@pytest.mark.asyncio
async def test_integrate_streaming_pipeline_first_sse_string_rate_limit_status_429(
    self, monkeypatch: pytest.MonkeyPatch
) -> None:
    """When the first SSE frame is a string-coded rate limit, envelope HTTP status is 429."""

    # First (and only) frame: the transport reported success, but the payload
    # carries a string error code with no numeric status at all.
    rate_chunk = (
        b'data: {"error":{"code":"rate_limit_exceeded","message":"RPM"}}\n\n'
    )

    # Minimal stand-in for the real processing pipeline: emits exactly one chunk.
    class _Pipeline:
        async def process_stream(self, *args, **kwargs):
            yield rate_chunk

    # Force integrate_streaming_pipeline to use the stub pipeline above.
    monkeypatch.setattr(
        "src.core.ports.streaming_integration.create_pipeline_for_provider",
        lambda *args, **kwargs: _Pipeline(),
    )

    # Empty async generator: the `if False` keeps the `yield`, making this an
    # async generator function that produces no items.
    async def raw_stream():
        if False:
            yield b""

    envelope = await integrate_streaming_pipeline(
        raw_stream(),
        provider="openai",
        stream_id="sse-string-rl",
        enable_loop_detection=False,
        enable_tool_call_repair=False,
        enable_think_tags=False,
    )

    # The status must be inferred from the frame contents, not the transport.
    assert envelope.status_code == 429
    assert envelope.content is not None
302360
@pytest.mark.asyncio
303361
async def test_integrate_streaming_pipeline_maps_early_429(
304362
self, monkeypatch

0 commit comments

Comments
 (0)