Skip to content

Commit 3690470

Browse files
author
Mateusz
committed
fix(codex): keep large http_full_replay off the event loop
- Build continuation fingerprints in a worker thread (record_turn).
- Skip full json.dumps for oversized input/tools in INFO request logs.
- Add dedicated regression tests for the two stall paths.

Made-with: Cursor
1 parent 36775d4 commit 3690470

3 files changed

Lines changed: 146 additions & 16 deletions

File tree

src/connectors/openai_codex/continuation.py

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,20 @@ class _ContinuationEntry:
5555
expires_at: float
5656

5757

58+
def _build_codex_turn_snapshot(
    normalized: str, payload_dict: dict[str, Any]
) -> CodexContinuationSnapshot:
    """Assemble the continuation snapshot for one recorded Codex turn.

    Fingerprinting the ``input`` list performs a ``json.dumps`` per item, which
    is CPU-heavy for large replay sessions; callers (``record_turn``) may run
    this helper in a worker thread to keep it off the event loop.
    """
    # Evaluate the fingerprints in a fixed order: input, instructions, tools.
    input_prints = fingerprint_input_items(payload_dict.get("input"))
    instructions_print = fingerprint_component(payload_dict.get("instructions"))
    tools_print = fingerprint_component(payload_dict.get("tools"))
    return CodexContinuationSnapshot(
        response_id=normalized,
        input_fingerprints=input_prints,
        instructions_fingerprint=instructions_print,
        tools_fingerprint=tools_print,
    )
70+
71+
5872
class InMemoryCodexContinuationCoordinator(ICodexContinuationCoordinator):
5973
"""Ephemeral TTL/LRU-ish continuation store keyed by Codex request identity."""
6074

@@ -152,14 +166,12 @@ async def record_turn(
152166
key = self._build_key(context)
153167
now = time.monotonic()
154168

155-
# M1: Uses shared fingerprinting utilities
156-
snapshot = CodexContinuationSnapshot(
157-
response_id=normalized,
158-
input_fingerprints=fingerprint_input_items(payload_dict.get("input")),
159-
instructions_fingerprint=fingerprint_component(
160-
payload_dict.get("instructions")
161-
),
162-
tools_fingerprint=fingerprint_component(payload_dict.get("tools")),
169+
# M1: Uses shared fingerprinting utilities. Heavy ``json.dumps`` per input item
170+
# can block the event loop for large http_full_replay sessions; run off-thread.
171+
snapshot = await asyncio.to_thread(
172+
_build_codex_turn_snapshot,
173+
normalized,
174+
payload_dict,
163175
)
164176
async with self._lock:
165177
self._purge_expired(now)

src/connectors/openai_codex/executor.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,11 @@ class ResponseExecutor(IResponseExecutor):
323323
- Credential refresh integration for streaming retries
324324
"""
325325

326+
# INFO logging calls ``json.dumps`` on the full Codex ``input`` list; skip that
327+
# work for very large histories to avoid multi-second event-loop stalls before
328+
# each ``http_full_replay`` upstream request.
329+
_LOG_JSON_MEASURE_MAX_INPUT_ITEMS: int = 120
330+
326331
def __init__(
327332
self,
328333
base_connector: OpenAIConnector,
@@ -823,12 +828,12 @@ async def _streaming_iterator() -> AsyncIterator[ProcessedResponse]:
823828
context,
824829
)
825830
)
826-
if incompatible_tools:
827-
if (
828-
incompatible_tool_retries
829-
< self._max_incompatible_tool_retries
830-
):
831-
retry_for_incompatible_tools = True
831+
if incompatible_tools:
832+
if (
833+
incompatible_tool_retries
834+
< self._max_incompatible_tool_retries
835+
):
836+
retry_for_incompatible_tools = True
832837
restart_stream = True
833838
incompatible_tool_retries += 1
834839
current_payload_dict = self._append_incompatible_tool_retry_steering(
@@ -1588,9 +1593,9 @@ def _log_request_attempt(
15881593
"input_item_count": (
15891594
len(input_items) if isinstance(input_items, list) else 0
15901595
),
1591-
"input_bytes": self._measure_json_bytes(input_items),
1596+
"input_bytes": self._measure_json_bytes_for_log(input_items),
15921597
"tools_count": len(tools) if isinstance(tools, list) else 0,
1593-
"tools_bytes": self._measure_json_bytes(tools),
1598+
"tools_bytes": self._measure_json_bytes_for_log(tools),
15941599
"instructions_bytes": (
15951600
len(instructions.encode("utf-8"))
15961601
if isinstance(instructions, str)
@@ -1599,6 +1604,15 @@ def _log_request_attempt(
15991604
},
16001605
)
16011606

1607+
def _measure_json_bytes_for_log(self, value: Any) -> int | None:
    """Measure ``value``'s JSON size for diagnostics, or ``None`` if too big.

    Like ``_measure_json_bytes`` but returns ``None`` for lists longer than
    ``_LOG_JSON_MEASURE_MAX_INPUT_ITEMS`` so INFO-level request logging never
    pays for a full serialization of a huge replay history.
    """
    skip_measurement = (
        isinstance(value, list)
        and len(value) > self._LOG_JSON_MEASURE_MAX_INPUT_ITEMS
    )
    if skip_measurement:
        return None
    return self._measure_json_bytes(value)
1615+
16021616
@staticmethod
16031617
def _measure_json_bytes(value: Any) -> int:
16041618
if value is None:
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
"""Regression: proxy-side stalls on long ``http_full_replay`` Codex sessions.
2+
3+
Historically, two issues showed up together with ``Observed Codex response id`` / long
4+
histories:
5+
6+
1. ``record_turn`` fingerprinted the entire ``input`` list synchronously on the event
7+
loop (``json.dumps`` per item), freezing the process under large replays.
8+
9+
2. ``ResponseExecutor._log_request_attempt`` computed ``input_bytes`` / ``tools_bytes``
10+
via a full ``json.dumps`` of the entire payload for INFO logs before each upstream
11+
request, which could take multiple seconds for hundreds of messages.
12+
13+
Upstream Codex can still pause between SSE events; these tests only guard the proxy.
14+
"""
15+
16+
from __future__ import annotations
17+
18+
import asyncio
19+
from typing import Any
20+
21+
import pytest
22+
from src.connectors._openai_codex_capabilities import CodexClientCapabilities
23+
from src.connectors.openai_codex.continuation import (
24+
InMemoryCodexContinuationCoordinator,
25+
_build_codex_turn_snapshot,
26+
)
27+
from src.connectors.openai_codex.contracts import (
28+
CodexRequestContext,
29+
ProcessedMessage,
30+
)
31+
from src.connectors.openai_codex.executor import ResponseExecutor
32+
from src.core.domain.chat import CanonicalChatRequest, ChatMessage
33+
34+
35+
def _codex_context(session_id: str) -> CodexRequestContext:
    """Build a minimal streaming Codex request context for ``session_id``."""
    chat_request = CanonicalChatRequest(
        model="gpt-5.1-codex",
        messages=[ChatMessage(role="user", content="hello")],
        stream=True,
    )
    # The coordinator keys continuation state on backend + prompt-cache key.
    continuation_metadata = {
        "continuation_backend": "openai-codex",
        "continuation_prompt_cache_key": "prompt-a",
    }
    return CodexRequestContext(
        request=chat_request,
        processed_messages=[ProcessedMessage(role="user", content="hello")],
        effective_model="gpt-5.1-codex",
        capabilities=CodexClientCapabilities(),
        session_id=session_id,
        metadata=continuation_metadata,
    )
52+
53+
54+
@pytest.mark.asyncio
async def test_record_turn_fingerprints_large_input_via_worker_thread(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``record_turn`` must fingerprint a large ``input`` list off the event loop.

    Regression guard: the snapshot build (one ``json.dumps`` per input item)
    previously ran synchronously on the event loop and froze the process for
    large ``http_full_replay`` sessions. The fix routes it through
    ``asyncio.to_thread``; this test spies on that call and then verifies the
    recorded snapshot is still complete and correct.
    """
    coordinator = InMemoryCodexContinuationCoordinator(ttl_seconds=60, max_entries=4)
    context = _codex_context("session-regression-large-input")
    # 250 items comfortably exceeds any "small payload" fast path, so the
    # worker-thread route is the one exercised.
    big_input = [
        {"type": "message", "role": "user", "content": f"line-{i}"} for i in range(250)
    ]

    # Records every callable handed to asyncio.to_thread by the coordinator.
    to_thread_calls: list[Any] = []

    real_to_thread = asyncio.to_thread

    async def _spy_to_thread(func: Any, /, *args: Any, **kwargs: Any) -> Any:
        # Delegate to the real to_thread so behavior is unchanged; only observe.
        to_thread_calls.append(func)
        return await real_to_thread(func, *args, **kwargs)

    # Patch the name as resolved inside the continuation module, not the global
    # asyncio attribute, so only the code under test is affected.
    monkeypatch.setattr(
        "src.connectors.openai_codex.continuation.asyncio.to_thread",
        _spy_to_thread,
    )

    await coordinator.record_turn(
        context,
        response_id="resp-regression",
        payload_dict={
            "input": big_input,
            "instructions": "inst",
            "tools": [{"type": "function", "name": "read", "parameters": {}}],
        },
    )

    # Exactly one off-thread hop, and it must be the snapshot builder itself.
    assert to_thread_calls == [_build_codex_turn_snapshot]
    snapshot = await coordinator.get_snapshot(context)
    assert snapshot is not None
    assert snapshot.response_id == "resp-regression"
    # Off-loading must not drop items: one fingerprint per input entry.
    assert len(snapshot.input_fingerprints) == 250
92+
93+
94+
def test_measure_json_bytes_for_log_skips_long_lists(
    executor: ResponseExecutor,
) -> None:
    """INFO-only size metrics must not full-serialize huge ``input`` lists."""
    cap = executor._LOG_JSON_MEASURE_MAX_INPUT_ITEMS

    # One item past the cap: measurement is skipped entirely.
    oversized = [{"i": n} for n in range(cap + 1)]
    assert executor._measure_json_bytes_for_log(oversized) is None

    # Well under the cap: a real (positive) byte count is returned.
    small = [{"i": n} for n in range(5)]
    measured = executor._measure_json_bytes_for_log(small)
    assert isinstance(measured, int)
    assert measured > 0

0 commit comments

Comments
 (0)