Retry incompatible Codex tool calls after text output

Mateusz · Mateusz · commit 36775d4c548d · 2026-04-23T16:11:46.000+02:00
diff --git a/src/connectors/openai_codex/executor.py b/src/connectors/openai_codex/executor.py
@@ -823,12 +823,12 @@ async def _streaming_iterator() -> AsyncIterator[ProcessedResponse]:
                                         context,
                                     )
                                 )
-                                if incompatible_tools and not visible_output_emitted:
-                                    if (
-                                        incompatible_tool_retries
-                                        < self._max_incompatible_tool_retries
-                                    ):
-                                        retry_for_incompatible_tools = True
+                                if incompatible_tools:
+                                    if (
+                                        incompatible_tool_retries
+                                        < self._max_incompatible_tool_retries
+                                    ):
+                                        retry_for_incompatible_tools = True
                                         restart_stream = True
                                         incompatible_tool_retries += 1
                                         current_payload_dict = self._append_incompatible_tool_retry_steering(
diff --git a/tests/unit/connectors/openai_codex/test_executor_streaming.py b/tests/unit/connectors/openai_codex/test_executor_streaming.py
@@ -345,10 +345,10 @@ async def test_execute_streaming_handshake_maps_instruction_invalid_error(
                 pass
 
         assert exc_info.value.status_code == 400
-        assert isinstance(exc_info.value.detail, dict)
-        detail = exc_info.value.detail
-        assert detail.get("error") == "codex_instructions_invalid"
-        assert "prompt_mode" in str(detail.get("suggestion", ""))
+        assert isinstance(exc_info.value.detail, dict)
+        detail = exc_info.value.detail
+        assert detail.get("error") == "codex_instructions_invalid"
+        assert "prompt_mode" in str(detail.get("suggestion", ""))
 
     @pytest.mark.asyncio
     async def test_execute_streaming_handshake_uses_retry_after_from_error_detail(
@@ -536,11 +536,11 @@ async def test_execute_streaming_handshake_429_usage_limit_notifies_when_rotatio
 
         assert exc_info.value.status_code == 429
         mock_credential_manager.notify_codex_usage_limit_unrecovered.assert_awaited_once()
-        await_args = (
-            mock_credential_manager.notify_codex_usage_limit_unrecovered.await_args
-        )
-        assert await_args is not None
-        notify_kw = cast(dict[str, Any], await_args.kwargs)
+        await_args = (
+            mock_credential_manager.notify_codex_usage_limit_unrecovered.await_args
+        )
+        assert await_args is not None
+        notify_kw = cast(dict[str, Any], await_args.kwargs)
         assert notify_kw["upstream_detail"] == detail
         assert notify_kw["pool_exhaustion_confirmed"] is True
 
@@ -663,7 +663,7 @@ async def test_execute_streaming_handshake_auth_retry_exhausted(
 
         # Exception is raised when consuming the stream
         assert result.content is not None
-        content = result.content
+        content = result.content
         with pytest.raises(HTTPException) as exc_info:
             async for _ in content:
                 pass
@@ -928,7 +928,7 @@ async def auth_error_iterator():
 
         # Should raise after retries exhausted
         assert result.content is not None
-        content = result.content
+        content = result.content
         with pytest.raises(HTTPException) as exc_info:
             async for _ in content:
                 pass
@@ -962,7 +962,7 @@ async def test_execute_streaming_refresh_fails(
 
         # Exception is raised when consuming the stream after refresh fails
         assert result.content is not None
-        content = result.content
+        content = result.content
         with pytest.raises(HTTPException) as exc_info:
             async for _ in content:
                 pass
@@ -1111,6 +1111,84 @@ async def streaming_side_effect(
         assert matching[-1].retry_reason == "incompatible_tools"
         assert matching[-1].response_id == "resp_retry_123"
 
+    async def test_execute_streaming_retries_incompatible_tool_call_after_text_output(
+        self, executor, sample_context, streaming_payload
+    ) -> None:
+        """A stream is still retried when text precedes an incompatible tool call."""
+        compatibility_layer = MagicMock(spec=ICompatibilityLayer)
+        compatibility_layer.detect_incompatible_tool_calls.return_value = [
+            "apply_patch"
+        ]
+        compatibility_layer.append_incompatible_tool_steering.side_effect = (
+            lambda payload_dict, incompatible_tools, context: {
+                **payload_dict,
+                "instructions": "retry steering",
+            }
+        )
+        executor._compatibility_layer = compatibility_layer
+
+        first_handle = MagicMock()
+        first_handle.headers = {}
+        first_handle.cancel_callback = AsyncMock()
+
+        async def first_iterator():  # text first, then the apply_patch call
+            yield ProcessedResponse(
+                content={"choices": [{"delta": {"content": "Working on it."}}]},
+                metadata={},
+            )
+            yield ProcessedResponse(
+                content={
+                    "type": "response.output_item.added",
+                    "item": {"type": "function_call", "name": "apply_patch"},
+                }
+            )
+
+        first_handle.iterator = first_iterator()
+
+        second_handle = MagicMock()
+        second_handle.headers = {}
+        second_handle.cancel_callback = AsyncMock()
+
+        async def second_iterator():  # replacement stream served on the retry
+            yield ProcessedResponse(
+                content={"choices": [{"delta": {"content": "Using native edit."}}]},
+                metadata={},
+            )
+
+        second_handle.iterator = second_iterator()
+
+        captured_payloads: list[dict[str, object]] = []
+
+        async def streaming_side_effect(
+            url, payload_dict, headers, session_id, *args, **kwargs
+        ):
+            captured_payloads.append(dict(payload_dict))
+            if len(captured_payloads) == 1:  # first attempt
+                return first_handle
+            return second_handle
+
+        executor._base_connector._handle_streaming_response = AsyncMock(
+            side_effect=streaming_side_effect
+        )
+
+        result = await executor.execute(streaming_payload, sample_context)
+        chunks = [
+            chunk
+            async for chunk in cast(AsyncIterator[ProcessedResponse], result.content)
+        ]
+
+        assert len(chunks) == 2  # pre-retry text + retry output, no tool event
+        assert chunks[0].content == {
+            "choices": [{"delta": {"content": "Working on it."}}]
+        }
+        assert chunks[1].content == {
+            "choices": [{"delta": {"content": "Using native edit."}}]
+        }
+        assert len(captured_payloads) == 2
+        assert captured_payloads[1]["instructions"] == "retry steering"
+        first_handle.cancel_callback.assert_awaited()  # abandoned stream is cancelled
+        compatibility_layer.append_incompatible_tool_steering.assert_called_once()
+
     async def test_conversation_id_preserved_across_streaming_retries(
         self,
         executor,
@@ -2016,7 +2094,7 @@ async def handle_streaming_side_effect(
 
         first_result = await executor.execute(first_payload, sample_context)
         assert first_result.content is not None
-        first_stream = first_result.content
+        first_stream = first_result.content
         first_chunk = await anext(first_stream)
         assert isinstance(first_chunk, ProcessedResponse)
         await cast(Any, first_stream).aclose()
@@ -2456,10 +2534,10 @@ async def test_normalize_processed_stream_chunk_marks_tool_call_emission(
         assert content["choices"][0]["finish_reason"] == "tool_calls"
 
     @pytest.mark.asyncio
-    async def test_normalize_processed_stream_chunk_marks_function_call_done_as_tool_output(
-        self,
-        mock_base_connector,
-        mock_credential_manager,
+    async def test_normalize_processed_stream_chunk_marks_function_call_done_as_tool_output(
+        self,
+        mock_base_connector,
+        mock_credential_manager,
     ) -> None:
         mock_base_connector.translation_service = TranslationService()
         executor = ResponseExecutor(
@@ -2479,39 +2557,39 @@ async def test_normalize_processed_stream_chunk_marks_function_call_done_as_tool
         normalized = executor._normalize_processed_stream_chunk(chunk)
 
         assert normalized.metadata.get("tool_call_emitted") is True
-        assert normalized.metadata.get("finish_reason") == "tool_calls"
-        content = cast(dict[str, Any], normalized.content)
-        assert content["choices"][0]["delta"] == {}
-
-    @pytest.mark.asyncio
-    async def test_normalize_processed_stream_chunk_overrides_falsey_tool_markers(
-        self,
-        mock_base_connector,
-        mock_credential_manager,
-    ) -> None:
-        mock_base_connector.translation_service = TranslationService()
-        executor = ResponseExecutor(
-            mock_base_connector,
-            mock_credential_manager,
-        )
-
-        chunk = ProcessedResponse(
-            content={
-                "type": "response.function_call_arguments.done",
-                "item_id": "fc_ws_tool",
-                "arguments": '{"command":["bash","-lc","git status --short"]}',
-            },
-            metadata={
-                "event_type": "response.function_call_arguments.done",
-                "tool_call_emitted": False,
-                "finish_reason": None,
-            },
-        )
-
-        normalized = executor._normalize_processed_stream_chunk(chunk)
-
-        assert normalized.metadata.get("tool_call_emitted") is True
-        assert normalized.metadata.get("finish_reason") == "tool_calls"
+        assert normalized.metadata.get("finish_reason") == "tool_calls"
+        content = cast(dict[str, Any], normalized.content)
+        assert content["choices"][0]["delta"] == {}
+
+    @pytest.mark.asyncio
+    async def test_normalize_processed_stream_chunk_overrides_falsey_tool_markers(
+        self,
+        mock_base_connector,
+        mock_credential_manager,
+    ) -> None:
+        mock_base_connector.translation_service = TranslationService()
+        executor = ResponseExecutor(
+            mock_base_connector,
+            mock_credential_manager,
+        )
+
+        chunk = ProcessedResponse(
+            content={
+                "type": "response.function_call_arguments.done",
+                "item_id": "fc_ws_tool",
+                "arguments": '{"command":["bash","-lc","git status --short"]}',
+            },
+            metadata={
+                "event_type": "response.function_call_arguments.done",
+                "tool_call_emitted": False,
+                "finish_reason": None,
+            },
+        )
+
+        normalized = executor._normalize_processed_stream_chunk(chunk)
+
+        assert normalized.metadata.get("tool_call_emitted") is True
+        assert normalized.metadata.get("finish_reason") == "tool_calls"
 
     @pytest.mark.asyncio
     async def test_normalize_processed_stream_chunk_marks_local_shell_item_done_as_tool_output(