fix: make audio_interface optional for text-only chat mode (#749)

mvanhorn · claude · web-flow · commit ba601c173727 · 2026-03-26T15:00:03.000+01:00
* fix: make audio_interface optional for text-only chat mode When using chat-only (text_only) mode, requiring an AudioInterface is unnecessary. This makes audio_interface default to None in both Conversation and AsyncConversation, with None-guards around all audio operations (start, stop, output, interrupt). Fixes #632 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: auto-set text_only in config when audio_interface is omitted When audio_interface is None (text-only chat mode), automatically set text_only: True in conversation_config_override so the server skips audio generation. Uses setdefault to preserve explicit user overrides. Applied to both Conversation and AsyncConversation. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix: avoid mutating shared config dict when setting text_only Replace setdefault (which mutates the caller's dict in-place) with a new dict spread. Prevents a reused ConversationInitiationData from inheriting text_only: True when creating a second Conversation with an audio_interface. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * test: add text-only mode tests for Conversation Verifies that: - text_only is auto-set when audio_interface is None - explicit text_only=False override is preserved - original config dict is not mutated (dict spread fix) * refactor: move text_only auto-set to BaseConversation, copy config Move the text_only auto-set logic from both Conversation and AsyncConversation into BaseConversation to eliminate duplication. Build a new ConversationInitiationData from shallow copies of the original's extra_body, conversation_config_override and dynamic_variables dicts, so that auto-setting text_only never mutates the caller's original object (nested dicts inside those fields are still shared with the caller). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> --------- Co-authored-by: Matt Van Horn <455140+mvanhorn@users.noreply.github.com> Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/src/elevenlabs/conversational_ai/conversation.py b/src/elevenlabs/conversational_ai/conversation.py
@@ -388,6 +388,7 @@ def __init__(
         user_id: Optional[str] = None,
         *,
         requires_auth: bool,
+        audio_interface=None,
         config: Optional[ConversationInitiationData] = None,
         client_tools: Optional[ClientTools] = None,
         on_prem_config: Optional[OnPremInitiationData] = None,
@@ -397,7 +398,16 @@ def __init__(
         self.agent_id = agent_id
         self.user_id = user_id
         self.requires_auth = requires_auth
-        self.config = config or ConversationInitiationData()
+        # Shallow-copy the config's dict fields so top-level changes (e.g. text_only) never reach the caller's object
+        src = config or ConversationInitiationData()
+        self.config = ConversationInitiationData(
+            extra_body=dict(src.extra_body),
+            conversation_config_override=dict(src.conversation_config_override),
+            dynamic_variables=dict(src.dynamic_variables),
+            user_id=src.user_id,
+        )
+        if audio_interface is None:
+            self.config.conversation_config_override.setdefault("text_only", True)
         self.client_tools = client_tools or ClientTools()
         self.on_prem_config = on_prem_config
         self.environment = environment
@@ -610,7 +620,7 @@ async def _handle_message_core_async(self, message, message_handler):
 
 
 class Conversation(BaseConversation):
-    audio_interface: AudioInterface
+    audio_interface: Optional[AudioInterface]
     callback_agent_response: Optional[Callable[[str], None]]
     callback_agent_response_correction: Optional[Callable[[str, str], None]]
     callback_agent_chat_response_part: Optional[Callable[[str, AgentChatResponsePartType], None]]
@@ -630,7 +640,7 @@ def __init__(
         user_id: Optional[str] = None,
         *,
         requires_auth: bool,
-        audio_interface: AudioInterface,
+        audio_interface: Optional[AudioInterface] = None,
         config: Optional[ConversationInitiationData] = None,
         client_tools: Optional[ClientTools] = None,
         callback_agent_response: Optional[Callable[[str], None]] = None,
@@ -653,6 +663,7 @@ def __init__(
             user_id: The ID of the user conversing with the agent.
             requires_auth: Whether the agent requires authentication.
             audio_interface: The audio interface to use for input and output.
+                Can be omitted for text-only (chat) mode.
             client_tools: The client tools to use for the conversation.
             callback_agent_response: Callback for agent responses.
             callback_agent_response_correction: Callback for agent response corrections.
@@ -671,6 +682,7 @@ def __init__(
             agent_id=agent_id,
             user_id=user_id,
             requires_auth=requires_auth,
+            audio_interface=audio_interface,
             config=config,
             client_tools=client_tools,
             on_prem_config=on_prem_config,
@@ -701,7 +713,8 @@ def start_session(self):
 
     def end_session(self):
         """Ends the conversation session and cleans up resources."""
-        self.audio_interface.stop()
+        if self.audio_interface is not None:
+            self.audio_interface.stop()
         self.client_tools.stop()
         self._ws = None
         self._should_stop.set()
@@ -830,7 +843,8 @@ def input_callback(audio):
                     logger.error(f"Error sending user audio chunk: {e}")
                     self.end_session()
 
-            self.audio_interface.start(input_callback)
+            if self.audio_interface is not None:
+                self.audio_interface.start(input_callback)
             while not self._should_stop.is_set():
                 try:
                     message = json.loads(ws.recv(timeout=0.5))
@@ -860,7 +874,8 @@ def __init__(self, conversation, ws):
                 self.callback_audio_alignment = conversation.callback_audio_alignment
 
             def handle_audio_output(self, audio):
-                self.conversation.audio_interface.output(audio)
+                if self.conversation.audio_interface is not None:
+                    self.conversation.audio_interface.output(audio)
 
             def handle_audio_alignment(self, alignment):
                 self.conversation.callback_audio_alignment(alignment)
@@ -878,7 +893,8 @@ def handle_user_transcript(self, transcript):
                 self.conversation.callback_user_transcript(transcript)
 
             def handle_interruption(self):
-                self.conversation.audio_interface.interrupt()
+                if self.conversation.audio_interface is not None:
+                    self.conversation.audio_interface.interrupt()
 
             def handle_ping(self, event):
                 self.ws.send(
@@ -905,7 +921,7 @@ def send_response(response):
 
 
 class AsyncConversation(BaseConversation):
-    audio_interface: AsyncAudioInterface
+    audio_interface: Optional[AsyncAudioInterface]
     callback_agent_response: Optional[Callable[[str], Awaitable[None]]]
     callback_agent_response_correction: Optional[Callable[[str, str], Awaitable[None]]]
     callback_agent_chat_response_part: Optional[Callable[[str, AgentChatResponsePartType], Awaitable[None]]]
@@ -925,7 +941,7 @@ def __init__(
         user_id: Optional[str] = None,
         *,
         requires_auth: bool,
-        audio_interface: AsyncAudioInterface,
+        audio_interface: Optional[AsyncAudioInterface] = None,
         config: Optional[ConversationInitiationData] = None,
         client_tools: Optional[ClientTools] = None,
         callback_agent_response: Optional[Callable[[str], Awaitable[None]]] = None,
@@ -948,6 +964,7 @@ def __init__(
             user_id: The ID of the user conversing with the agent.
             requires_auth: Whether the agent requires authentication.
             audio_interface: The async audio interface to use for input and output.
+                Can be omitted for text-only (chat) mode.
             client_tools: The client tools to use for the conversation.
             callback_agent_response: Async callback for agent responses.
             callback_agent_response_correction: Async callback for agent response corrections.
@@ -967,6 +984,7 @@ def __init__(
             agent_id=agent_id,
             user_id=user_id,
             requires_auth=requires_auth,
+            audio_interface=audio_interface,
             config=config,
             client_tools=client_tools,
             on_prem_config=on_prem_config,
@@ -996,7 +1014,8 @@ async def start_session(self):
 
     async def end_session(self):
         """Ends the conversation session and cleans up resources."""
-        await self.audio_interface.stop()
+        if self.audio_interface is not None:
+            await self.audio_interface.stop()
         self.client_tools.stop()
         self._ws = None
         self._should_stop.set()
@@ -1124,7 +1143,8 @@ async def input_callback(audio):
                     logger.error(f"Error sending user audio chunk: {e}")
                     await self.end_session()
 
-            await self.audio_interface.start(input_callback)
+            if self.audio_interface is not None:
+                await self.audio_interface.start(input_callback)
 
             try:
                 while not self._should_stop.is_set():
@@ -1159,7 +1179,8 @@ def __init__(self, conversation, ws):
                 self.callback_audio_alignment = conversation.callback_audio_alignment
 
             async def handle_audio_output(self, audio):
-                await self.conversation.audio_interface.output(audio)
+                if self.conversation.audio_interface is not None:
+                    await self.conversation.audio_interface.output(audio)
 
             async def handle_audio_alignment(self, alignment):
                 await self.conversation.callback_audio_alignment(alignment)
@@ -1177,7 +1198,8 @@ async def handle_user_transcript(self, transcript):
                 await self.conversation.callback_user_transcript(transcript)
 
             async def handle_interruption(self):
-                await self.conversation.audio_interface.interrupt()
+                if self.conversation.audio_interface is not None:
+                    await self.conversation.audio_interface.interrupt()
 
             async def handle_ping(self, event):
                 await self.ws.send(
diff --git a/tests/test_convai.py b/tests/test_convai.py
@@ -398,3 +398,68 @@ def streaming_callback(text, part_type):
     assert streaming_calls[3] == ("!", AgentChatResponsePartType.DELTA)
 
     assert streaming_calls[4] == ("", AgentChatResponsePartType.STOP)
+
+
+def test_text_only_mode_auto_sets_config():
+    """When audio_interface is None, text_only should be auto-set in config."""
+    mock_ws = create_mock_websocket()
+    mock_client = MagicMock()
+    mock_client._client_wrapper.get_base_url.return_value = "https://api.elevenlabs.io"
+
+    conversation = Conversation(
+        client=mock_client,
+        agent_id=TEST_AGENT_ID,
+        requires_auth=False,
+        audio_interface=None,
+    )
+
+    assert conversation.config.conversation_config_override.get("text_only") is True
+
+    with patch("elevenlabs.conversational_ai.conversation.connect") as mock_connect:
+        mock_connect.return_value.__enter__.return_value = mock_ws
+        conversation.start_session()
+        conversation.end_session()
+        conversation.wait_for_session_end()
+
+    send_calls = [call[0][0] for call in mock_ws.send.call_args_list]
+    init_messages = [json.loads(call) for call in send_calls if 'conversation_initiation_client_data' in call]
+    assert len(init_messages) == 1
+    assert init_messages[0]["conversation_config_override"]["text_only"] is True
+
+
+def test_text_only_mode_preserves_explicit_override():
+    """When user explicitly sets text_only=False, the override should be preserved."""
+    mock_client = MagicMock()
+    mock_client._client_wrapper.get_base_url.return_value = "https://api.elevenlabs.io"
+
+    config = ConversationInitiationData(conversation_config_override={"text_only": False})
+    conversation = Conversation(
+        client=mock_client,
+        config=config,
+        agent_id=TEST_AGENT_ID,
+        requires_auth=False,
+        audio_interface=None,
+    )
+
+    assert conversation.config.conversation_config_override["text_only"] is False
+
+
+def test_text_only_mode_does_not_mutate_original_config():
+    """Setting text_only should not mutate the caller's original config dict."""
+    mock_client = MagicMock()
+    mock_client._client_wrapper.get_base_url.return_value = "https://api.elevenlabs.io"
+
+    original_override = {"some_setting": "value"}
+    config = ConversationInitiationData(conversation_config_override=original_override)
+
+    Conversation(
+        client=mock_client,
+        config=config,
+        agent_id=TEST_AGENT_ID,
+        requires_auth=False,
+        audio_interface=None,
+    )
+
+    assert "text_only" not in original_override
+    # The ConversationInitiationData object itself should also be unmodified
+    assert "text_only" not in config.conversation_config_override