feat: add reference_id parameter to TTS conversion methods

twangodev · twangodev · commit b1d0129dbc21 · 2025-11-11T01:56:47.000-06:00
Signed-off-by: James Ding <jamesding365@gmail.com>
diff --git a/src/fishaudio/resources/tts.py b/src/fishaudio/resources/tts.py
@@ -58,6 +58,7 @@ def convert(
         self,
         *,
         text: str,
+        reference_id: Optional[str] = None,
         config: TTSConfig = TTSConfig(),
         model: Model = "s1",
         request_options: Optional[RequestOptions] = None,
@@ -67,6 +68,7 @@ def convert(
 
         Args:
             text: Text to synthesize
+            reference_id: Voice reference ID (overridden by config.reference_id if set)
             config: TTS configuration (audio settings, voice, model parameters)
             model: TTS model to use
             request_options: Request-level overrides
@@ -83,6 +85,9 @@ def convert(
             # Simple usage with defaults
             audio = client.tts.convert(text="Hello world")
 
+            # With reference_id parameter
+            audio = client.tts.convert(text="Hello world", reference_id="your_model_id")
+
             # Custom configuration
             config = TTSConfig(format="wav", mp3_bitrate=192)
             audio = client.tts.convert(text="Hello world", config=config)
@@ -94,6 +99,11 @@ def convert(
         """
         # Build request payload from config
         request = _config_to_tts_request(config, text)
+
+        # Use parameter reference_id only if config doesn't have one
+        if request.reference_id is None and reference_id is not None:
+            request.reference_id = reference_id
+
         payload = request.model_dump(exclude_none=True)
 
         # Make request with streaming
@@ -114,6 +124,7 @@ def stream_websocket(
         self,
         text_stream: Iterable[Union[str, TextEvent, FlushEvent]],
         *,
+        reference_id: Optional[str] = None,
         config: TTSConfig = TTSConfig(),
         model: Model = "s1",
         max_workers: int = 10,
@@ -125,6 +136,7 @@ def stream_websocket(
 
         Args:
             text_stream: Iterator of text chunks to stream
+            reference_id: Voice reference ID (overridden by config.reference_id if set)
             config: TTS configuration (audio settings, voice, model parameters)
             model: TTS model to use
             max_workers: ThreadPoolExecutor workers for concurrent sender
@@ -148,6 +160,11 @@ def text_generator():
                 for audio_chunk in client.tts.stream_websocket(text_generator()):
                     f.write(audio_chunk)
 
+            # With reference_id parameter
+            with open("output.mp3", "wb") as f:
+                for audio_chunk in client.tts.stream_websocket(text_generator(), reference_id="your_model_id"):
+                    f.write(audio_chunk)
+
             # Custom configuration
             config = TTSConfig(format="wav", latency="normal")
             with open("output.wav", "wb") as f:
@@ -158,6 +175,10 @@ def text_generator():
         # Build TTSRequest from config
         tts_request = _config_to_tts_request(config, text="")
 
+        # Use parameter reference_id only if config doesn't have one
+        if tts_request.reference_id is None and reference_id is not None:
+            tts_request.reference_id = reference_id
+
         executor = ThreadPoolExecutor(max_workers=max_workers)
 
         try:
@@ -202,6 +223,7 @@ async def convert(
         self,
         *,
         text: str,
+        reference_id: Optional[str] = None,
         config: TTSConfig = TTSConfig(),
         model: Model = "s1",
         request_options: Optional[RequestOptions] = None,
@@ -211,6 +233,7 @@ async def convert(
 
         Args:
             text: Text to synthesize
+            reference_id: Voice reference ID (overridden by config.reference_id if set)
             config: TTS configuration (audio settings, voice, model parameters)
             model: TTS model to use
             request_options: Request-level overrides
@@ -227,6 +250,9 @@ async def convert(
             # Simple usage with defaults
             audio = await client.tts.convert(text="Hello world")
 
+            # With reference_id parameter
+            audio = await client.tts.convert(text="Hello world", reference_id="your_model_id")
+
             # Custom configuration
             config = TTSConfig(format="wav", mp3_bitrate=192)
             audio = await client.tts.convert(text="Hello world", config=config)
@@ -238,6 +264,11 @@ async def convert(
         """
         # Build request payload from config
         request = _config_to_tts_request(config, text)
+
+        # Use parameter reference_id only if config doesn't have one
+        if request.reference_id is None and reference_id is not None:
+            request.reference_id = reference_id
+
         payload = request.model_dump(exclude_none=True)
 
         # Make request with streaming
@@ -258,6 +289,7 @@ async def stream_websocket(
         self,
         text_stream: AsyncIterable[Union[str, TextEvent, FlushEvent]],
         *,
+        reference_id: Optional[str] = None,
         config: TTSConfig = TTSConfig(),
         model: Model = "s1",
     ):
@@ -268,6 +300,7 @@ async def stream_websocket(
 
         Args:
             text_stream: Async iterator of text chunks to stream
+            reference_id: Voice reference ID (overridden by config.reference_id if set)
             config: TTS configuration (audio settings, voice, model parameters)
             model: TTS model to use
 
@@ -290,6 +323,11 @@ async def text_generator():
                 async for audio_chunk in client.tts.stream_websocket(text_generator()):
                     await f.write(audio_chunk)
 
+            # With reference_id parameter
+            async with aiofiles.open("output.mp3", "wb") as f:
+                async for audio_chunk in client.tts.stream_websocket(text_generator(), reference_id="your_model_id"):
+                    await f.write(audio_chunk)
+
             # Custom configuration
             config = TTSConfig(format="wav", latency="normal")
             async with aiofiles.open("output.wav", "wb") as f:
@@ -300,6 +338,10 @@ async def text_generator():
         # Build TTSRequest from config
         tts_request = _config_to_tts_request(config, text="")
 
+        # Use parameter reference_id only if config doesn't have one
+        if tts_request.reference_id is None and reference_id is not None:
+            tts_request.reference_id = reference_id
+
         ws: AsyncWebSocketSession
         async with aconnect_ws(
             "/v1/tts/live",
diff --git a/tests/unit/test_tts.py b/tests/unit/test_tts.py
@@ -81,6 +81,39 @@ def test_convert_with_reference_id(self, tts_client, mock_client_wrapper):
         payload = ormsgpack.unpackb(call_args[1]["content"])
         assert payload["reference_id"] == "voice_123"
 
+    def test_convert_with_reference_id_parameter(self, tts_client, mock_client_wrapper):
+        """Test TTS with reference_id as direct parameter."""
+        mock_response = Mock()
+        mock_response.iter_bytes.return_value = iter([b"audio"])
+        mock_client_wrapper.request.return_value = mock_response
+
+        list(tts_client.convert(text="Hello", reference_id="voice_456"))
+
+        # Verify reference_id in payload
+        call_args = mock_client_wrapper.request.call_args
+        payload = ormsgpack.unpackb(call_args[1]["content"])
+        assert payload["reference_id"] == "voice_456"
+
+    def test_convert_config_reference_id_overrides_parameter(
+        self, tts_client, mock_client_wrapper
+    ):
+        """Test that config.reference_id overrides parameter reference_id."""
+        mock_response = Mock()
+        mock_response.iter_bytes.return_value = iter([b"audio"])
+        mock_client_wrapper.request.return_value = mock_response
+
+        config = TTSConfig(reference_id="voice_from_config")
+        list(
+            tts_client.convert(
+                text="Hello", reference_id="voice_from_param", config=config
+            )
+        )
+
+        # Verify config reference_id takes precedence
+        call_args = mock_client_wrapper.request.call_args
+        payload = ormsgpack.unpackb(call_args[1]["content"])
+        assert payload["reference_id"] == "voice_from_config"
+
     def test_convert_with_references(self, tts_client, mock_client_wrapper):
         """Test TTS with reference audio samples."""
         mock_response = Mock()
@@ -282,6 +315,55 @@ async def async_iter_bytes():
         payload = ormsgpack.unpackb(call_args[1]["content"])
         assert payload["reference_id"] == "voice_123"
 
+    @pytest.mark.asyncio
+    async def test_convert_with_reference_id_parameter(
+        self, async_tts_client, async_mock_client_wrapper
+    ):
+        """Test async TTS with reference_id as direct parameter."""
+        mock_response = Mock()
+
+        async def async_iter_bytes():
+            yield b"audio"
+
+        mock_response.aiter_bytes = async_iter_bytes
+        async_mock_client_wrapper.request = AsyncMock(return_value=mock_response)
+
+        audio_chunks = []
+        async for chunk in async_tts_client.convert(
+            text="Hello", reference_id="voice_456"
+        ):
+            audio_chunks.append(chunk)
+
+        # Verify reference_id in payload
+        call_args = async_mock_client_wrapper.request.call_args
+        payload = ormsgpack.unpackb(call_args[1]["content"])
+        assert payload["reference_id"] == "voice_456"
+
+    @pytest.mark.asyncio
+    async def test_convert_config_reference_id_overrides_parameter(
+        self, async_tts_client, async_mock_client_wrapper
+    ):
+        """Test that config.reference_id overrides parameter reference_id (async)."""
+        mock_response = Mock()
+
+        async def async_iter_bytes():
+            yield b"audio"
+
+        mock_response.aiter_bytes = async_iter_bytes
+        async_mock_client_wrapper.request = AsyncMock(return_value=mock_response)
+
+        config = TTSConfig(reference_id="voice_from_config")
+        audio_chunks = []
+        async for chunk in async_tts_client.convert(
+            text="Hello", reference_id="voice_from_param", config=config
+        ):
+            audio_chunks.append(chunk)
+
+        # Verify config reference_id takes precedence
+        call_args = async_mock_client_wrapper.request.call_args
+        payload = ormsgpack.unpackb(call_args[1]["content"])
+        assert payload["reference_id"] == "voice_from_config"
+
     @pytest.mark.asyncio
     async def test_convert_with_prosody(
         self, async_tts_client, async_mock_client_wrapper