Add optional language parameter to synthesize methods.

shaper · shaper · commit 6188890fba8c · 2024-05-30T15:50:57.000-07:00
- defaults to `en`
- language specified as ISO 639-1 code
- server-side support for streaming synthesis is not yet in place
- note: multilingual support is still in beta
diff --git a/demo/stream.py b/demo/stream.py
@@ -38,7 +38,7 @@ async def writer_task(conn, prompt):
 
 async def main(args):
   speech = Speech(os.getenv('LMNT_API_KEY'))
-  conn = await speech.synthesize_streaming(VOICE_ID, return_extras=False)
+  conn = await speech.synthesize_streaming(VOICE_ID, return_extras=False, language=args.language)
 
   t1 = asyncio.create_task(reader_task(conn))
   t2 = asyncio.create_task(writer_task(conn, args.prompt))
@@ -51,4 +51,5 @@ async def main(args):
 if __name__ == '__main__':
   parser = ArgumentParser()
   parser.add_argument('prompt', default=DEFAULT_PROMPT, nargs='?')
+  parser.add_argument('-l', '--language', required=False, default='en', help='Language code')
   asyncio.run(main(parser.parse_args()))
diff --git a/demo/synthesize.py b/demo/synthesize.py
@@ -18,7 +18,7 @@ async def main(args):
     print(account)
 
     # Synthesize text to speech.
-    synthesize = await s.synthesize(text=args.text, voice=args.voice)
+    synthesize = await s.synthesize(text=args.text, voice=args.voice, language=args.language)
     with open('output.mp3', 'wb') as f:
       f.write(synthesize['audio'])
     print('Done.')
@@ -27,5 +27,6 @@ async def main(args):
   parser = argparse.ArgumentParser(description='Synthesize text to speech using LMNT API')
   parser.add_argument('-t', '--text', required=False, default='This is a test of the LMNT API.', help='Text to synthesize')
   parser.add_argument('-v', '--voice', required=False, default='lily', help='Voice to use')
+  parser.add_argument('-l', '--language', required=False, default='en', help='Language code')
   args = parser.parse_args()
   asyncio.run(main(args))
diff --git a/src/lmnt/api.py b/src/lmnt/api.py
@@ -319,6 +319,7 @@ async def synthesize(self, text: str, voice: str, **kwargs):
     - `speed` (float): Floating point value between 0.25 (slow) and 2.0 (fast); Defaults to 1.0
     - `return_durations` (bool): If `True`, the response will include word durations detail. Defaults to `False`.
     - `return_seed` (bool): If `True`, the response will include the seed used for synthesis. Defaults to `False`.
+    - `language` (str): The desired language of the synthesized speech. Two letter ISO 639-1 code. Defaults to `en`.
     - `length` (int): The desired target length of the output speech in seconds. Maximum 300.0 (5 minutes)
 
     Deprecated parameters:
@@ -362,7 +363,8 @@ async def synthesize(self, text: str, voice: str, **kwargs):
     if return_durations is True:
       form_data.add_field('return_durations', 'true')
     return_seed = kwargs.get('return_seed', False)
-
+    if 'language' in kwargs:
+      form_data.add_field('language', kwargs.get('language'))
     async with self._session.post(url, data=form_data, headers=self._build_headers()) as resp:
       await self._handle_response_errors(resp, 'Speech.synthesize')
       response_data = await resp.json()
@@ -384,6 +386,7 @@ async def synthesize_streaming(self, voice: str, return_extras: bool = False, **
     - `voice` (str): The voice id to use for this connection.
     - `speed` (float): The speed to use for synthesis. Defaults to 1.0.
     - `return_extras` (bool): If `True`, the response will include word durations detail. Defaults to `False`.
+    - `language` (str): The desired language of the synthesized speech. Two letter ISO 639-1 code. Defaults to `en`.
 
     Returns:
     - `StreamingSynthesisConnection`: The streaming connection object.
@@ -406,6 +409,8 @@ async def synthesize_streaming(self, voice: str, return_extras: bool = False, **
     if 'expressive' in kwargs:
       init_msg['expressive'] = kwargs['expressive']
     init_msg['send_extras'] = return_extras
+    if 'language' in kwargs:
+      init_msg['language'] = kwargs['language']
     ws = await self._session.ws_connect(f'{self._base_url}{_SYNTHESIZE_STREAMING_ENDPOINT}')
     await ws.send_str(json.dumps(init_msg))
     return StreamingSynthesisConnection(ws, return_extras)
diff --git a/test/integration/smoke_test.py b/test/integration/smoke_test.py
@@ -107,6 +107,20 @@ async def test_synthesize(api: Speech):
   assert isinstance(result['audio'], bytes)
 
 
+@pytest.mark.asyncio
+async def test_synthesize__non_en_language(api: Speech):
+  voice = 'lily'
+  text = 'Example Text'
+  language = 'pt'
+  result = await api.synthesize(text=text, voice=voice, language=language)
+  assert result is not None
+  assert 'audio' in result
+  assert 'durations' not in result
+  assert 'seed' not in result
+  assert len(result['audio']) > 0
+  assert isinstance(result['audio'], bytes)
+
+
 @pytest.mark.asyncio
 async def test_synthesize_with_empty_voice(api: Speech):
   voice = ''
diff --git a/test/unit/test_speech.py b/test/unit/test_speech.py
@@ -71,6 +71,19 @@ async def test_synthesize_return_durations_and_seed(api):
   assert synthesis_result == {'audio': base64.b64decode(mock_response['audio']), 'durations': mock_response['durations'], 'seed': mock_response['seed']}
 
 
+@pytest.mark.asyncio
+async def test_synthesize__non_en_language(api):
+  text = 'Hello, world!'
+  voice = 'Voice1'
+  language = 'pt'
+  mock_response = {'audio': MOCK_AUDIO, 'durations': [], 'seed': 'random_seed'}
+  api._session.post.return_value.__aenter__.return_value.json = AsyncMock(return_value=mock_response)
+  api._session.post.return_value.__aenter__.return_value.status = 200
+
+  synthesis_result = await api.synthesize(text, voice, language=language)
+  assert synthesis_result == {'audio': base64.b64decode(mock_response['audio'])}
+
+
 @pytest.mark.asyncio
 async def test_synthesize_no_text(api):
   with pytest.raises(AssertionError):
@@ -113,12 +126,13 @@ async def test_synthesize_streaming(api):
   speed = 1.5
   expressive = 0.8
   return_extras = True
+  language = 'pt'
 
   mock_ws = AsyncMock()
   api._session = AsyncMock()
   api._session.ws_connect.return_value = mock_ws
 
-  connection = await api.synthesize_streaming(voice, return_extras=return_extras, speed=speed, expressive=expressive)
+  connection = await api.synthesize_streaming(voice, return_extras=return_extras, speed=speed, expressive=expressive, language=language)
 
   assert isinstance(connection, StreamingSynthesisConnection)
   api._session.ws_connect.assert_called_once_with(f'{api._base_url}{_SYNTHESIZE_STREAMING_ENDPOINT}')
@@ -127,7 +141,8 @@ async def test_synthesize_streaming(api):
       'voice': voice,
       'speed': speed,
       'expressive': expressive,
-      'send_extras': return_extras
+      'send_extras': return_extras,
+      'language': language
   }))