# Source: together-python/src/together/resources/audio/speech.py
# (togethercomputer/together-python @ 5a8c26316b0d022fe8118c9e3c319c1e6283fd31)
from __future__ import annotations

from typing import Any, AsyncGenerator, Dict, Iterator, List, Union

from together.abstract import api_requestor
from together.together_response import TogetherResponse
from together.types import (
    AudioSpeechRequest,
    AudioResponseFormat,
    AudioLanguage,
    AudioResponseEncoding,
    AudioSpeechStreamChunk,
    AudioSpeechStreamResponse,
    TogetherClient,
    TogetherRequest,
)


class Speech:
    """Synchronous text-to-speech resource backed by the ``audio/speech`` endpoint."""

    def __init__(self, client: TogetherClient) -> None:
        # Kept so every call can hand the same client to a fresh APIRequestor.
        self._client = client

    @staticmethod
    def _resolve_sample_rate(model: str, sample_rate: int | None) -> int:
        """Return ``sample_rate``, or the per-model default when it is ``None``."""
        if sample_rate is not None:
            return sample_rate
        # Models whose name contains "cartesia" default to 44100; all others to 24000.
        return 44100 if "cartesia" in model else 24000

    def create(
        self,
        *,
        model: str,
        input: str,
        voice: str | None = None,
        response_format: str = "wav",
        language: str = "en",
        response_encoding: str = "pcm_f32le",
        sample_rate: int | None = None,
        stream: bool = False,
        **kwargs: Any,
    ) -> AudioSpeechStreamResponse:
        """
        Method to generate audio from input text using a specified model.

        Args:
            model (str): The name of the model to query.
            input (str): Input text to generate the audio for.
            voice (str, optional): The voice to use for generating the audio.
                Defaults to None.
            response_format (str, optional): The format of audio output.
                Defaults to "wav".
            language (str, optional): Language of input text.
                Defaults to "en".
            response_encoding (str, optional): Audio encoding of response.
                Defaults to "pcm_f32le".
            sample_rate (int, optional): Sampling rate to use for the output audio.
                Defaults to None, in which case 44100 is used when the model name
                contains "cartesia" and 24000 otherwise.
            stream (bool, optional): If true, output is streamed for several characters at a time.
                Defaults to False.
            **kwargs: Additional fields forwarded unchanged to AudioSpeechRequest.

        Returns:
            AudioSpeechStreamResponse: Wrapper around the response object returned
                by the requestor, for both streaming and non-streaming calls.
        """

        sample_rate = self._resolve_sample_rate(model, sample_rate)

        requestor = api_requestor.APIRequestor(
            client=self._client,
        )

        # The plain-string arguments are converted to their typed counterparts
        # before the request model is built; exclude_none=True keeps unset
        # optional fields (such as voice) out of the payload.
        parameter_payload = AudioSpeechRequest(
            model=model,
            input=input,
            voice=voice,
            response_format=AudioResponseFormat(response_format),
            language=AudioLanguage(language),
            response_encoding=AudioResponseEncoding(response_encoding),
            sample_rate=sample_rate,
            stream=stream,
            **kwargs,
        ).model_dump(exclude_none=True)

        # Only the first element of the returned triple is needed here.
        response, _, _ = requestor.request(
            options=TogetherRequest(
                method="POST",
                url="audio/speech",
                params=parameter_payload,
            ),
            stream=stream,
        )

        return AudioSpeechStreamResponse(response=response)


class AsyncSpeech:
    """Asynchronous text-to-speech resource backed by the ``audio/speech`` endpoint."""

    def __init__(self, client: TogetherClient) -> None:
        # Kept so every call can hand the same client to a fresh APIRequestor.
        self._client = client

    @staticmethod
    def _resolve_sample_rate(model: str, sample_rate: int | None) -> int:
        """Return ``sample_rate``, or the per-model default when it is ``None``."""
        if sample_rate is not None:
            return sample_rate
        # Models whose name contains "cartesia" default to 44100; all others to 24000.
        return 44100 if "cartesia" in model else 24000

    async def create(
        self,
        *,
        model: str,
        input: str,
        voice: str | None = None,
        response_format: str = "wav",
        language: str = "en",
        response_encoding: str = "pcm_f32le",
        sample_rate: int | None = None,
        stream: bool = False,
        **kwargs: Any,
    ) -> AudioSpeechStreamResponse:
        """
        Async method to generate audio from input text using a specified model.

        Args:
            model (str): The name of the model to query.
            input (str): Input text to generate the audio for.
            voice (str, optional): The voice to use for generating the audio.
                Defaults to None.
            response_format (str, optional): The format of audio output.
                Defaults to "wav".
            language (str, optional): Language of input text.
                Defaults to "en".
            response_encoding (str, optional): Audio encoding of response.
                Defaults to "pcm_f32le".
            sample_rate (int, optional): Sampling rate to use for the output audio.
                Defaults to None, in which case 44100 is used when the model name
                contains "cartesia" and 24000 otherwise (the same rule the
                synchronous Speech.create applies).
            stream (bool, optional): If true, output is streamed for several characters at a time.
                Defaults to False.
            **kwargs: Additional fields forwarded unchanged to AudioSpeechRequest.

        Returns:
            AudioSpeechStreamResponse: Wrapper around the response object returned
                by the requestor, for both streaming and non-streaming calls.
        """

        # A fixed 44100 default was previously sent for every model; resolve it
        # per model instead so sync and async calls build the same payload.
        sample_rate = self._resolve_sample_rate(model, sample_rate)

        requestor = api_requestor.APIRequestor(
            client=self._client,
        )

        # The plain-string arguments are converted to their typed counterparts
        # before the request model is built; exclude_none=True keeps unset
        # optional fields (such as voice) out of the payload.
        parameter_payload = AudioSpeechRequest(
            model=model,
            input=input,
            voice=voice,
            response_format=AudioResponseFormat(response_format),
            language=AudioLanguage(language),
            response_encoding=AudioResponseEncoding(response_encoding),
            sample_rate=sample_rate,
            stream=stream,
            **kwargs,
        ).model_dump(exclude_none=True)

        # Only the first element of the returned triple is needed here.
        response, _, _ = await requestor.arequest(
            options=TogetherRequest(
                method="POST",
                url="audio/speech",
                params=parameter_payload,
            ),
            stream=stream,
        )

        return AudioSpeechStreamResponse(response=response)