@@ -41,14 +41,6 @@ class AudioResponseEncoding(str, Enum):
4141 PCM_ALAW = "pcm_alaw"
4242
4343
44- class AudioObjectType (str , Enum ):
45- AUDIO_TTS_CHUNK = "audio.tts.chunk"
46-
47-
48- class StreamSentinelType (str , Enum ):
49- DONE = "[DONE]"
50-
51-
5244class AudioSpeechRequest (BaseModel ):
5345 model : str
5446 input : str
@@ -61,21 +53,8 @@ class AudioSpeechRequest(BaseModel):
6153
6254
6355class AudioSpeechStreamChunk (BaseModel ):
64- object : AudioObjectType = AudioObjectType .AUDIO_TTS_CHUNK
65- model : str
66- b64 : str
67-
68-
69- class AudioSpeechStreamEvent (BaseModel ):
70- data : AudioSpeechStreamChunk
71-
72-
73- class StreamSentinel (BaseModel ):
74- data : StreamSentinelType = StreamSentinelType .DONE
75-
76-
77- class AudioSpeechStreamEventResponse (BaseModel ):
78- response : AudioSpeechStreamEvent | StreamSentinel
56+ type : str = "conversation.item.audio_output.delta"
57+ delta : str
7958
8059
8160class AudioSpeechStreamResponse (BaseModel ):
@@ -127,18 +106,10 @@ def stream_to_file(
127106 if isinstance (chunk .data , bytes ):
128107 audio_chunks .append (chunk .data )
129108 elif isinstance (chunk .data , dict ):
130- # SSE format with JSON/base64
131- try :
132- stream_event = AudioSpeechStreamEventResponse (
133- response = {"data" : chunk .data }
134- )
135- if isinstance (stream_event .response , StreamSentinel ):
136- break
137- audio_chunks .append (
138- base64 .b64decode (stream_event .response .data .b64 )
139- )
140- except Exception :
141- continue # Skip malformed chunks
109+ # SSE format: {"type": "conversation.item.audio_output.delta", "delta": "<base64>"}
110+ delta = chunk .data .get ("delta" )
111+ if delta :
112+ audio_chunks .append (base64 .b64decode (delta ))
142113
143114 if not audio_chunks :
144115 raise ValueError ("No audio data received in streaming response" )
0 commit comments