@@ -3661,6 +3661,238 @@ paths:
36613661 schema :
36623662 $ref : '#/components/schemas/ErrorData'
36633663
3664+ /realtime :
3665+ get :
3666+ tags : ['Audio']
3667+ summary : Real-time audio transcription via WebSocket
3668+ description : |
3669+ Establishes a WebSocket connection for real-time audio transcription. This endpoint uses WebSocket protocol (wss://api.together.ai/v1/realtime) for bidirectional streaming communication.
3670+
3671+ **Connection Setup:**
3672+ - Protocol: WebSocket (wss://)
3673+ - Authentication: Pass API key as Bearer token in Authorization header
3674+ - Parameters: Sent as query parameters (model, input_audio_format)
3675+
3676+ **Client Events:**
3677+ - `input_audio_buffer.append`: Send audio chunks as base64-encoded data
3678+ ```json
3679+ {
3680+ "type": "input_audio_buffer.append",
3681+ "audio": "<base64_encoded_audio_chunk>"
3682+ }
3683+ ```
3684+ - `input_audio_buffer.commit`: Signal end of audio stream
3685+ ```json
3686+ {
3687+ "type": "input_audio_buffer.commit"
3688+ }
3689+ ```
3690+
3691+ **Server Events:**
3692+ - `session.created`: Initial session confirmation (sent first)
3693+ ```json
3694+ {
3695+ "type": "session.created",
3696+ "session": {
3697+ "id": "session-id",
3698+ "object": "realtime.session",
3699+ "modalities": ["audio"],
3700+ "model": "openai/whisper-large-v3"
3701+ }
3702+ }
3703+ ```
3704+ - `conversation.item.input_audio_transcription.delta`: Partial transcription results
3705+ ```json
3706+ {
3707+ "type": "conversation.item.input_audio_transcription.delta",
3708+ "delta": "The quick brown"
3709+ }
3710+ ```
3711+ - `conversation.item.input_audio_transcription.completed`: Final transcription
3712+ ```json
3713+ {
3714+ "type": "conversation.item.input_audio_transcription.completed",
3715+ "transcript": "The quick brown fox jumps over the lazy dog"
3716+ }
3717+ ```
3718+ - `conversation.item.input_audio_transcription.failed`: Error occurred
3719+ ```json
3720+ {
3721+ "type": "conversation.item.input_audio_transcription.failed",
3722+ "error": {
3723+ "message": "Error description",
3724+ "type": "invalid_request_error",
3725+ "param": null,
3726+ "code": "invalid_api_key"
3727+ }
3728+ }
3729+ ```
3730+
3731+ **Error Codes:**
3732+ - `invalid_api_key`: Invalid API key provided (401)
3733+ - `missing_api_key`: Authorization header missing (401)
3734+ - `model_not_available`: Invalid or unavailable model (400)
3735+ - Unsupported audio format errors (400)
3736+
3737+ operationId : realtime-transcription
3738+ x-codeSamples :
3739+ - lang : Python
3740+ label : Python WebSocket Client
3741+ source : |
3742+ import asyncio
3743+ import websockets
3744+ import json
3745+ import base64
3746+ import os
3747+
3748+ async def transcribe_audio():
3749+ api_key = os.environ.get("TOGETHER_API_KEY")
3750+ url = "wss://api.together.ai/v1/realtime?model=openai/whisper-large-v3&input_audio_format=pcm_s16le_16000"
3751+
3752+ headers = {
3753+ "Authorization": f"Bearer {api_key}"
3754+ }
3755+
3756+ async with websockets.connect(url, additional_headers=headers) as ws:
3757+ # Read audio file
3758+ with open("audio.wav", "rb") as f:
3759+ audio_data = f.read()
3760+
3761+ # Send audio in chunks with delay to simulate real-time
3762+ chunk_size = 8192
3763+ bytes_per_second = 16000 * 2 # 16kHz * 2 bytes (16-bit)
3764+ delay_per_chunk = chunk_size / bytes_per_second
3765+
3766+ for i in range(0, len(audio_data), chunk_size):
3767+ chunk = audio_data[i:i+chunk_size]
3768+ base64_chunk = base64.b64encode(chunk).decode('utf-8')
3769+ await ws.send(json.dumps({
3770+ "type": "input_audio_buffer.append",
3771+ "audio": base64_chunk
3772+ }))
3773+ # Simulate real-time streaming
3774+ if i + chunk_size < len(audio_data):
3775+ await asyncio.sleep(delay_per_chunk)
3776+
3777+ # Commit the audio buffer
3778+ await ws.send(json.dumps({
3779+ "type": "input_audio_buffer.commit"
3780+ }))
3781+
3782+ # Receive transcription results
3783+ async for message in ws:
3784+ data = json.loads(message)
3785+ if data["type"] == "conversation.item.input_audio_transcription.delta":
3786+ print(f"Partial: {data['delta']}")
3787+ elif data["type"] == "conversation.item.input_audio_transcription.completed":
3788+ print(f"Final: {data['transcript']}")
3789+ break
3790+ elif data["type"] == "conversation.item.input_audio_transcription.failed":
3791+ error = data.get("error", {})
3792+ print(f"Error: {error.get('message')}")
3793+ break
3794+
3795+ asyncio.run(transcribe_audio())
3796+ - lang : JavaScript
3797+ label : Node.js WebSocket Client
3798+ source : |
3799+ import WebSocket from 'ws';
3800+ import fs from 'fs';
3801+
3802+ const apiKey = process.env.TOGETHER_API_KEY;
3803+ const url = 'wss://api.together.ai/v1/realtime?model=openai/whisper-large-v3&input_audio_format=pcm_s16le_16000';
3804+
3805+ const ws = new WebSocket(url, {
3806+ headers: {
3807+ 'Authorization': `Bearer ${apiKey}`
3808+ }
3809+ });
3810+
3811+ ws.on('open', async () => {
3812+ console.log('WebSocket connection established!');
3813+
3814+ // Read audio file
3815+ const audioData = fs.readFileSync('audio.wav');
3816+
3817+ // Send audio in chunks with delay to simulate real-time
3818+ const chunkSize = 8192;
3819+ const bytesPerSecond = 16000 * 2; // 16kHz * 2 bytes (16-bit)
3820+ const delayPerChunk = (chunkSize / bytesPerSecond) * 1000; // Convert to ms
3821+
3822+ for (let i = 0; i < audioData.length; i += chunkSize) {
3823+ const chunk = audioData.slice(i, i + chunkSize);
3824+ const base64Chunk = chunk.toString('base64');
3825+ ws.send(JSON.stringify({
3826+ type: 'input_audio_buffer.append',
3827+ audio: base64Chunk
3828+ }));
3829+
3830+ // Simulate real-time streaming
3831+ if (i + chunkSize < audioData.length) {
3832+ await new Promise(resolve => setTimeout(resolve, delayPerChunk));
3833+ }
3834+ }
3835+
3836+ // Commit audio buffer
3837+ ws.send(JSON.stringify({
3838+ type: 'input_audio_buffer.commit'
3839+ }));
3840+ });
3841+
3842+ ws.on('message', (data) => {
3843+ const message = JSON.parse(data.toString());
3844+
3845+ if (message.type === 'conversation.item.input_audio_transcription.delta') {
3846+ console.log(`Partial: ${message.delta}`);
3847+ } else if (message.type === 'conversation.item.input_audio_transcription.completed') {
3848+ console.log(`Final: ${message.transcript}`);
3849+ ws.close();
3850+ } else if (message.type === 'conversation.item.input_audio_transcription.failed') {
3851+ console.error(`Error: ${message.error.message}`);
3852+ ws.close();
3853+ }
3854+ });
3855+
3856+ ws.on('error', (error) => {
3857+ console.error('WebSocket error:', error);
3858+ });
3859+ parameters :
3860+ - in : query
3861+ name : model
3862+ required : true
3863+ schema :
3864+ type : string
3865+ enum :
3866+ - openai/whisper-large-v3
3867+ default : openai/whisper-large-v3
3868+ description : The Whisper model to use for transcription
3869+ - in : query
3870+ name : input_audio_format
3871+ required : true
3872+ schema :
3873+ type : string
3874+ enum :
3875+ - pcm_s16le_16000
3876+ default : pcm_s16le_16000
3877+ description : Audio format specification. Currently supports 16-bit PCM at 16kHz sample rate.
3878+ responses :
3879+ '101' :
3880+ description : |
3881+ Switching Protocols - WebSocket connection established successfully.
3882+
3883+ Error message format:
3884+ ```json
3885+ {
3886+ "type": "conversation.item.input_audio_transcription.failed",
3887+ "error": {
3888+ "message": "Error description",
3889+ "type": "invalid_request_error",
3890+ "param": null,
3891+ "code": "error_code"
3892+ }
3893+ }
3894+ ```
3895+
36643896components :
36653897 securitySchemes :
36663898 bearerAuth :