Skip to content

Commit e9d0979

Browse files
Merge pull request #150 from togethercomputer/stt_spec
Add streaming STT spec
2 parents e7384a2 + 9a1af69 commit e9d0979

1 file changed

Lines changed: 233 additions & 0 deletions

File tree

openapi.yaml

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3661,6 +3661,239 @@ paths:
36613661
schema:
36623662
$ref: '#/components/schemas/ErrorData'
36633663

# Streaming speech-to-text (STT) over WebSocket. Although documented here as an
# HTTP GET path for OpenAPI tooling, the actual protocol is a WebSocket upgrade
# (wss://api.together.ai/v1/realtime); the '101' response below reflects that.
/realtime:
  get:
    tags: ['Audio']
    summary: Real-time audio transcription via WebSocket
    description: |
      Establishes a WebSocket connection for real-time audio transcription. This endpoint uses WebSocket protocol (wss://api.together.ai/v1/realtime) for bidirectional streaming communication.

      **Connection Setup:**
      - Protocol: WebSocket (wss://)
      - Authentication: Pass API key as Bearer token in Authorization header
      - Parameters: Sent as query parameters (model, input_audio_format)

      **Client Events:**
      - `input_audio_buffer.append`: Send audio chunks as base64-encoded data
        ```json
        {
          "type": "input_audio_buffer.append",
          "audio": "<base64_encoded_audio_chunk>"
        }
        ```
      - `input_audio_buffer.commit`: Signal end of audio stream
        ```json
        {
          "type": "input_audio_buffer.commit"
        }
        ```

      **Server Events:**
      - `session.created`: Initial session confirmation (sent first)
        ```json
        {
          "type": "session.created",
          "session": {
            "id": "session-id",
            "object": "realtime.session",
            "modalities": ["audio"],
            "model": "openai/whisper-large-v3"
          }
        }
        ```
      - `conversation.item.input_audio_transcription.delta`: Partial transcription results
        ```json
        {
          "type": "conversation.item.input_audio_transcription.delta",
          "delta": "The quick brown"
        }
        ```
      - `conversation.item.input_audio_transcription.completed`: Final transcription
        ```json
        {
          "type": "conversation.item.input_audio_transcription.completed",
          "transcript": "The quick brown fox jumps over the lazy dog"
        }
        ```
      - `conversation.item.input_audio_transcription.failed`: Error occurred
        ```json
        {
          "type": "conversation.item.input_audio_transcription.failed",
          "error": {
            "message": "Error description",
            "type": "invalid_request_error",
            "param": null,
            "code": "invalid_api_key"
          }
        }
        ```

      **Error Codes:**
      - `invalid_api_key`: Invalid API key provided (401)
      - `missing_api_key`: Authorization header missing (401)
      - `model_not_available`: Invalid or unavailable model (400)
      - Unsupported audio format errors (400)
    operationId: realtime-transcription
    x-codeSamples:
      - lang: Python
        label: Python WebSocket Client
        source: |
          import asyncio
          import websockets
          import json
          import base64
          import os

          async def transcribe_audio():
              api_key = os.environ.get("TOGETHER_API_KEY")
              url = "wss://api.together.ai/v1/realtime?model=openai/whisper-large-v3&input_audio_format=pcm_s16le_16000"

              headers = {
                  "Authorization": f"Bearer {api_key}"
              }

              async with websockets.connect(url, additional_headers=headers) as ws:
                  # Read audio file
                  with open("audio.wav", "rb") as f:
                      audio_data = f.read()

                  # Send audio in chunks with delay to simulate real-time
                  chunk_size = 8192
                  bytes_per_second = 16000 * 2  # 16kHz * 2 bytes (16-bit)
                  delay_per_chunk = chunk_size / bytes_per_second

                  for i in range(0, len(audio_data), chunk_size):
                      chunk = audio_data[i:i+chunk_size]
                      base64_chunk = base64.b64encode(chunk).decode('utf-8')
                      await ws.send(json.dumps({
                          "type": "input_audio_buffer.append",
                          "audio": base64_chunk
                      }))
                      # Simulate real-time streaming
                      if i + chunk_size < len(audio_data):
                          await asyncio.sleep(delay_per_chunk)

                  # Commit the audio buffer
                  await ws.send(json.dumps({
                      "type": "input_audio_buffer.commit"
                  }))

                  # Receive transcription results
                  async for message in ws:
                      data = json.loads(message)
                      if data["type"] == "conversation.item.input_audio_transcription.delta":
                          print(f"Partial: {data['delta']}")
                      elif data["type"] == "conversation.item.input_audio_transcription.completed":
                          print(f"Final: {data['transcript']}")
                          break
                      elif data["type"] == "conversation.item.input_audio_transcription.failed":
                          error = data.get("error", {})
                          print(f"Error: {error.get('message')}")
                          break

          asyncio.run(transcribe_audio())
      - lang: JavaScript
        label: Node.js WebSocket Client
        source: |
          import WebSocket from 'ws';
          import fs from 'fs';

          const apiKey = process.env.TOGETHER_API_KEY;
          const url = 'wss://api.together.ai/v1/realtime?model=openai/whisper-large-v3&input_audio_format=pcm_s16le_16000';

          const ws = new WebSocket(url, {
            headers: {
              'Authorization': `Bearer ${apiKey}`
            }
          });

          ws.on('open', async () => {
            console.log('WebSocket connection established!');

            // Read audio file
            const audioData = fs.readFileSync('audio.wav');

            // Send audio in chunks with delay to simulate real-time
            const chunkSize = 8192;
            const bytesPerSecond = 16000 * 2; // 16kHz * 2 bytes (16-bit)
            const delayPerChunk = (chunkSize / bytesPerSecond) * 1000; // Convert to ms

            for (let i = 0; i < audioData.length; i += chunkSize) {
              const chunk = audioData.slice(i, i + chunkSize);
              const base64Chunk = chunk.toString('base64');
              ws.send(JSON.stringify({
                type: 'input_audio_buffer.append',
                audio: base64Chunk
              }));

              // Simulate real-time streaming
              if (i + chunkSize < audioData.length) {
                await new Promise(resolve => setTimeout(resolve, delayPerChunk));
              }
            }

            // Commit audio buffer
            ws.send(JSON.stringify({
              type: 'input_audio_buffer.commit'
            }));
          });

          ws.on('message', (data) => {
            const message = JSON.parse(data.toString());

            if (message.type === 'conversation.item.input_audio_transcription.delta') {
              console.log(`Partial: ${message.delta}`);
            } else if (message.type === 'conversation.item.input_audio_transcription.completed') {
              console.log(`Final: ${message.transcript}`);
              ws.close();
            } else if (message.type === 'conversation.item.input_audio_transcription.failed') {
              const errorMessage = message.error?.message ?? message.message ?? 'Unknown error';
              console.error(`Error: ${errorMessage}`);
              ws.close();
            }
          });

          ws.on('error', (error) => {
            console.error('WebSocket error:', error);
          });
    parameters:
      # Both parameters are required query parameters on the upgrade request.
      - in: query
        name: model
        required: true
        schema:
          type: string
          enum:
            - openai/whisper-large-v3
          default: openai/whisper-large-v3
        description: The Whisper model to use for transcription
      - in: query
        name: input_audio_format
        required: true
        schema:
          type: string
          enum:
            - pcm_s16le_16000
          default: pcm_s16le_16000
        description: Audio format specification. Currently supports 16-bit PCM at 16kHz sample rate.
    responses:
      # WebSocket handshake success is an HTTP 101 Switching Protocols, not 200.
      '101':
        description: |
          Switching Protocols - WebSocket connection established successfully.

          Error message format:
          ```json
          {
            "type": "conversation.item.input_audio_transcription.failed",
            "error": {
              "message": "Error description",
              "type": "invalid_request_error",
              "param": null,
              "code": "error_code"
            }
          }
          ```
36643897
components:
36653898
securitySchemes:
36663899
bearerAuth:

0 commit comments

Comments
 (0)