Skip to content

Commit 353bd62

Browse files
Merge pull request #110 from togethercomputer/transcriptions-support
Adding OpenAPI support for Transcriptions and Translations
2 parents 89fc7f9 + 20cc9a2 commit 353bd62

1 file changed

Lines changed: 339 additions & 0 deletions

File tree

openapi.yaml

Lines changed: 339 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -857,6 +857,80 @@ paths:
857857
application/json:
858858
schema:
859859
$ref: '#/components/schemas/ErrorData'
860+
/audio/transcriptions:
  # Speech-to-text endpoint: takes a multipart form upload and answers with JSON.
  post:
    operationId: audio-transcriptions
    summary: Create audio transcription request
    description: Transcribes audio into text
    tags:
      - Audio
      - Transcribe
    requestBody:
      required: true
      content:
        multipart/form-data:
          schema:
            $ref: "#/components/schemas/AudioTranscriptionRequest"
    responses:
      # Success body; each error status below reuses the shared ErrorData schema.
      "200":
        description: OK
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/AudioTranscriptionResponse"
      "400":
        description: BadRequest
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/ErrorData"
      "401":
        description: Unauthorized
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/ErrorData"
      "429":
        description: RateLimit
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/ErrorData"
897+
/audio/translations:
  # Speech translation endpoint: takes a multipart form upload and answers with JSON.
  # NOTE(review): the description below says the output is English, while
  # AudioTranslationRequest.language is documented as a "target output language";
  # confirm which one reflects the actual behaviour.
  post:
    operationId: audio-translations
    summary: Create audio translation request
    description: Translates audio into English
    tags:
      - Audio
      - Translate
    requestBody:
      required: true
      content:
        multipart/form-data:
          schema:
            $ref: "#/components/schemas/AudioTranslationRequest"
    responses:
      # Success body; each error status below reuses the shared ErrorData schema.
      "200":
        description: OK
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/AudioTranslationResponse"
      "400":
        description: BadRequest
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/ErrorData"
      "401":
        description: Unauthorized
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/ErrorData"
      "429":
        description: RateLimit
        content:
          application/json:
            schema:
              $ref: "#/components/schemas/ErrorData"
860934
/endpoints:
861935
get:
862936
tags: ['Endpoints']
@@ -2310,6 +2384,271 @@ components:
23102384
default: false
23112385
description: 'If true, output is streamed for several characters at a time instead of waiting for the full response. The stream terminates with `data: [DONE]`. If false, return the encoded audio as octet stream'
23122386

2387+
AudioTranscriptionRequest:
  # Form fields of the multipart/form-data body sent to POST /audio/transcriptions.
  # Only `file` is required; every other field has a default or is optional.
  type: object
  required:
    - file
  properties:
    file:
      # anyOf rather than oneOf: both branches are `type: string` and differ
      # only by `format`, so a URL string satisfies both of them. oneOf demands
      # exactly one match and would therefore reject such input.
      anyOf:
        - type: string
          format: binary
          description: Audio file to transcribe
        - type: string
          format: uri
          description: Public HTTP/HTTPS URL to audio file
      description: Audio file upload or public HTTP/HTTPS URL. Supported formats .wav, .mp3, .m4a, .webm, .flac.
    model:
      type: string
      description: Model to use for transcription
      default: openai/whisper-large-v3
      enum:
        - openai/whisper-large-v3
    language:
      type: string
      description: Optional ISO 639-1 language code. If `auto` is provided, language is auto-detected.
      default: en
      example: en
    prompt:
      type: string
      description: Optional text to bias decoding.
    response_format:
      # Selects between the plain and the verbose response schema.
      type: string
      description: The format of the response
      default: json
      enum:
        - json
        - verbose_json
    temperature:
      type: number
      format: float
      description: Sampling temperature between 0.0 and 1.0
      default: 0.0
      minimum: 0.0
      maximum: 1.0
    timestamp_granularities:
      type: string
      description: Controls level of timestamp detail in verbose_json. Only used when response_format is verbose_json.
      default: segment
      enum:
        - segment
        - word
2436+
2437+
AudioTranscriptionResponse:
  # Either the plain or the verbose transcription payload.
  # anyOf rather than oneOf: AudioTranscriptionJsonResponse only requires `text`
  # and does not forbid extra properties, so every verbose payload also matches
  # it. oneOf ("exactly one") would reject all verbose responses.
  anyOf:
    - $ref: '#/components/schemas/AudioTranscriptionJsonResponse'
    - $ref: '#/components/schemas/AudioTranscriptionVerboseJsonResponse'
2441+
2442+
AudioTranscriptionJsonResponse:
  # Minimal transcription payload: just the recognized text.
  type: object
  properties:
    text:
      description: The transcribed text
      type: string
      example: 'Hello, world!'
  required: [text]
2451+
2452+
AudioTranscriptionVerboseJsonResponse:
  # Detailed transcription payload: text plus task, language, duration and timing data.
  type: object
  properties:
    task:
      description: The task performed
      type: string
      enum: [transcribe, translate]
      example: transcribe
    language:
      description: The language of the audio
      type: string
      example: english
    duration:
      description: The duration of the audio in seconds
      type: number
      format: float
      example: 3.5
    text:
      description: The transcribed text
      type: string
      example: 'Hello, world!'
    segments:
      description: Array of transcription segments
      type: array
      items:
        $ref: "#/components/schemas/AudioTranscriptionSegment"
    words:
      # Not listed under `required`, so it may be absent.
      # NOTE(review): "includes" suggests a list, but the request schema declares
      # timestamp_granularities as a single string enum; confirm the wording.
      description: Array of transcription words (only when timestamp_granularities includes 'word')
      type: array
      items:
        $ref: "#/components/schemas/AudioTranscriptionWord"
  required: [task, language, duration, text, segments]
2491+
2492+
AudioTranscriptionSegment:
  # One timed span of text, with its start/end offsets in seconds and its token IDs.
  type: object
  properties:
    id:
      description: Unique identifier for the segment
      type: integer
      example: 0
    start:
      description: Start time of the segment in seconds
      type: number
      format: float
      example: 0.0
    end:
      description: End time of the segment in seconds
      type: number
      format: float
      example: 3.5
    text:
      description: The text content of the segment
      type: string
      example: 'Hello, world!'
    tokens:
      description: Array of token IDs for the segment
      type: array
      items:
        type: integer
  required: [id, start, end, text, tokens]
2524+
2525+
AudioTranscriptionWord:
  # A single word with its start/end offsets in seconds.
  type: object
  properties:
    word:
      description: The word
      type: string
      example: Hello
    start:
      description: Start time of the word in seconds
      type: number
      format: float
      example: 0.0
    end:
      description: End time of the word in seconds
      type: number
      format: float
      example: 0.5
  required: [word, start, end]
2546+
2547+
AudioTranslationRequest:
  # Form fields of the multipart/form-data body sent to POST /audio/translations.
  # Only `file` is required; every other field has a default or is optional.
  type: object
  required:
    - file
  properties:
    file:
      # anyOf rather than oneOf: both branches are `type: string` and differ
      # only by `format`, so a URL string satisfies both of them. oneOf demands
      # exactly one match and would therefore reject such input.
      anyOf:
        - type: string
          format: binary
          description: Audio file to translate
        - type: string
          format: uri
          description: Public HTTP/HTTPS URL to audio file
      description: Audio file upload or public HTTP/HTTPS URL. Supported formats .wav, .mp3, .m4a, .webm, .flac.
    model:
      type: string
      description: Model to use for translation
      default: openai/whisper-large-v3
      enum:
        - openai/whisper-large-v3
    language:
      type: string
      description: Target output language. Optional ISO 639-1 language code. If omitted, language is set to English.
      default: en
      example: en
    prompt:
      type: string
      description: Optional text to bias decoding.
    response_format:
      # Selects between the plain and the verbose response schema.
      type: string
      description: The format of the response
      default: json
      enum:
        - json
        - verbose_json
    temperature:
      type: number
      format: float
      description: Sampling temperature between 0.0 and 1.0
      default: 0.0
      minimum: 0.0
      maximum: 1.0
    timestamp_granularities:
      type: string
      description: Controls level of timestamp detail in verbose_json. Only used when response_format is verbose_json.
      default: segment
      enum:
        - segment
        - word
2596+
2597+
AudioTranslationResponse:
  # Either the plain or the verbose translation payload.
  # anyOf rather than oneOf: AudioTranslationJsonResponse only requires `text`
  # and does not forbid extra properties, so every verbose payload also matches
  # it. oneOf ("exactly one") would reject all verbose responses.
  anyOf:
    - $ref: '#/components/schemas/AudioTranslationJsonResponse'
    - $ref: '#/components/schemas/AudioTranslationVerboseJsonResponse'
2601+
2602+
AudioTranslationJsonResponse:
  # Minimal translation payload: just the translated text.
  type: object
  properties:
    text:
      description: The translated text
      type: string
      example: 'Hello, world!'
  required: [text]
2611+
2612+
AudioTranslationVerboseJsonResponse:
  # Detailed translation payload: text plus task, language, duration and timing data.
  # Segment and word items reuse the AudioTranscription* item schemas.
  type: object
  properties:
    task:
      description: The task performed
      type: string
      enum: [transcribe, translate]
      example: translate
    language:
      description: The target language of the translation
      type: string
      example: english
    duration:
      description: The duration of the audio in seconds
      type: number
      format: float
      example: 3.5
    text:
      description: The translated text
      type: string
      example: 'Hello, world!'
    segments:
      description: Array of translation segments
      type: array
      items:
        $ref: "#/components/schemas/AudioTranscriptionSegment"
    words:
      # Not listed under `required`, so it may be absent.
      # NOTE(review): "includes" suggests a list, but the request schema declares
      # timestamp_granularities as a single string enum; confirm the wording.
      description: Array of translation words (only when timestamp_granularities includes 'word')
      type: array
      items:
        $ref: "#/components/schemas/AudioTranscriptionWord"
  required: [task, language, duration, text, segments]
2651+
23132652
AudioSpeechStreamResponse:
23142653
oneOf:
23152654
- $ref: '#/components/schemas/AudioSpeechStreamEvent'

0 commit comments

Comments
 (0)