Skip to content

Commit 49eceeb

Browse files
Adding OpenAPI support for Transcriptions and Translations
1 parent 89fc7f9 commit 49eceeb

1 file changed

Lines changed: 338 additions & 0 deletions

File tree

openapi.yaml

Lines changed: 338 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -857,6 +857,80 @@ paths:
857857
application/json:
858858
schema:
859859
$ref: '#/components/schemas/ErrorData'
860+
# Speech-to-text endpoint. The request is a multipart/form-data body described
# by AudioTranscriptionRequest; a successful call returns JSON described by
# AudioTranscriptionResponse. All listed error statuses share ErrorData.
/audio/transcriptions:
  post:
    tags: ['Audio', 'Transcribe']
    summary: Create audio transcription request
    description: Transcribes audio into text
    operationId: audio-transcriptions
    requestBody:
      required: true
      content:
        multipart/form-data:
          schema:
            $ref: '#/components/schemas/AudioTranscriptionRequest'
    responses:
      # Success: plain or verbose transcript, see AudioTranscriptionResponse.
      '200':
        description: 'OK'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/AudioTranscriptionResponse'
      # Error statuses below all return the shared ErrorData payload.
      '400':
        description: 'BadRequest'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
      '401':
        description: 'Unauthorized'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
      '429':
        description: 'RateLimit'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
897+
# Audio translation endpoint. The request is a multipart/form-data body
# described by AudioTranslationRequest; a successful call returns JSON
# described by AudioTranslationResponse. All listed error statuses share
# ErrorData.
/audio/translations:
  post:
    tags: ['Audio', 'Translate']
    summary: Create audio translation request
    description: Translates audio into English
    operationId: audio-translations
    requestBody:
      required: true
      content:
        multipart/form-data:
          schema:
            $ref: '#/components/schemas/AudioTranslationRequest'
    responses:
      # Success: plain or verbose translation, see AudioTranslationResponse.
      '200':
        description: 'OK'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/AudioTranslationResponse'
      # Error statuses below all return the shared ErrorData payload.
      '400':
        description: 'BadRequest'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
      '401':
        description: 'Unauthorized'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
      '429':
        description: 'RateLimit'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
860934
/endpoints:
861935
get:
862936
tags: ['Endpoints']
@@ -2310,6 +2384,270 @@ components:
23102384
default: false
23112385
description: 'If true, output is streamed for several characters at a time instead of waiting for the full response. The stream terminates with `data: [DONE]`. If false, return the encoded audio as octet stream'
23122386

2387+
# Form fields accepted by POST /audio/transcriptions. Only `file` is
# mandatory; every other field is optional and most declare a default.
AudioTranscriptionRequest:
  type: object
  required:
    - file
  properties:
    # NOTE(review): both oneOf branches are `type: string` and differ only by
    # `format`. Validators that treat `format` as an annotation may see a URL
    # value match both branches -- confirm oneOf (rather than anyOf) is
    # intended.
    file:
      oneOf:
        - type: string
          format: binary
          description: Audio file to transcribe
        - type: string
          format: uri
          description: Public HTTP/HTTPS URL to audio file
      description: >-
        Audio file upload or public HTTP/HTTPS URL.
        Supported formats .wav, .mp3, .m4a, .webm, .flac.
    # Single-value enum: only one model is selectable here.
    model:
      type: string
      description: Model to use for transcription
      default: openai/whisper-large-v3
      enum:
        - openai/whisper-large-v3
    language:
      type: string
      description: >-
        Optional ISO 639-1 language code.
        If omitted, language is auto-detected.
      example: en
    prompt:
      type: string
      description: Optional text to bias decoding.
    # Selects which variant of AudioTranscriptionResponse is returned.
    response_format:
      type: string
      description: The format of the response
      default: json
      enum:
        - json
        - verbose_json
    # Range is enforced by minimum/maximum, not just documented.
    temperature:
      type: number
      format: float
      description: Sampling temperature between 0.0 and 1.0
      default: 0.0
      minimum: 0.0
      maximum: 1.0
    # Declared as a single string, so one granularity per request.
    timestamp_granularities:
      type: string
      description: >-
        Controls level of timestamp detail in verbose_json.
        Only used when response_format is verbose_json.
      default: segment
      enum:
        - segment
        - word
2435+
2436+
# 200 response body of POST /audio/transcriptions: either the plain JSON
# transcript or the verbose variant.
#
# `anyOf` is used instead of `oneOf` because the two variants overlap:
# AudioTranscriptionJsonResponse only requires `text` and does not forbid
# additional properties, so every verbose payload also satisfies it. Under
# `oneOf` (exactly one match) a valid verbose response would be rejected by
# strict validators.
AudioTranscriptionResponse:
  anyOf:
    - $ref: '#/components/schemas/AudioTranscriptionJsonResponse'
    - $ref: '#/components/schemas/AudioTranscriptionVerboseJsonResponse'
2440+
2441+
# Minimal transcription result: an object whose only declared (and required)
# property is the transcript text.
AudioTranscriptionJsonResponse:
  type: object
  required:
    - text
  properties:
    text:
      type: string
      description: The transcribed text
      example: 'Hello, world!'
2450+
2451+
# Detailed transcription result. Every property except `words` is required.
AudioTranscriptionVerboseJsonResponse:
  type: object
  required:
    - task
    - language
    - duration
    - text
    - segments
  properties:
    task:
      type: string
      description: The task performed
      enum:
        - transcribe
        - translate
      example: transcribe
    language:
      type: string
      description: The language of the audio
      example: english
    duration:
      type: number
      format: float
      description: The duration of the audio in seconds
      example: 3.5
    text:
      type: string
      description: The transcribed text
      example: 'Hello, world!'
    segments:
      type: array
      items:
        $ref: '#/components/schemas/AudioTranscriptionSegment'
      description: Array of transcription segments
    # Optional (not listed under `required`).
    # NOTE(review): the description says "includes", but
    # AudioTranscriptionRequest declares timestamp_granularities as a single
    # string enum -- confirm which of the two is intended.
    words:
      type: array
      items:
        $ref: '#/components/schemas/AudioTranscriptionWord'
      description: >-
        Array of transcription words
        (only when timestamp_granularities includes 'word')
2490+
2491+
# One timed segment of a verbose result. All five properties are required.
# Referenced by both the transcription and the translation verbose responses.
AudioTranscriptionSegment:
  type: object
  required:
    - id
    - start
    - end
    - text
    - tokens
  properties:
    id:
      type: integer
      description: Unique identifier for the segment
      example: 0
    # start/end are offsets in seconds.
    start:
      type: number
      format: float
      description: Start time of the segment in seconds
      example: 0.0
    end:
      type: number
      format: float
      description: End time of the segment in seconds
      example: 3.5
    text:
      type: string
      description: The text content of the segment
      example: 'Hello, world!'
    tokens:
      type: array
      items:
        type: integer
      description: Array of token IDs for the segment
2523+
2524+
# One timed word of a verbose result. All three properties are required.
# Referenced by the `words` array of both verbose response schemas.
AudioTranscriptionWord:
  type: object
  required:
    - word
    - start
    - end
  properties:
    word:
      type: string
      description: The word
      example: Hello
    # start/end are offsets in seconds.
    start:
      type: number
      format: float
      description: Start time of the word in seconds
      example: 0.0
    end:
      type: number
      format: float
      description: End time of the word in seconds
      example: 0.5
2545+
2546+
# Form fields accepted by POST /audio/translations. Only `file` is mandatory;
# every other field is optional and most declare a default.
AudioTranslationRequest:
  type: object
  required:
    - file
  properties:
    # NOTE(review): both oneOf branches are `type: string` and differ only by
    # `format`. Validators that treat `format` as an annotation may see a URL
    # value match both branches -- confirm oneOf (rather than anyOf) is
    # intended.
    file:
      oneOf:
        - type: string
          format: binary
          description: Audio file to translate
        - type: string
          format: uri
          description: Public HTTP/HTTPS URL to audio file
      description: >-
        Audio file upload or public HTTP/HTTPS URL.
        Supported formats .wav, .mp3, .m4a, .webm, .flac.
    # Single-value enum: only one model is selectable here.
    model:
      type: string
      description: Model to use for translation
      default: openai/whisper-large-v3
      enum:
        - openai/whisper-large-v3
    language:
      type: string
      description: >-
        Target output language. Optional ISO 639-1 language code.
        If omitted, language is set to English.
      default: en
      example: en
    prompt:
      type: string
      description: Optional text to bias decoding.
    # Selects which variant of AudioTranslationResponse is returned.
    response_format:
      type: string
      description: The format of the response
      default: json
      enum:
        - json
        - verbose_json
    # Range is enforced by minimum/maximum, not just documented.
    temperature:
      type: number
      format: float
      description: Sampling temperature between 0.0 and 1.0
      default: 0.0
      minimum: 0.0
      maximum: 1.0
    # Declared as a single string, so one granularity per request.
    timestamp_granularities:
      type: string
      description: >-
        Controls level of timestamp detail in verbose_json.
        Only used when response_format is verbose_json.
      default: segment
      enum:
        - segment
        - word
2595+
2596+
# 200 response body of POST /audio/translations: either the plain JSON
# translation or the verbose variant.
#
# `anyOf` is used instead of `oneOf` because the two variants overlap:
# AudioTranslationJsonResponse only requires `text` and does not forbid
# additional properties, so every verbose payload also satisfies it. Under
# `oneOf` (exactly one match) a valid verbose response would be rejected by
# strict validators.
AudioTranslationResponse:
  anyOf:
    - $ref: '#/components/schemas/AudioTranslationJsonResponse'
    - $ref: '#/components/schemas/AudioTranslationVerboseJsonResponse'
2600+
2601+
# Minimal translation result: an object whose only declared (and required)
# property is the translated text.
AudioTranslationJsonResponse:
  type: object
  required:
    - text
  properties:
    text:
      type: string
      description: The translated text
      example: 'Hello, world!'
2610+
2611+
# Detailed translation result. Every property except `words` is required.
# Segment and word items reuse the AudioTranscription* item schemas.
AudioTranslationVerboseJsonResponse:
  type: object
  required:
    - task
    - language
    - duration
    - text
    - segments
  properties:
    task:
      type: string
      description: The task performed
      enum:
        - transcribe
        - translate
      example: translate
    language:
      type: string
      description: The target language of the translation
      example: english
    duration:
      type: number
      format: float
      description: The duration of the audio in seconds
      example: 3.5
    text:
      type: string
      description: The translated text
      example: 'Hello, world!'
    segments:
      type: array
      items:
        $ref: '#/components/schemas/AudioTranscriptionSegment'
      description: Array of translation segments
    # Optional (not listed under `required`).
    # NOTE(review): the description says "includes", but
    # AudioTranslationRequest declares timestamp_granularities as a single
    # string enum -- confirm which of the two is intended.
    words:
      type: array
      items:
        $ref: '#/components/schemas/AudioTranscriptionWord'
      description: >-
        Array of translation words
        (only when timestamp_granularities includes 'word')
2650+
23132651
AudioSpeechStreamResponse:
23142652
oneOf:
23152653
- $ref: '#/components/schemas/AudioSpeechStreamEvent'

0 commit comments

Comments
 (0)