@@ -857,6 +857,80 @@ paths:
857857 application/json :
858858 schema :
859859 $ref : ' #/components/schemas/ErrorData'
# POST /audio/transcriptions
# Accepts a multipart/form-data body described by AudioTranscriptionRequest.
# 200 returns AudioTranscriptionResponse; 400, 401 and 429 return ErrorData.
/audio/transcriptions:
  post:
    tags: ['Audio', 'Transcribe']
    summary: Create audio transcription request
    description: Transcribes audio into text
    operationId: audio-transcriptions
    requestBody:
      required: true
      content:
        multipart/form-data:
          schema:
            # Local JSON pointer: must start with '#/' with no padding.
            $ref: '#/components/schemas/AudioTranscriptionRequest'
    responses:
      # Status codes are quoted so they stay strings, without padding spaces.
      '200':
        description: 'OK'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/AudioTranscriptionResponse'
      '400':
        description: 'BadRequest'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
      '401':
        description: 'Unauthorized'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
      '429':
        description: 'RateLimit'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
# POST /audio/translations
# Accepts a multipart/form-data body described by AudioTranslationRequest.
# 200 returns AudioTranslationResponse; 400, 401 and 429 return ErrorData.
/audio/translations:
  post:
    tags: ['Audio', 'Translate']
    summary: Create audio translation request
    description: Translates audio into English
    operationId: audio-translations
    requestBody:
      required: true
      content:
        multipart/form-data:
          schema:
            # Local JSON pointer: must start with '#/' with no padding.
            $ref: '#/components/schemas/AudioTranslationRequest'
    responses:
      # Status codes are quoted so they stay strings, without padding spaces.
      '200':
        description: 'OK'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/AudioTranslationResponse'
      '400':
        description: 'BadRequest'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
      '401':
        description: 'Unauthorized'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
      '429':
        description: 'RateLimit'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
860934 /endpoints :
861935 get :
862936 tags : ['Endpoints']
@@ -2310,6 +2384,271 @@ components:
23102384 default : false
23112385 description : ' If true, output is streamed for several characters at a time instead of waiting for the full response. The stream terminates with `data: [DONE]`. If false, return the encoded audio as octet stream'
23122386
# Form fields accepted by the audio transcription endpoint.
# Only `file` is required; all other fields are optional.
AudioTranscriptionRequest:
  type: object
  required: [file]
  properties:
    # NOTE(review): both oneOf branches are `type: string`, so a validator
    # that does not assert `format` may match both - confirm oneOf vs anyOf.
    file:
      description: 'Audio file upload or public HTTP/HTTPS URL. Supported formats .wav, .mp3, .m4a, .webm, .flac.'
      oneOf:
        - description: 'Audio file to transcribe'
          type: string
          format: binary
        - description: 'Public HTTP/HTTPS URL to audio file'
          type: string
          format: uri
    model:
      description: 'Model to use for transcription'
      type: string
      enum: ['openai/whisper-large-v3']
      default: 'openai/whisper-large-v3'
    language:
      description: 'Optional ISO 639-1 language code. If `auto` is provided, language is auto-detected.'
      type: string
      default: 'en'
      example: 'en'
    prompt:
      description: 'Optional text to bias decoding.'
      type: string
    response_format:
      description: 'The format of the response'
      type: string
      enum: ['json', 'verbose_json']
      default: 'json'
    temperature:
      description: 'Sampling temperature between 0.0 and 1.0'
      type: number
      format: float
      minimum: 0.0
      maximum: 1.0
      default: 0.0
    # A single string value (not a list), restricted to the enum below.
    timestamp_granularities:
      description: 'Controls level of timestamp detail in verbose_json. Only used when response_format is verbose_json.'
      type: string
      enum: ['segment', 'word']
      default: 'segment'
2436+
# Body of a successful transcription: the plain `json` shape or the
# `verbose_json` shape.
# NOTE(review): the verbose schema also requires `text`, and the plain schema
# does not visibly forbid extra properties, so a verbose payload may satisfy
# both branches - confirm whether oneOf or anyOf is intended.
AudioTranscriptionResponse:
  oneOf:
    - $ref: '#/components/schemas/AudioTranscriptionJsonResponse'
    - $ref: '#/components/schemas/AudioTranscriptionVerboseJsonResponse'
2441+
# Minimal transcription result: an object whose only declared (and required)
# property is `text`.
AudioTranscriptionJsonResponse:
  type: object
  required: [text]
  properties:
    text:
      description: 'The transcribed text'
      type: string
      example: 'Hello, world!'
2451+
# Detailed transcription result. `task`, `language`, `duration`, `text` and
# `segments` are required; `words` is optional.
AudioTranscriptionVerboseJsonResponse:
  type: object
  required:
    - task
    - language
    - duration
    - text
    - segments
  properties:
    task:
      type: string
      description: The task performed
      enum:
        - transcribe
        - translate
      example: transcribe
    language:
      type: string
      description: The language of the audio
      example: english
    duration:
      type: number
      format: float
      description: The duration of the audio in seconds
      example: 3.5
    text:
      type: string
      description: The transcribed text
      example: 'Hello, world!'
    segments:
      type: array
      items:
        $ref: '#/components/schemas/AudioTranscriptionSegment'
      description: Array of transcription segments
    # The request declares timestamp_granularities as a single string enum,
    # so the description says "is `word`" rather than "includes".
    words:
      type: array
      items:
        $ref: '#/components/schemas/AudioTranscriptionWord'
      description: 'Array of transcription words (only when timestamp_granularities is `word`)'
2491+
# One timed segment of a verbose result. Every declared property is required.
AudioTranscriptionSegment:
  type: object
  required: [id, start, end, text, tokens]
  properties:
    id:
      description: 'Unique identifier for the segment'
      type: integer
      example: 0
    start:
      description: 'Start time of the segment in seconds'
      type: number
      format: float
      example: 0.0
    end:
      description: 'End time of the segment in seconds'
      type: number
      format: float
      example: 3.5
    text:
      description: 'The text content of the segment'
      type: string
      example: 'Hello, world!'
    tokens:
      description: 'Array of token IDs for the segment'
      type: array
      items:
        type: integer
2524+
# One timed word of a verbose result. Every declared property is required.
AudioTranscriptionWord:
  type: object
  required: [word, start, end]
  properties:
    word:
      description: 'The word'
      type: string
      example: 'Hello'
    start:
      description: 'Start time of the word in seconds'
      type: number
      format: float
      example: 0.0
    end:
      description: 'End time of the word in seconds'
      type: number
      format: float
      example: 0.5
2546+
# Form fields accepted by the audio translation endpoint.
# Only `file` is required; all other fields are optional.
AudioTranslationRequest:
  type: object
  required: [file]
  properties:
    # NOTE(review): both oneOf branches are `type: string`, so a validator
    # that does not assert `format` may match both - confirm oneOf vs anyOf.
    file:
      description: 'Audio file upload or public HTTP/HTTPS URL. Supported formats .wav, .mp3, .m4a, .webm, .flac.'
      oneOf:
        - description: 'Audio file to translate'
          type: string
          format: binary
        - description: 'Public HTTP/HTTPS URL to audio file'
          type: string
          format: uri
    model:
      description: 'Model to use for translation'
      type: string
      enum: ['openai/whisper-large-v3']
      default: 'openai/whisper-large-v3'
    language:
      description: 'Target output language. Optional ISO 639-1 language code. If omitted, language is set to English.'
      type: string
      default: 'en'
      example: 'en'
    prompt:
      description: 'Optional text to bias decoding.'
      type: string
    response_format:
      description: 'The format of the response'
      type: string
      enum: ['json', 'verbose_json']
      default: 'json'
    temperature:
      description: 'Sampling temperature between 0.0 and 1.0'
      type: number
      format: float
      minimum: 0.0
      maximum: 1.0
      default: 0.0
    # A single string value (not a list), restricted to the enum below.
    timestamp_granularities:
      description: 'Controls level of timestamp detail in verbose_json. Only used when response_format is verbose_json.'
      type: string
      enum: ['segment', 'word']
      default: 'segment'
2596+
# Body of a successful translation: the plain `json` shape or the
# `verbose_json` shape.
# NOTE(review): the verbose schema also requires `text`, and the plain schema
# does not visibly forbid extra properties, so a verbose payload may satisfy
# both branches - confirm whether oneOf or anyOf is intended.
AudioTranslationResponse:
  oneOf:
    - $ref: '#/components/schemas/AudioTranslationJsonResponse'
    - $ref: '#/components/schemas/AudioTranslationVerboseJsonResponse'
2601+
# Minimal translation result: an object whose only declared (and required)
# property is `text`.
AudioTranslationJsonResponse:
  type: object
  required: [text]
  properties:
    text:
      description: 'The translated text'
      type: string
      example: 'Hello, world!'
2611+
# Detailed translation result. `task`, `language`, `duration`, `text` and
# `segments` are required; `words` is optional. Segment and word items reuse
# the AudioTranscriptionSegment / AudioTranscriptionWord schemas.
AudioTranslationVerboseJsonResponse:
  type: object
  required:
    - task
    - language
    - duration
    - text
    - segments
  properties:
    task:
      type: string
      description: The task performed
      enum:
        - transcribe
        - translate
      example: translate
    language:
      type: string
      description: The target language of the translation
      example: english
    duration:
      type: number
      format: float
      description: The duration of the audio in seconds
      example: 3.5
    text:
      type: string
      description: The translated text
      example: 'Hello, world!'
    segments:
      type: array
      items:
        $ref: '#/components/schemas/AudioTranscriptionSegment'
      description: Array of translation segments
    # The request declares timestamp_granularities as a single string enum,
    # so the description says "is `word`" rather than "includes".
    words:
      type: array
      items:
        $ref: '#/components/schemas/AudioTranscriptionWord'
      description: 'Array of translation words (only when timestamp_granularities is `word`)'
2651+
23132652 AudioSpeechStreamResponse :
23142653 oneOf :
23152654 - $ref : ' #/components/schemas/AudioSpeechStreamEvent'
0 commit comments