@@ -857,6 +857,80 @@ paths:
857857 application/json :
858858 schema :
859859 $ref : ' #/components/schemas/ErrorData'
# POST /audio/transcriptions: takes a multipart/form-data body described by
# AudioTranscriptionRequest and answers 200 with AudioTranscriptionResponse.
# The 400, 401 and 429 responses all carry the shared ErrorData schema.
/audio/transcriptions:
  post:
    tags: ['Audio', 'Transcribe']
    summary: Create audio transcription request
    description: Transcribes audio into text
    operationId: audio-transcriptions
    requestBody:
      required: true
      content:
        multipart/form-data:
          schema:
            $ref: '#/components/schemas/AudioTranscriptionRequest'
    responses:
      # Status codes are quoted so they remain string keys, not integers.
      '200':
        description: 'OK'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/AudioTranscriptionResponse'
      '400':
        description: 'BadRequest'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
      '401':
        description: 'Unauthorized'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
      '429':
        description: 'RateLimit'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
# POST /audio/translations: takes a multipart/form-data body described by
# AudioTranslationRequest and answers 200 with AudioTranslationResponse.
# The 400, 401 and 429 responses all carry the shared ErrorData schema.
/audio/translations:
  post:
    tags: ['Audio', 'Translate']
    summary: Create audio translation request
    description: Translates audio into English
    operationId: audio-translations
    requestBody:
      required: true
      content:
        multipart/form-data:
          schema:
            $ref: '#/components/schemas/AudioTranslationRequest'
    responses:
      # Status codes are quoted so they remain string keys, not integers.
      '200':
        description: 'OK'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/AudioTranslationResponse'
      '400':
        description: 'BadRequest'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
      '401':
        description: 'Unauthorized'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
      '429':
        description: 'RateLimit'
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ErrorData'
860934 /endpoints :
861935 get :
862936 tags : ['Endpoints']
@@ -2310,6 +2384,270 @@ components:
23102384 default : false
23112385 description : ' If true, output is streamed for several characters at a time instead of waiting for the full response. The stream terminates with `data: [DONE]`. If false, return the encoded audio as octet stream'
23122386
# Form fields of the transcription request body; only `file` is required.
AudioTranscriptionRequest:
  type: object
  required:
    - file
  properties:
    file:
      # Either an uploaded binary part or a URL given as a string.
      # NOTE(review): a URL is also a plain string, so a validator that does
      # not assert `format` may match both branches and fail `oneOf` --
      # confirm whether `anyOf` is the intended combinator.
      oneOf:
        - type: string
          format: binary
          description: Audio file to transcribe
        - type: string
          format: uri
          description: Public HTTP/HTTPS URL to audio file
      description: Audio file upload or public HTTP/HTTPS URL. Supported formats .wav, .mp3, .m4a, .webm, .flac.
    model:
      type: string
      description: Model to use for transcription
      default: openai/whisper-large-v3
      # Single-value enum: no other model is accepted by this schema.
      enum:
        - openai/whisper-large-v3
    language:
      type: string
      description: Optional ISO 639-1 language code. If omitted, language is auto-detected.
      example: en
    prompt:
      type: string
      description: Optional text to bias decoding.
    response_format:
      type: string
      description: The format of the response
      default: json
      enum:
        - json
        - verbose_json
    temperature:
      type: number
      format: float
      description: Sampling temperature between 0.0 and 1.0
      default: 0.0
      minimum: 0.0
      maximum: 1.0
    timestamp_granularities:
      # NOTE(review): declared as one string, while the verbose response
      # description says the value "includes 'word'" -- confirm whether a
      # list of granularities is meant to be accepted.
      type: string
      description: Controls level of timestamp detail in verbose_json. Only used when response_format is verbose_json.
      default: segment
      enum:
        - segment
        - word
2435+
# Body of a successful transcription: the plain or the verbose JSON shape.
# Presumably selected by the request's `response_format` -- TODO confirm.
# NOTE(review): a verbose payload also satisfies the plain schema (it has
# `text` and extra properties are not forbidden), so strict `oneOf`
# validation may reject it -- confirm the intended combinator.
AudioTranscriptionResponse:
  oneOf:
    - $ref: '#/components/schemas/AudioTranscriptionJsonResponse'
    - $ref: '#/components/schemas/AudioTranscriptionVerboseJsonResponse'
2440+
# Minimal transcription result: an object whose only declared (and required)
# property is the transcribed `text`.
AudioTranscriptionJsonResponse:
  type: object
  required:
    - text
  properties:
    text:
      type: string
      description: The transcribed text
      example: Hello, world!
2450+
# Detailed transcription result. `words` is the only property left out of
# `required`; every other declared property must be present.
AudioTranscriptionVerboseJsonResponse:
  type: object
  required:
    - task
    - language
    - duration
    - text
    - segments
  properties:
    task:
      type: string
      description: The task performed
      enum:
        - transcribe
        - translate
      example: transcribe
    language:
      type: string
      description: The language of the audio
      example: english
    duration:
      type: number
      format: float
      description: The duration of the audio in seconds
      example: 3.5
    text:
      type: string
      description: The transcribed text
      example: Hello, world!
    segments:
      type: array
      items:
        $ref: '#/components/schemas/AudioTranscriptionSegment'
      description: Array of transcription segments
    words:
      type: array
      items:
        $ref: '#/components/schemas/AudioTranscriptionWord'
      description: Array of transcription words (only when timestamp_granularities includes 'word')
2490+
# One timed segment of the output text. All five declared properties are
# required. Referenced by both the transcription and the translation verbose
# response schemas.
AudioTranscriptionSegment:
  type: object
  required:
    - id
    - start
    - end
    - text
    - tokens
  properties:
    id:
      type: integer
      description: Unique identifier for the segment
      example: 0
    start:
      type: number
      format: float
      description: Start time of the segment in seconds
      example: 0.0
    end:
      type: number
      format: float
      description: End time of the segment in seconds
      example: 3.5
    text:
      type: string
      description: The text content of the segment
      example: Hello, world!
    tokens:
      type: array
      items:
        type: integer
      description: Array of token IDs for the segment
2523+
# One word with its start and end time in seconds; all three properties are
# required. Referenced by both verbose response schemas for their `words` list.
AudioTranscriptionWord:
  type: object
  required:
    - word
    - start
    - end
  properties:
    word:
      type: string
      description: The word
      example: Hello
    start:
      type: number
      format: float
      description: Start time of the word in seconds
      example: 0.0
    end:
      type: number
      format: float
      description: End time of the word in seconds
      example: 0.5
2545+
# Form fields of the translation request body; only `file` is required.
# Mirrors the transcription request, except that `language` names the target
# output language and declares a default of `en`.
AudioTranslationRequest:
  type: object
  required:
    - file
  properties:
    file:
      # Either an uploaded binary part or a URL given as a string.
      # NOTE(review): a URL is also a plain string, so a validator that does
      # not assert `format` may match both branches and fail `oneOf` --
      # confirm whether `anyOf` is the intended combinator.
      oneOf:
        - type: string
          format: binary
          description: Audio file to translate
        - type: string
          format: uri
          description: Public HTTP/HTTPS URL to audio file
      description: Audio file upload or public HTTP/HTTPS URL. Supported formats .wav, .mp3, .m4a, .webm, .flac.
    model:
      type: string
      description: Model to use for translation
      default: openai/whisper-large-v3
      # Single-value enum: no other model is accepted by this schema.
      enum:
        - openai/whisper-large-v3
    language:
      type: string
      description: Target output language. Optional ISO 639-1 language code. If omitted, language is set to English.
      default: en
      example: en
    prompt:
      type: string
      description: Optional text to bias decoding.
    response_format:
      type: string
      description: The format of the response
      default: json
      enum:
        - json
        - verbose_json
    temperature:
      type: number
      format: float
      description: Sampling temperature between 0.0 and 1.0
      default: 0.0
      minimum: 0.0
      maximum: 1.0
    timestamp_granularities:
      # NOTE(review): declared as one string, while the verbose response
      # description says the value "includes 'word'" -- confirm whether a
      # list of granularities is meant to be accepted.
      type: string
      description: Controls level of timestamp detail in verbose_json. Only used when response_format is verbose_json.
      default: segment
      enum:
        - segment
        - word
2595+
# Body of a successful translation: the plain or the verbose JSON shape.
# Presumably selected by the request's `response_format` -- TODO confirm.
# NOTE(review): a verbose payload also satisfies the plain schema (it has
# `text` and extra properties are not forbidden), so strict `oneOf`
# validation may reject it -- confirm the intended combinator.
AudioTranslationResponse:
  oneOf:
    - $ref: '#/components/schemas/AudioTranslationJsonResponse'
    - $ref: '#/components/schemas/AudioTranslationVerboseJsonResponse'
2600+
# Minimal translation result: an object whose only declared (and required)
# property is the translated `text`.
AudioTranslationJsonResponse:
  type: object
  required:
    - text
  properties:
    text:
      type: string
      description: The translated text
      example: Hello, world!
2610+
# Detailed translation result. `words` is the only property left out of
# `required`. Segment and word items reuse the AudioTranscriptionSegment and
# AudioTranscriptionWord schemas instead of declaring translation-specific ones.
AudioTranslationVerboseJsonResponse:
  type: object
  required:
    - task
    - language
    - duration
    - text
    - segments
  properties:
    task:
      type: string
      description: The task performed
      enum:
        - transcribe
        - translate
      example: translate
    language:
      type: string
      description: The target language of the translation
      example: english
    duration:
      type: number
      format: float
      description: The duration of the audio in seconds
      example: 3.5
    text:
      type: string
      description: The translated text
      example: Hello, world!
    segments:
      type: array
      items:
        $ref: '#/components/schemas/AudioTranscriptionSegment'
      description: Array of translation segments
    words:
      type: array
      items:
        $ref: '#/components/schemas/AudioTranscriptionWord'
      description: Array of translation words (only when timestamp_granularities includes 'word')
2650+
23132651 AudioSpeechStreamResponse :
23142652 oneOf :
23152653 - $ref : ' #/components/schemas/AudioSpeechStreamEvent'
0 commit comments