diff --git a/samples/HttpSpeechSynthesizerUsage.java b/samples/HttpSpeechSynthesizerUsage.java new file mode 100644 index 0000000..4586a0e --- /dev/null +++ b/samples/HttpSpeechSynthesizerUsage.java @@ -0,0 +1,234 @@ +// Copyright (c) Alibaba, Inc. and its affiliates. + +import com.alibaba.dashscope.audio.http_tts.AudioInfo; +import com.alibaba.dashscope.audio.http_tts.HttpSpeechSynthesisParam; +import com.alibaba.dashscope.audio.http_tts.HttpSpeechSynthesisResult; +import com.alibaba.dashscope.audio.http_tts.HttpSpeechSynthesizer; +import com.alibaba.dashscope.common.ResultCallback; +import com.alibaba.dashscope.exception.ApiException; +import com.alibaba.dashscope.exception.InputRequiredException; +import com.alibaba.dashscope.exception.NoApiKeyException; +import com.alibaba.dashscope.utils.Constants; + +import java.io.FileOutputStream; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.util.concurrent.CountDownLatch; + +/** + * Example usage of HttpSpeechSynthesizer for HTTP SSE-based text-to-speech synthesis. + * + *

Make sure to set the DASHSCOPE_API_KEY environment variable before running this example. + * + * @author DashScope SDK Team + */ +public class HttpSpeechSynthesizerUsage { + + /** + * Demonstrates synchronous call with SSE - blocks until synthesis is complete and returns audio + * data. + */ + public static void syncCall() { + System.out.println("=== Synchronous Call with SSE Example ==="); + + // Create synthesizer + HttpSpeechSynthesizer synthesizer = new HttpSpeechSynthesizer(); + + // Build parameters + HttpSpeechSynthesisParam param = + HttpSpeechSynthesisParam.builder() + .model("cosyvoice-v3-flash") + .text("我家的后面有一个很大的园。") + .voice("longanyang") + .format("wav") + .sampleRate(24000) + .build(); + + try { + // Call and get complete audio data + ByteBuffer audioData = synthesizer.callAndReturnAudio(param); + + // Save to file + if (audioData != null && audioData.hasRemaining()) { + byte[] bytes = new byte[audioData.remaining()]; + audioData.get(bytes); + + try (FileOutputStream fos = new FileOutputStream("sync_output.wav")) { + fos.write(bytes); + System.out.println("Audio saved to sync_output.wav, size: " + bytes.length + " bytes"); + } catch (IOException e) { + System.err.println("Failed to save audio: " + e.getMessage()); + } + } + + } catch (ApiException | NoApiKeyException | InputRequiredException e) { + System.err.println("Synthesis failed: " + e.getMessage()); + } + } + + /** + * Demonstrates synchronous call without SSE - returns audio URL instead of audio data. This is a + * simpler and faster way to get the synthesized audio. 
+   */
+  public static void syncCallWithUrl() {
+    System.out.println("\n=== Synchronous Call without SSE (returns Audio URL) ===");
+
+    // Describe the request first, then create the client that will send it.
+    HttpSpeechSynthesisParam request =
+        HttpSpeechSynthesisParam.builder()
+            .model("cosyvoice-v3-flash")
+            .text("我家的后面有一个很大的园。")
+            .voice("longanyang")
+            .format("wav")
+            .sampleRate(24000)
+            .build();
+    HttpSpeechSynthesizer tts = new HttpSpeechSynthesizer();
+
+    try {
+      // Non-SSE call: the result carries an audio URL instead of audio bytes.
+      HttpSpeechSynthesisResult response = tts.call(request);
+      System.out.println("Request ID: " + response.getRequestId());
+      System.out.println("Finish Reason: " + response.getFinishReason());
+
+      // Nothing more to report when the response has no URL.
+      if (!response.hasAudioUrl()) {
+        return;
+      }
+
+      AudioInfo audio = response.getAudioInfo();
+      System.out.println("\nAudio URL: " + audio.getUrl());
+      System.out.println("Audio ID: " + audio.getId());
+      System.out.println("Expires At: " + audio.getExpiresAt());
+      System.out.println("Remaining Time: " + audio.getRemainingSeconds() + " seconds");
+      System.out.println("URL Expired: " + audio.isExpired());
+
+      // The file itself is not fetched here; any HTTP client (e.g. HttpURLConnection) can
+      // download it from the printed URL.
+      System.out.println("\nTip: You can download the audio file from the URL above.");
+    } catch (ApiException | NoApiKeyException | InputRequiredException e) {
+      System.err.println("Synthesis failed: " + e.getMessage());
+    }
+  }
+
+  /** Demonstrates streaming call with callback - receives audio chunks as they arrive. 
*/
+  public static void streamCallWithCallback() {
+    System.out.println("\n=== Streaming Call with Callback Example ===");
+
+    HttpSpeechSynthesizer synthesizer = new HttpSpeechSynthesizer();
+
+    HttpSpeechSynthesisParam param =
+        HttpSpeechSynthesisParam.builder()
+            .model("cosyvoice-v3-flash")
+            .text("今天天气真好,适合出去玩。")
+            .voice("longanyang")
+            .format("wav")
+            .sampleRate(24000)
+            .build();
+
+    // Released by onComplete/onError so this method can block until the stream has ended.
+    CountDownLatch latch = new CountDownLatch(1);
+
+    try {
+      synthesizer.streamCall(
+          param,
+          // The type argument is required: without it onEvent(HttpSpeechSynthesisResult) does
+          // not override the callback's method.
+          new ResultCallback<HttpSpeechSynthesisResult>() {
+            // Number of events that actually carried audio bytes.
+            private int chunkCount = 0;
+
+            @Override
+            public void onEvent(HttpSpeechSynthesisResult result) {
+              if (result.hasAudioData()) {
+                // Count only audio-bearing events so the numbering and the final total
+                // describe audio chunks rather than every event delivered.
+                chunkCount++;
+                System.out.println(
+                    "Received chunk #"
+                        + chunkCount
+                        + ", size: "
+                        + result.getAudioDataSize()
+                        + " bytes");
+              }
+              if (result.getRequestId() != null) {
+                System.out.println("Request ID: " + result.getRequestId());
+              }
+            }
+
+            @Override
+            public void onComplete() {
+              System.out.println("✓ Synthesis completed! Total chunks received: " + chunkCount);
+
+              // Get accumulated audio data
+              // NOTE(review): the synthesizer keeps this value per thread, so it may be null
+              // if the callback runs on a different thread than the one that stored it - confirm.
+              ByteBuffer audioData = synthesizer.getAccumulatedAudioData();
+              if (audioData != null) {
+                System.out.println("Total audio size: " + audioData.remaining() + " bytes");
+              }
+              latch.countDown();
+            }
+
+            @Override
+            public void onError(Exception e) {
+              System.err.println("✗ Error during synthesis: " + e.getMessage());
+              latch.countDown();
+            }
+          });
+
+      // Wait for completion
+      latch.await();
+      System.out.println("Done!");
+
+    } catch (InterruptedException e) {
+      // Restore the interrupt flag so code further up the stack can still observe it.
+      Thread.currentThread().interrupt();
+      System.err.println("Failed: " + e.getMessage());
+    } catch (ApiException | NoApiKeyException | InputRequiredException e) {
+      System.err.println("Failed: " + e.getMessage());
+    }
+  }
+
+  /** Demonstrates custom parameter settings. 
*/
+  public static void customParameters() {
+    System.out.println("\n=== Custom Parameters Example ===");
+
+    HttpSpeechSynthesizer synthesizer = new HttpSpeechSynthesizer();
+
+    // Build parameters with custom voice settings
+    HttpSpeechSynthesisParam param =
+        HttpSpeechSynthesisParam.builder()
+            .model("cosyvoice-v3-flash")
+            .text("这是一段测试语音合成参数的文本。")
+            .voice("longanyang")
+            .format("wav")
+            .sampleRate(24000)
+            .volume(80) // Volume: 0-100
+            .rate(1.2f) // Speech rate: 0.5-2.0
+            .pitch(1.1f) // Pitch: 0.5-2.0
+            .build();
+
+    // Echo the effective settings before sending the request.
+    System.out.println("Parameters:");
+    System.out.println(" Model: " + param.getModel());
+    System.out.println(" Text: " + param.getText());
+    System.out.println(" Voice: " + param.getVoice());
+    System.out.println(" Format: " + param.getFormat());
+    System.out.println(" Sample Rate: " + param.getSampleRate());
+    System.out.println(" Volume: " + param.getVolume());
+    System.out.println(" Rate: " + param.getRate());
+    System.out.println(" Pitch: " + param.getPitch());
+
+    try {
+      ByteBuffer audioData = synthesizer.callAndReturnAudio(param);
+      if (audioData != null) {
+        System.out.println(
+            "✓ Synthesis completed, audio size: " + audioData.remaining() + " bytes");
+      }
+    } catch (ApiException | NoApiKeyException | InputRequiredException e) {
+      System.err.println("Failed: " + e.getMessage());
+    }
+  }
+
+  /**
+   * Runs every example in sequence.
+   *
+   * @param args unused
+   */
+  public static void main(String[] args) {
+    // Only fall back to the placeholder when DASHSCOPE_API_KEY is not set. Assigning it
+    // unconditionally would contradict the class documentation, which tells users to export
+    // the environment variable.
+    // NOTE(review): assumes an explicit Constants.apiKey takes precedence over the
+    // environment variable inside the SDK - confirm against the SDK's key lookup.
+    String envKey = System.getenv("DASHSCOPE_API_KEY");
+    if (envKey == null || envKey.trim().isEmpty()) {
+      Constants.apiKey = "YOUR_API_KEY";
+    }
+    System.out.println("HttpSpeechSynthesizer Usage Examples\n");
+    System.out.println("====================================\n");
+
+    // Run examples
+    syncCall(); // SSE streaming - returns audio data
+    syncCallWithUrl(); // Non-SSE - returns audio URL
+    streamCallWithCallback();
+    customParameters();
+
+    System.out.println("\n====================================");
+    System.out.println("All examples completed!");
+  }
+}
diff --git a/samples/Qwen3OmniToolCallUsage.java b/samples/Qwen3OmniToolCallUsage.java
new file mode 100644
index 0000000..634eaec
--- /dev/null
+++ 
b/samples/Qwen3OmniToolCallUsage.java @@ -0,0 +1,365 @@ +import com.alibaba.dashscope.audio.omni.*; +import com.alibaba.dashscope.exception.NoApiKeyException; +import com.google.gson.Gson; +import com.google.gson.JsonObject; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.FileInputStream; +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.atomic.AtomicReference; + +/** + * Qwen3 Omni Tool Calling Support + * + * This example demonstrates: + * 1. Function calling (tool) support with weather and flight price queries + * 2. Using createItem to send tool call results back to the server + */ +public class Qwen3OmniToolCallUsage { + private static final Logger log = LoggerFactory.getLogger(Qwen3OmniToolCallUsage.class); + private static final int AUDIO_CHUNK_SIZE = 3200; // Audio chunk size in bytes (200ms at 16kHz) + private static final int SLEEP_INTERVAL_MS = 200; // Sleep interval to simulate real-time streaming + + // Store pending tool calls that need response + private static final Map pendingToolCalls = new ConcurrentHashMap<>(); + + public static void main(String[] args) throws InterruptedException { + // Build connection parameters + OmniRealtimeParam param = OmniRealtimeParam.builder() + .model("model-name") // Replace with your model + .apikey("your-api-key") + .url("wss://dashscope.aliyuncs.com/api-ws/v1/realtime") // Custom URL if needed + .build(); + + final AtomicReference responseTextRef = new AtomicReference<>(new StringBuilder()); + final CountDownLatch finishLatch = new CountDownLatch(1); + + // Create conversation with callback + OmniRealtimeConversation conversation = new OmniRealtimeConversation(param, new OmniRealtimeCallback() { + private long lastPackageTime = 0; + private boolean isFirstText = true; + private boolean isFirstAudio = true; + + @Override + public void onOpen() { + 
System.out.println("connection opened, ready to send audio"); + lastPackageTime = System.currentTimeMillis(); + } + + @Override + public void onEvent(JsonObject message) { + String type = message.get("type").getAsString(); + + switch (type) { + case "session.created": + System.out.println("start session: " + message.get("session").getAsJsonObject().get("id").getAsString()); + break; + + case "conversation.item.input_audio_transcription.completed": + System.out.println("question: " + message.get("transcript").getAsString()); + break; + + case "response.audio_transcript.delta": + case "response.text.delta": + if (isFirstText) { + isFirstText = false; + System.out.println("first text latency from vad end: " + (System.currentTimeMillis() - lastPackageTime) + " ms"); + } + String text = message.get("delta").getAsString(); + responseTextRef.get().append(text); + break; + + case "response.audio.delta": + if (isFirstAudio) { + isFirstAudio = false; + System.out.println("first audio latency from vad end: " + (System.currentTimeMillis() - lastPackageTime) + " ms"); + } + System.out.println("audio interval: " + (System.currentTimeMillis() - lastPackageTime) + " ms"); + lastPackageTime = System.currentTimeMillis(); + String recvAudioB64 = message.get("delta").getAsString(); + // Handle received audio - implement your own audio player here + // audioPlayer.write(recvAudioB64); + break; + + case "input_audio_buffer.speech_started": + System.out.println("======VAD Speech Start======"); + // Cancel audio playback when user starts speaking + // audioPlayer.cancelPlaying(); + break; + + case "input_audio_buffer.speech_stopped": + System.out.println("======VAD Speech End======"); + lastPackageTime = System.currentTimeMillis(); + isFirstText = true; + isFirstAudio = true; + pendingToolCalls.clear(); + break; + + case "response.function_call_arguments.done": + System.out.println("======TOOL CALL======"); + String toolCallId = message.get("call_id").getAsString(); + 
pendingToolCalls.put(toolCallId, message); + break; + + case "response.done": + System.out.println("======RESPONSE DONE======"); + System.out.println("all response text: " + responseTextRef.get()); + responseTextRef.set(new StringBuilder()); // Clear for next response + break; + + default: + break; + } + } + + @Override + public void onClose(int code, String reason) { + System.out.println("connection closed with code: " + code + ", reason: " + reason); + finishLatch.countDown(); + } + }); + + try { + conversation.connect(); + } catch (NoApiKeyException e) { + throw new RuntimeException(e); + } + + // Build tools definition + List> tools = buildTools(); + + // Configure session with tools and server VAD + Map extraParams = new HashMap<>(); + extraParams.put("tools", tools); + + OmniRealtimeConfig config = OmniRealtimeConfig.builder() + .modalities(Arrays.asList(OmniRealtimeModality.AUDIO, OmniRealtimeModality.TEXT)) + .voice("Ethan") // Voice name + .inputAudioFormat(OmniRealtimeAudioFormat.PCM_16000HZ_MONO_16BIT) + .outputAudioFormat(OmniRealtimeAudioFormat.PCM_24000HZ_MONO_16BIT) + .enableInputAudioTranscription(true) + .InputAudioTranscription("gummy-realtime-v1") // Transcription model + .enableTurnDetection(true) + .turnDetectionType("server_vad") + .parameters(extraParams) // Pass tools through extra parameters + .build(); + + conversation.updateSession(config); + + System.out.println("Press 'Ctrl+C' to stop conversation..."); + + // Main loop - read audio from file and send to server + // In a real application, you would read from microphone + String filePath = "./weather.wav"; + File audioFile = new File(filePath); + + if (!audioFile.exists()) { + log.error("Audio file not found: {}", filePath); + System.out.println("Waiting for interactive session. 
Press Ctrl+C to exit."); + // For demo purposes, just wait + finishLatch.await(); + return; + } + + try (FileInputStream audioInputStream = new FileInputStream(audioFile)) { + byte[] audioBuffer = new byte[AUDIO_CHUNK_SIZE]; + int bytesRead; + + log.info("Starting to send audio data from: {}", filePath); + + while ((bytesRead = audioInputStream.read(audioBuffer)) != -1) { + // Check and handle pending tool calls + boolean needResponse = handlePendingToolCalls(conversation); + + if (needResponse) { + System.out.println("*** create response after call tools"); + conversation.createResponse(null, Arrays.asList(OmniRealtimeModality.AUDIO, OmniRealtimeModality.TEXT)); + System.out.println("======TOOL CALL END======"); + } + + // Send audio data + String audioB64 = Base64.getEncoder().encodeToString(Arrays.copyOf(audioBuffer, bytesRead)); + conversation.appendAudio(audioB64); + + // Add small delay to simulate real-time audio streaming + Thread.sleep(SLEEP_INTERVAL_MS); + } + + log.info("Finished sending audio data."); + + } catch (Exception e) { + log.error("Error sending audio from file: {}", filePath, e); + } + //wait 5 seconds for demo response done + Thread.sleep(5 * 1000); + conversation.close(1000, "bye"); + finishLatch.await(); + System.exit(0); + } + + /** + * Build tool definitions in OpenAI format + */ + private static List> buildTools() { + List> tools = new ArrayList<>(); + + // Tool: get_current_weather + Map weatherTool = new HashMap<>(); + weatherTool.put("type", "function"); + Map weatherFunction = new HashMap<>(); + weatherFunction.put("name", "get_current_weather"); + weatherFunction.put("description", "当你想查询指定城市的天气时非常有用。"); + Map weatherParams = new HashMap<>(); + weatherParams.put("type", "object"); + Map locationProp = new HashMap<>(); + locationProp.put("type", "string"); + locationProp.put("description", "城市或县区,比如北京市、杭州市、余杭区等。"); + Map weatherProps = new HashMap<>(); + weatherProps.put("location", locationProp); + weatherParams.put("properties", 
weatherProps); + weatherParams.put("required", Collections.singletonList("location")); + weatherFunction.put("parameters", weatherParams); + weatherTool.put("function", weatherFunction); + tools.add(weatherTool); + + // Tool: get_flight_price + Map flightTool = new HashMap<>(); + flightTool.put("type", "function"); + Map flightFunction = new HashMap<>(); + flightFunction.put("name", "get_flight_price"); + flightFunction.put("description", "当你想查询飞机票价格时非常有用。"); + Map flightParams = new HashMap<>(); + flightParams.put("type", "object"); + Map srcProp = new HashMap<>(); + srcProp.put("type", "string"); + srcProp.put("description", "飞机起飞的城市,比如北京市、杭州市等。"); + Map dstProp = new HashMap<>(); + dstProp.put("type", "string"); + dstProp.put("description", "飞机降落的城市,比如北京市、杭州市区等。"); + Map flightProps = new HashMap<>(); + flightProps.put("src", srcProp); + flightProps.put("dst", dstProp); + flightParams.put("properties", flightProps); + flightParams.put("required", Arrays.asList("src", "dst")); + flightFunction.put("parameters", flightParams); + flightTool.put("function", flightFunction); + tools.add(flightTool); + + // Tool: get_train_price + Map trainTool = new HashMap<>(); + trainTool.put("type", "function"); + Map trainFunction = new HashMap<>(); + trainFunction.put("name", "get_train_price"); + trainFunction.put("description", "当你想查询火车票价格时非常有用。"); + Map trainParams = new HashMap<>(); + trainParams.put("type", "object"); + Map trainSrcProp = new HashMap<>(); + trainSrcProp.put("type", "string"); + trainSrcProp.put("description", "火车出发的城市,比如北京市、杭州市等。"); + Map trainDstProp = new HashMap<>(); + trainDstProp.put("type", "string"); + trainDstProp.put("description", "火车到达的城市,比如北京市、杭州市区等。"); + Map trainProps = new HashMap<>(); + trainProps.put("src", trainSrcProp); + trainProps.put("dst", trainDstProp); + trainParams.put("properties", trainProps); + trainParams.put("required", Arrays.asList("src", "dst")); + trainFunction.put("parameters", trainParams); + trainTool.put("function", 
trainFunction);
+        tools.add(trainTool);
+
+        return tools;
+    }
+
+    /**
+     * Runs every queued tool call locally and sends each result back to the server.
+     *
+     * <p>Entries are read from {@code pendingToolCalls}; each one is dropped from the map only
+     * after its result has been sent.
+     *
+     * @param conversation the conversation that receives the tool results
+     * @return true if at least one tool call was handled
+     */
+    private static boolean handlePendingToolCalls(OmniRealtimeConversation conversation) {
+        boolean needResponse = false;
+
+        // An explicitly typed iterator gives typed access to each entry and removes through the
+        // iterator itself instead of through the map while it is being traversed.
+        Iterator<Map.Entry<String, JsonObject>> pending = pendingToolCalls.entrySet().iterator();
+        while (pending.hasNext()) {
+            JsonObject toolCallResponse = pending.next().getValue();
+
+            // Process tool call
+            JsonObject result = handleToolCall(toolCallResponse);
+
+            // Send result back using createItem
+            sendToolCallResult(conversation, result);
+
+            needResponse = true;
+            pending.remove();
+        }
+
+        return needResponse;
+    }
+
+    /**
+     * Executes a single tool call and packages its output for the server.
+     *
+     * <p>Malformed or incomplete arguments are reported in the output text instead of being
+     * thrown, so one bad call cannot abort the caller's loop.
+     *
+     * @param toolCallResponse the tool-call event; its "name", "arguments" and "call_id" fields
+     *     are read
+     * @return an object holding the "call_id" and the tool "output"
+     */
+    private static JsonObject handleToolCall(JsonObject toolCallResponse) {
+        String functionName = toolCallResponse.get("name").getAsString();
+
+        String output;
+        try {
+            // "arguments" arrives as a JSON document encoded in a string, so it is parsed here.
+            JsonObject arguments = new Gson().fromJson(toolCallResponse.get("arguments").getAsString(), JsonObject.class);
+
+            System.out.println("[Tool Call] start handling tool call: name: " + functionName + ", args: " + arguments);
+
+            switch (functionName) {
+                case "get_current_weather":
+                    String location = arguments.get("location").getAsString();
+                    output = getCurrentWeather(location);
+                    break;
+                case "get_flight_price":
+                    String src = arguments.get("src").getAsString();
+                    String dst = arguments.get("dst").getAsString();
+                    output = getFlightPrice(src, dst);
+                    break;
+                case "get_train_price":
+                    String trainSrc = arguments.get("src").getAsString();
+                    String trainDst = arguments.get("dst").getAsString();
+                    output = getTrainPrice(trainSrc, trainDst);
+                    break;
+                default:
+                    output = "client没有找到这个工具,调用失败。";
+                    break;
+            }
+        } catch (RuntimeException e) {
+            // Covers unparsable JSON and missing or non-string arguments; both surface as
+            // unchecked exceptions. The failure is handed back as the tool output.
+            System.out.println("[Tool Call] invalid arguments for " + functionName + ": " + e);
+            output = "client调用工具时参数无效,调用失败。";
+        }
+
+        System.out.println("[Tool Call] tool call response: " + output);
+
+        // Build result object
+        JsonObject result = new JsonObject();
+        result.addProperty("call_id", toolCallResponse.get("call_id").getAsString());
+        result.addProperty("output", output);
+        return result;
+    }
+
+    /**
+     * Send tool 
call result back to server using createItem
+     *
+     * @param conversation the conversation the item is created on
+     * @param result object whose "call_id" and "output" string fields are copied into the item
+     */
+    private static void sendToolCallResult(OmniRealtimeConversation conversation, JsonObject result) {
+        JsonObject item = new JsonObject();
+        // Item ids are "item_" followed by a random UUID with the dashes removed.
+        item.addProperty("id", "item_" + UUID.randomUUID().toString().replace("-", ""));
+        item.addProperty("type", "function_call_output");
+        item.addProperty("call_id", result.get("call_id").getAsString());
+        item.addProperty("output", result.get("output").getAsString());
+
+        conversation.createItem(item);
+    }
+
+    // ===== Local tool implementations =====
+
+    /** Returns a canned weather report for the given location. */
+    private static String getCurrentWeather(String location) {
+        return location + "今天天气为霾转晴,气温4/-4℃,微风";
+    }
+
+    /** Returns a canned flight price range for the given route. */
+    private static String getFlightPrice(String src, String dst) {
+        return src + "到" + dst + "的机票价格为200~300美元。";
+    }
+
+    // NOTE(review): returns a fixed error string and ignores both arguments - presumably to
+    // demonstrate how a failing tool is reported to the model; confirm this is intended.
+    private static String getTrainPrice(String src, String dst) {
+        return "invalid apikey error";
+    }
+}
diff --git a/src/main/java/com/alibaba/dashscope/audio/http_tts/AudioInfo.java b/src/main/java/com/alibaba/dashscope/audio/http_tts/AudioInfo.java
new file mode 100644
index 0000000..f47867f
--- /dev/null
+++ b/src/main/java/com/alibaba/dashscope/audio/http_tts/AudioInfo.java
@@ -0,0 +1,63 @@
+// Copyright (c) Alibaba, Inc. and its affiliates.
+
+package com.alibaba.dashscope.audio.http_tts;
+
+import lombok.Data;
+import lombok.EqualsAndHashCode;
+
+/**
+ * Audio information containing URL and metadata for non-SSE synthesis results. When using non-SSE
+ * synchronous call, the audio is returned as a URL instead of binary data.
+ *
+ * @author DashScope SDK Team
+ */
+@Data
+@EqualsAndHashCode
+public class AudioInfo {
+
+  /** The audio URL for downloading the synthesized audio file. */
+  private String url;
+
+  /** The unique identifier for this audio file. */
+  private String id;
+
+  /** The expiration timestamp (Unix timestamp in seconds) for the URL. */
+  private Long expiresAt;
+
+  /** The audio data in base64 format (if available). */
+  private String data;
+
+  /**
+   * Checks if this audio info has a valid URL.
+   *
+   * @return true if URL is available, false otherwise
+   */
+  public boolean hasUrl() {
+    return url != null && !url.isEmpty();
+  }
+
+  /**
+   * Checks if the URL has expired, i.e. the current time is strictly past {@code expiresAt}.
+   *
+   * @return true if expired, false if still valid or expiration unknown
+   */
+  public boolean isExpired() {
+    if (expiresAt == null) {
+      return false;
+    }
+    return System.currentTimeMillis() / 1000 > expiresAt;
+  }
+
+  /**
+   * Gets the remaining time before URL expiration in seconds.
+   *
+   * @return seconds left (may be 0), or -1 if expiration unknown or already expired
+   */
+  public long getRemainingSeconds() {
+    if (expiresAt == null) {
+      return -1;
+    }
+    long remaining = expiresAt - System.currentTimeMillis() / 1000;
+    return remaining >= 0 ? remaining : -1; // 0 is still valid, matching isExpired()
+  }
+}
diff --git a/src/main/java/com/alibaba/dashscope/audio/http_tts/HttpSpeechSynthesisParam.java b/src/main/java/com/alibaba/dashscope/audio/http_tts/HttpSpeechSynthesisParam.java
new file mode 100644
index 0000000..0c9af31
--- /dev/null
+++ b/src/main/java/com/alibaba/dashscope/audio/http_tts/HttpSpeechSynthesisParam.java
@@ -0,0 +1,110 @@
+// Copyright (c) Alibaba, Inc. and its affiliates.
+
+package com.alibaba.dashscope.audio.http_tts;
+
+import com.alibaba.dashscope.base.HalfDuplexServiceParam;
+import com.alibaba.dashscope.exception.InputRequiredException;
+import com.google.gson.JsonObject;
+import java.nio.ByteBuffer;
+import lombok.*;
+import lombok.experimental.SuperBuilder;
+
+/**
+ * HTTP TTS (Text-to-Speech) synthesis parameter class. Supports HTTP SSE-based speech synthesis API
+ * calls for models like CosyVoice.
+ *
+ *

<p>Example usage: + * + * <pre>{@code
+ * HttpSpeechSynthesisParam param = HttpSpeechSynthesisParam.builder()
+ *     .model("cosyvoice-v3-flash")
+ *     .text("你好,欢迎使用语音合成服务。")
+ *     .voice("longanyang")
+ *     .format("wav")
+ *     .sampleRate(24000)
+ *     .build();
+ * }</pre>
+ * + * @author DashScope SDK Team + */ +@Data +@SuperBuilder +@EqualsAndHashCode(callSuper = true) +public class HttpSpeechSynthesisParam extends HalfDuplexServiceParam { + + /** The text to be synthesized into speech. */ + @NonNull private String text; + + /** The voice name for synthesis (e.g., "longanyang", "longxiaochun"). */ + private String voice; + + /** The audio format (e.g., "wav", "mp3", "pcm"). */ + @Builder.Default private String format = "wav"; + + /** The sample rate in Hz (e.g., 8000, 16000, 24000, 48000). */ + @Builder.Default private Integer sampleRate = 16000; + + /** The audio volume (0-100). */ + @Builder.Default private Integer volume = 50; + + /** The speech rate (0.5-2.0). */ + @Builder.Default private Float rate = 1.0f; + + /** The pitch rate (0.5-2.0). */ + @Builder.Default private Float pitch = 1.0f; + + @Override + public JsonObject getHttpBody() { + JsonObject body = new JsonObject(); + body.addProperty("model", getModel()); + + // Build input object + JsonObject input = new JsonObject(); + input.addProperty("text", text); + + if (voice != null && !voice.isEmpty()) { + input.addProperty("voice", voice); + } + if (format != null && !format.isEmpty()) { + input.addProperty("format", format); + } + if (sampleRate != null) { + input.addProperty("sample_rate", sampleRate); + } + if (volume != null) { + input.addProperty("volume", volume); + } + if (rate != null) { + input.addProperty("rate", rate); + } + if (pitch != null) { + input.addProperty("pitch", pitch); + } + + body.add("input", input); + + return body; + } + + @Override + public Object getInput() { + JsonObject input = new JsonObject(); + input.addProperty("text", text); + return input; + } + + @Override + public ByteBuffer getBinaryData() { + return null; + } + + @Override + public void validate() throws InputRequiredException { + if (text == null || text.trim().isEmpty()) { + throw new InputRequiredException("text is required and cannot be empty"); + } + if (getModel() == null || 
getModel().trim().isEmpty()) { + throw new InputRequiredException("model is required"); + } + } +} diff --git a/src/main/java/com/alibaba/dashscope/audio/http_tts/HttpSpeechSynthesisResult.java b/src/main/java/com/alibaba/dashscope/audio/http_tts/HttpSpeechSynthesisResult.java new file mode 100644 index 0000000..0f733c1 --- /dev/null +++ b/src/main/java/com/alibaba/dashscope/audio/http_tts/HttpSpeechSynthesisResult.java @@ -0,0 +1,75 @@ +// Copyright (c) Alibaba, Inc. and its affiliates. + +package com.alibaba.dashscope.audio.http_tts; + +import com.alibaba.dashscope.audio.tts.SpeechSynthesisUsage; +import com.google.gson.JsonObject; +import lombok.Data; +import lombok.EqualsAndHashCode; + +/** + * Result class for HTTP TTS synthesis. Contains the synthesized audio data and related metadata. + * + *

For SSE streaming calls, the result contains binary audio data in {@link #audioData}. For + * non-SSE synchronous calls, the result contains an audio URL in {@link #audioInfo}. + * + * @author DashScope SDK Team + */ +@Data +@EqualsAndHashCode +public class HttpSpeechSynthesisResult { + + /** The request ID for tracking. */ + private String requestId; + + /** The audio data in binary format (for SSE streaming calls). */ + private byte[] audioData; + + /** The audio URL and metadata (for non-SSE synchronous calls). */ + private AudioInfo audioInfo; + + /** The usage statistics (if available). */ + private SpeechSynthesisUsage usage; + + /** The raw output from the API (may contain additional metadata). */ + private JsonObject output; + + /** The finish reason (e.g., "stop"). */ + private String finishReason; + + /** + * Checks if audio data is present in this result (SSE mode). + * + * @return true if audio data is available, false otherwise + */ + public boolean hasAudioData() { + return audioData != null && audioData.length > 0; + } + + /** + * Gets the size of the audio data in bytes. + * + * @return the size in bytes, or 0 if no audio data is present + */ + public int getAudioDataSize() { + return audioData != null ? audioData.length : 0; + } + + /** + * Checks if audio URL is present in this result (non-SSE mode). + * + * @return true if audio URL is available, false otherwise + */ + public boolean hasAudioUrl() { + return audioInfo != null && audioInfo.hasUrl(); + } + + /** + * Gets the audio URL. + * + * @return the audio URL, or null if not available + */ + public String getAudioUrl() { + return audioInfo != null ? 
audioInfo.getUrl() : null; + } +} diff --git a/src/main/java/com/alibaba/dashscope/audio/http_tts/HttpSpeechSynthesizer.java b/src/main/java/com/alibaba/dashscope/audio/http_tts/HttpSpeechSynthesizer.java new file mode 100644 index 0000000..1e7133d --- /dev/null +++ b/src/main/java/com/alibaba/dashscope/audio/http_tts/HttpSpeechSynthesizer.java @@ -0,0 +1,403 @@ +// Copyright (c) Alibaba, Inc. and its affiliates. + +package com.alibaba.dashscope.audio.http_tts; + +import com.alibaba.dashscope.api.SynchronizeHalfDuplexApi; +import com.alibaba.dashscope.audio.tts.SpeechSynthesisUsage; +import com.alibaba.dashscope.common.*; +import com.alibaba.dashscope.common.Status; +import com.alibaba.dashscope.exception.ApiException; +import com.alibaba.dashscope.exception.InputRequiredException; +import com.alibaba.dashscope.exception.NoApiKeyException; +import com.alibaba.dashscope.protocol.ApiServiceOption; +import com.alibaba.dashscope.protocol.HttpMethod; +import com.alibaba.dashscope.protocol.Protocol; +import com.alibaba.dashscope.protocol.StreamingMode; +import com.alibaba.dashscope.utils.JsonUtils; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import io.reactivex.Flowable; +import java.io.ByteArrayOutputStream; +import java.nio.ByteBuffer; +import java.util.Base64; +import lombok.extern.slf4j.Slf4j; + +/** + * HTTP-based Speech Synthesizer using Server-Sent Events (SSE). This class provides a simple + * interface for text-to-speech synthesis via HTTP SSE protocol. + * + *

<p>Supports models like CosyVoice (cosyvoice-v3-flash, etc.) that use HTTP SSE for streaming
+ * synthesis.
+ *
+ * @author songsong.sss
+ */
+@Slf4j
+public class HttpSpeechSynthesizer {
+
+  /**
+   * Audio stored by a previous call on the current thread. The value is per thread, so it is only
+   * visible to the thread that stored it.
+   */
+  private final ThreadLocal<ByteBuffer> accumulatedAudioData = new ThreadLocal<>();
+
+  /** Creates a new HttpSpeechSynthesizer instance with default settings. */
+  public HttpSpeechSynthesizer() {}
+
+  /**
+   * Returns the audio stored for the calling thread.
+   *
+   * @return an independent view of the stored audio, or null if nothing is stored for this thread
+   */
+  public ByteBuffer getAccumulatedAudioData() {
+    ByteBuffer stored = accumulatedAudioData.get();
+    // Hand out a duplicate: it shares the bytes but has its own position, so one caller reading
+    // the buffer does not leave the next caller with an apparently empty result.
+    return stored == null ? null : stored.duplicate();
+  }
+
+  /**
+   * Creates a per-request API client configured for HTTP POST text-to-speech with the specified
+   * SSE setting.
+   *
+   * @param isSSE whether the request should use Server-Sent Events
+   * @return a new client built from a fresh ApiServiceOption
+   */
+  private SynchronizeHalfDuplexApi<HttpSpeechSynthesisParam> createApi(boolean isSSE) {
+    ApiServiceOption serviceOption =
+        ApiServiceOption.builder()
+            .protocol(Protocol.HTTP)
+            .httpMethod(HttpMethod.POST)
+            .streamingMode(StreamingMode.OUT)
+            .outputMode(OutputMode.ACCUMULATE)
+            .taskGroup(TaskGroup.AUDIO.getValue())
+            .task(Task.TEXT_TO_SPEECH.getValue())
+            .function(Function.SPEECH_SYNTHESIZER.getValue())
+            .isSSE(isSSE)
+            .build();
+    return new SynchronizeHalfDuplexApi<>(serviceOption);
+  }
+
+  /**
+   * Synchronous call with SSE enabled - synthesizes speech and returns complete audio data. This
+   * method blocks until the synthesis is complete.
+   *
+   *

<p>Use this method when you need the audio data streamed back in real-time.
+   *
+   * @param param The synthesis parameters
+   * @return ByteBuffer containing the complete audio data
+   * @throws ApiException If the API call fails
+   * @throws NoApiKeyException If the API key is not configured
+   * @throws InputRequiredException If required parameters are missing
+   */
+  public ByteBuffer callAndReturnAudio(HttpSpeechSynthesisParam param)
+      throws ApiException, NoApiKeyException, InputRequiredException {
+    param.validate();
+    // Drop whatever an earlier call left behind for this thread.
+    accumulatedAudioData.remove();
+
+    SynchronizeHalfDuplexApi<HttpSpeechSynthesisParam> api = createApi(true);
+    ByteArrayOutputStream audioBuffer = new ByteArrayOutputStream();
+
+    try {
+      Flowable<DashScopeResult> flowable = api.streamCall(param);
+
+      // Blocks until the stream ends; each event is folded into audioBuffer.
+      flowable.blockingForEach(result -> processAudioResult(result, audioBuffer));
+
+      byte[] audioBytes = audioBuffer.toByteArray();
+      // Store and return two separate views over the same bytes. Each view tracks its own
+      // position, so a caller consuming the returned buffer does not drain the stored one.
+      accumulatedAudioData.set(ByteBuffer.wrap(audioBytes));
+      return ByteBuffer.wrap(audioBytes);
+
+    } catch (ApiException | NoApiKeyException e) {
+      // Already the types callers expect; rethrow without wrapping.
+      throw e;
+    } catch (Exception e) {
+      log.error("Speech synthesis failed", e);
+      throw new ApiException(e);
+    }
+  }
+
+  /**
+   * Synchronous call without SSE - returns a result containing the audio URL. This is a simpler,
+   * faster call that returns a download URL instead of streaming audio data.
+   *
+   *

<p>Use this method when you want to get the audio URL and download it later. + * + * <p>Example response: + * + * <pre>{@code
+   * {
+   *   "request_id": "xxx",
+   *   "output": {
+   *     "finish_reason": "stop",
+   *     "audio": {
+   *       "url": "http://dashscope-result-bj.oss-cn-beijing.aliyuncs.com/...",
+   *       "id": "audio_xxx",
+   *       "expires_at": 1772697707
+   *     }
+   *   },
+   *   "usage": { "characters": 15 }
+   * }
+   * }</pre>
+ * + * @param param The synthesis parameters + * @return HttpSpeechSynthesisResult containing audio URL and metadata + * @throws ApiException If the API call fails + * @throws NoApiKeyException If the API key is not configured + * @throws InputRequiredException If required parameters are missing + */ + public HttpSpeechSynthesisResult call(HttpSpeechSynthesisParam param) + throws ApiException, NoApiKeyException, InputRequiredException { + param.validate(); + accumulatedAudioData.remove(); + + SynchronizeHalfDuplexApi api = createApi(false); + try { + DashScopeResult result = api.call(param); + return convertNonSSEResult(result); + } catch (Exception e) { + log.error("Synchronous speech synthesis failed", e); + throw new ApiException(e); + } + } + + /** + * Streaming call with callback interface. Results are delivered through the callback as they + * arrive. + * + * @param param The synthesis parameters + * @param callback The callback to receive synthesis results + * @throws ApiException If the API call fails + * @throws NoApiKeyException If the API key is not configured + * @throws InputRequiredException If required parameters are missing + */ + public void streamCall( + HttpSpeechSynthesisParam param, ResultCallback callback) + throws ApiException, NoApiKeyException, InputRequiredException { + param.validate(); + accumulatedAudioData.remove(); + + SynchronizeHalfDuplexApi api = createApi(true); + ByteArrayOutputStream audioBuffer = new ByteArrayOutputStream(); + + try { + api.streamCall( + param, + new ResultCallback() { + @Override + public void onEvent(DashScopeResult message) { + try { + HttpSpeechSynthesisResult result = convertResult(message); + if (result.getAudioData() != null) { + audioBuffer.write(result.getAudioData()); + } + callback.onEvent(result); + } catch (Exception e) { + log.error("Failed to process audio result", e); + callback.onError(e); + } + } + + @Override + public void onComplete() { + try { + 
accumulatedAudioData.set(ByteBuffer.wrap(audioBuffer.toByteArray())); + callback.onComplete(); + } catch (Exception e) { + log.error("Failed to complete synthesis", e); + callback.onError(e); + } + } + + @Override + public void onError(Exception e) { + callback.onError(e); + } + }); + } catch (Exception e) { + log.error("Streaming call failed", e); + throw new ApiException(e); + } + } + + /** + * Gets the first package delay (time from request to first audio data). Only available after a + * call has been made. + * + * @return The delay in milliseconds, or -1 if not available + */ + public long getFirstPackageDelay() { + // This would require timestamp tracking during the call + return -1; + } + + /** Processes audio result from DashScope API response. */ + private void processAudioResult(DashScopeResult result, ByteArrayOutputStream audioBuffer) { + if (result == null) { + return; + } + + try { + byte[] audioBytes = extractAudioData(result); + if (audioBytes != null && audioBytes.length > 0) { + audioBuffer.write(audioBytes); + } + } catch (Exception e) { + log.error("Failed to extract audio data from result", e); + } + } + + /** Converts DashScopeResult to HttpSpeechSynthesisResult. */ + private HttpSpeechSynthesisResult convertResult(DashScopeResult dashScopeResult) { + // Check for API error response + if (dashScopeResult.getCode() != null && !dashScopeResult.getCode().isEmpty()) { + String errorMsg = + dashScopeResult.getMessage() != null ? dashScopeResult.getMessage() : "Unknown error"; + Status status = + Status.builder() + .statusCode( + dashScopeResult.getStatusCode() != null ? 
dashScopeResult.getStatusCode() : 400) + .code(dashScopeResult.getCode()) + .message(errorMsg) + .requestId(dashScopeResult.getRequestId()) + .build(); + throw new ApiException(status); + } + + HttpSpeechSynthesisResult result = new HttpSpeechSynthesisResult(); + + if (dashScopeResult.getRequestId() != null) { + result.setRequestId(dashScopeResult.getRequestId()); + } + + byte[] audioData = extractAudioData(dashScopeResult); + if (audioData != null) { + result.setAudioData(audioData); + } + + if (dashScopeResult.getUsage() != null) { + try { + SpeechSynthesisUsage usage = + JsonUtils.fromJsonObject( + dashScopeResult.getUsage().getAsJsonObject(), SpeechSynthesisUsage.class); + result.setUsage(usage); + } catch (Exception e) { + log.debug("Failed to parse usage information", e); + } + } + + if (dashScopeResult.getOutput() != null && dashScopeResult.getOutput() instanceof JsonObject) { + result.setOutput((JsonObject) dashScopeResult.getOutput()); + } + + return result; + } + + /** + * Extracts audio data from DashScope API response. The audio data is typically Base64-encoded in + * the response. 
+ */ + private byte[] extractAudioData(DashScopeResult result) { + if (result == null) { + return null; + } + + // Try to get audio from output (Base64 encoded) + if (result.getOutput() != null && result.getOutput() instanceof JsonObject) { + JsonObject output = (JsonObject) result.getOutput(); + + // Try common audio field names + if (output.has("audio")) { + JsonElement audioElement = output.get("audio"); + // audio could be a Base64 string or a JSON object with data field + if (audioElement.isJsonPrimitive()) { + String audioBase64 = audioElement.getAsString(); + if (audioBase64 != null && !audioBase64.isEmpty()) { + try { + return Base64.getDecoder().decode(audioBase64); + } catch (IllegalArgumentException e) { + log.warn("Failed to decode Base64 audio data", e); + } + } + } else if (audioElement.isJsonObject()) { + // audio is an object with fields like url, id, data + JsonObject audioObj = audioElement.getAsJsonObject(); + if (audioObj.has("data") && !audioObj.get("data").isJsonNull()) { + String audioBase64 = audioObj.get("data").getAsString(); + if (audioBase64 != null && !audioBase64.isEmpty()) { + try { + return Base64.getDecoder().decode(audioBase64); + } catch (IllegalArgumentException e) { + log.warn("Failed to decode Base64 audio data from audio.data", e); + } + } + } + } + } + + // Some APIs may return audio in binary field + if (output.has("binary")) { + String binaryBase64 = output.get("binary").getAsString(); + if (binaryBase64 != null && !binaryBase64.isEmpty()) { + try { + return Base64.getDecoder().decode(binaryBase64); + } catch (IllegalArgumentException e) { + log.warn("Failed to decode Base64 binary data", e); + } + } + } + } + + // Check if output is ByteBuffer (WebSocket-style) + if (result.getOutput() instanceof ByteBuffer) { + ByteBuffer buffer = (ByteBuffer) result.getOutput(); + byte[] data = new byte[buffer.remaining()]; + buffer.get(data); + return data; + } + + return null; + } + + /** + * Converts DashScopeResult from non-SSE call 
to HttpSpeechSynthesisResult. Non-SSE call returns + * audio URL instead of binary data. + */ + private HttpSpeechSynthesisResult convertNonSSEResult(DashScopeResult dashScopeResult) { + HttpSpeechSynthesisResult result = new HttpSpeechSynthesisResult(); + + if (dashScopeResult.getRequestId() != null) { + result.setRequestId(dashScopeResult.getRequestId()); + } + + // Parse output for audio URL information + if (dashScopeResult.getOutput() != null && dashScopeResult.getOutput() instanceof JsonObject) { + JsonObject output = (JsonObject) dashScopeResult.getOutput(); + result.setOutput(output); + + // Parse finish_reason + if (output.has("finish_reason")) { + result.setFinishReason(output.get("finish_reason").getAsString()); + } + + // Parse audio object (contains url, id, expires_at) + if (output.has("audio") && output.get("audio").isJsonObject()) { + JsonObject audio = output.getAsJsonObject("audio"); + AudioInfo audioInfo = new AudioInfo(); + + if (audio.has("url")) { + audioInfo.setUrl(audio.get("url").getAsString()); + } + if (audio.has("id")) { + audioInfo.setId(audio.get("id").getAsString()); + } + if (audio.has("expires_at")) { + audioInfo.setExpiresAt(audio.get("expires_at").getAsLong()); + } + if (audio.has("data") && !audio.get("data").isJsonNull()) { + audioInfo.setData(audio.get("data").getAsString()); + } + + result.setAudioInfo(audioInfo); + } + } + + // Parse usage + if (dashScopeResult.getUsage() != null) { + try { + SpeechSynthesisUsage usage = + JsonUtils.fromJsonObject( + dashScopeResult.getUsage().getAsJsonObject(), SpeechSynthesisUsage.class); + result.setUsage(usage); + } catch (Exception e) { + log.debug("Failed to parse usage information", e); + } + } + + return result; + } +} diff --git a/src/main/java/com/alibaba/dashscope/audio/omni/OmniRealtimeConstants.java b/src/main/java/com/alibaba/dashscope/audio/omni/OmniRealtimeConstants.java index dd5169c..f942f2b 100644 --- 
a/src/main/java/com/alibaba/dashscope/audio/omni/OmniRealtimeConstants.java +++ b/src/main/java/com/alibaba/dashscope/audio/omni/OmniRealtimeConstants.java @@ -36,6 +36,7 @@ public class OmniRealtimeConstants { public static final String PROTOCOL_EVENT_TYPE_CREATE_RESPONSE = "response.create"; public static final String PROTOCOL_EVENT_TYPE_CANCEL_RESPONSE = "response.cancel"; public static final String PROTOCOL_EVENT_TYPE_FINISH_SESSION = "session.finish"; + public static final String PROTOCOL_EVENT_TYPE_ITEM_CREATE = "conversation.item.create"; public static final String PROTOCOL_RESPONSE_TYPE_SESSION_CREATED = "session.created"; public static final String PROTOCOL_RESPONSE_TYPE_RESPONSE_CREATED = "response.created"; public static final String PROTOCOL_RESPONSE_TYPE_AUDIO_TRANSCRIPT_DELTA = diff --git a/src/main/java/com/alibaba/dashscope/audio/omni/OmniRealtimeConversation.java b/src/main/java/com/alibaba/dashscope/audio/omni/OmniRealtimeConversation.java index 80178bc..2b2dcfd 100644 --- a/src/main/java/com/alibaba/dashscope/audio/omni/OmniRealtimeConversation.java +++ b/src/main/java/com/alibaba/dashscope/audio/omni/OmniRealtimeConversation.java @@ -14,6 +14,7 @@ import java.util.List; import java.util.Map; import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; import lombok.extern.slf4j.Slf4j; @@ -71,7 +72,9 @@ public void connect() throws NoApiKeyException, InterruptedException { client = OkHttpClientFactory.getOkHttpClient(); websocktetClient = client.newWebSocket(request, this); connectLatch.set(new CountDownLatch(1)); - connectLatch.get().await(); + if (!connectLatch.get().await(60, TimeUnit.SECONDS)) { + throw new RuntimeException("Connection timed out after 60 seconds"); + } } // block wait server session done, max 20 seconds, then close connection @@ -128,6 +131,21 @@ public void updateSession(OmniRealtimeConfig config) { 
sendMessage(createGson().toJson(update_request), true); } + /** + * send item to server by event conversation.item.create + * + * @param item item pass to server + */ + public void createItem(JsonObject item) { + checkStatus(); + Map item_request = new HashMap<>(); + item_request.put(OmniRealtimeConstants.PROTOCOL_EVENT_ID, generateEventId()); + item_request.put( + OmniRealtimeConstants.PROTOCOL_TYPE, OmniRealtimeConstants.PROTOCOL_EVENT_TYPE_ITEM_CREATE); + item_request.put("item", item); + sendMessage(createGson().toJson(item_request), true); + } + /** * send audio in base64 format * @@ -399,6 +417,7 @@ public void onClosed(WebSocket webSocket, int code, String reason) { @Override public void onFailure(WebSocket webSocket, Throwable t, Response response) { + connectLatch.get().countDown(); log.error("WebSocket failed: " + t.getMessage()); } diff --git a/src/main/java/com/alibaba/dashscope/audio/qwen_tts_realtime/QwenTtsRealtime.java b/src/main/java/com/alibaba/dashscope/audio/qwen_tts_realtime/QwenTtsRealtime.java index a4dc9ba..c99bfdd 100644 --- a/src/main/java/com/alibaba/dashscope/audio/qwen_tts_realtime/QwenTtsRealtime.java +++ b/src/main/java/com/alibaba/dashscope/audio/qwen_tts_realtime/QwenTtsRealtime.java @@ -13,6 +13,7 @@ import java.util.HashMap; import java.util.Map; import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; import lombok.extern.slf4j.Slf4j; @@ -68,7 +69,9 @@ public void connect() throws NoApiKeyException, InterruptedException { client = OkHttpClientFactory.getOkHttpClient(); websocktetClient = client.newWebSocket(request, this); connectLatch.set(new CountDownLatch(1)); - connectLatch.get().await(); + if (!connectLatch.get().await(60, TimeUnit.SECONDS)) { + throw new RuntimeException("Connection timed out after 60 seconds"); + } } /** @@ -303,6 +306,7 @@ public void onClosing(@NotNull WebSocket webSocket, int 
code, @NotNull String re @Override public void onFailure(WebSocket webSocket, Throwable t, Response response) { + connectLatch.get().countDown(); log.error("WebSocket failed: " + t.getMessage()); } } diff --git a/src/main/java/com/alibaba/dashscope/protocol/okhttp/OkHttpWebSocketClient.java b/src/main/java/com/alibaba/dashscope/protocol/okhttp/OkHttpWebSocketClient.java index bb13dae..1519971 100644 --- a/src/main/java/com/alibaba/dashscope/protocol/okhttp/OkHttpWebSocketClient.java +++ b/src/main/java/com/alibaba/dashscope/protocol/okhttp/OkHttpWebSocketClient.java @@ -132,7 +132,7 @@ private void establishWebSocketClient( }, BackpressureStrategy.BUFFER); // wait for connection establish - flowable.blockingSubscribe(); + flowable.timeout(60, TimeUnit.SECONDS).blockingSubscribe(); return; } catch (Throwable ex) { reconnectionTimes += 1;