From 1d1514fe35fd26ba9e275b623d007993c599279e Mon Sep 17 00:00:00 2001 From: SoufianeBouaddis Date: Thu, 26 Mar 2026 23:06:05 +0100 Subject: [PATCH 1/3] feat telegram: add voice message support for telegram with pluggable transcription --- .gitignore | 1 + .../speech/MockSpeechToTextService.java | 16 ++++++ .../speech/OpenAiSpeechToTextService.java | 17 ++++++ .../javaclaw/speech/SpeechToTextService.java | 8 +++ .../channels/telegram/TelegramChannel.java | 53 +++++++++++++++---- .../TelegramChannelAutoConfiguration.java | 6 ++- .../telegram/TelegramChannelTest.java | 14 +++-- 7 files changed, 99 insertions(+), 16 deletions(-) create mode 100644 base/src/main/java/ai/javaclaw/speech/MockSpeechToTextService.java create mode 100644 base/src/main/java/ai/javaclaw/speech/OpenAiSpeechToTextService.java create mode 100644 base/src/main/java/ai/javaclaw/speech/SpeechToTextService.java diff --git a/.gitignore b/.gitignore index 0485df78..3c6c38a0 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,4 @@ workspace/skills/* workspace/tasks/* workspace/app.* *.private* +workspace/conversations/* \ No newline at end of file diff --git a/base/src/main/java/ai/javaclaw/speech/MockSpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/MockSpeechToTextService.java new file mode 100644 index 00000000..8d941062 --- /dev/null +++ b/base/src/main/java/ai/javaclaw/speech/MockSpeechToTextService.java @@ -0,0 +1,16 @@ +package ai.javaclaw.speech; + +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.stereotype.Service; + +import java.io.InputStream; + +@Service +@ConditionalOnProperty(name = "speech.provider", havingValue = "mock", matchIfMissing = true) +public class MockSpeechToTextService implements SpeechToTextService { + + @Override + public String transcribe(InputStream audioStream) { + return "[voice message]"; + } +} diff --git a/base/src/main/java/ai/javaclaw/speech/OpenAiSpeechToTextService.java 
b/base/src/main/java/ai/javaclaw/speech/OpenAiSpeechToTextService.java new file mode 100644 index 00000000..ec64d2af --- /dev/null +++ b/base/src/main/java/ai/javaclaw/speech/OpenAiSpeechToTextService.java @@ -0,0 +1,17 @@ +package ai.javaclaw.speech; + +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.stereotype.Service; + +import java.io.InputStream; + +@Service +@ConditionalOnProperty(name = "speech.provider", havingValue = "openai") +public class OpenAiSpeechToTextService implements SpeechToTextService { + + @Override + public String transcribe(InputStream audioStream) { + // TODO: integrate OpenAI Whisper API + return "[transcribed via OpenAI]"; + } +} diff --git a/base/src/main/java/ai/javaclaw/speech/SpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/SpeechToTextService.java new file mode 100644 index 00000000..ac9f8606 --- /dev/null +++ b/base/src/main/java/ai/javaclaw/speech/SpeechToTextService.java @@ -0,0 +1,8 @@ +package ai.javaclaw.speech; + +import java.io.InputStream; + +public interface SpeechToTextService { + + String transcribe(InputStream audioStream); +} diff --git a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java index fbc2af38..8052cf6f 100644 --- a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java +++ b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannel.java @@ -4,18 +4,24 @@ import ai.javaclaw.channels.Channel; import ai.javaclaw.channels.ChannelMessageReceivedEvent; import ai.javaclaw.channels.ChannelRegistry; +import ai.javaclaw.speech.SpeechToTextService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.telegram.telegrambots.client.okhttp.OkHttpTelegramClient; import org.telegram.telegrambots.longpolling.interfaces.LongPollingUpdateConsumer; import 
org.telegram.telegrambots.longpolling.starter.SpringLongPollingBot; import org.telegram.telegrambots.longpolling.util.LongPollingSingleThreadUpdateConsumer; +import org.telegram.telegrambots.meta.api.methods.GetFile; import org.telegram.telegrambots.meta.api.methods.send.SendMessage; import org.telegram.telegrambots.meta.api.objects.Update; import org.telegram.telegrambots.meta.api.objects.message.Message; import org.telegram.telegrambots.meta.exceptions.TelegramApiException; import org.telegram.telegrambots.meta.generics.TelegramClient; +import java.io.IOException; +import java.io.InputStream; +import java.net.URI; + import static java.util.Optional.ofNullable; public class TelegramChannel implements Channel, SpringLongPollingBot, LongPollingSingleThreadUpdateConsumer { @@ -26,19 +32,21 @@ public class TelegramChannel implements Channel, SpringLongPollingBot, LongPolli private final TelegramClient telegramClient; private final Agent agent; private final ChannelRegistry channelRegistry; + private final SpeechToTextService speechToTextService; private Long chatId; private Integer messageThreadId; - public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry) { - this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry); + public TelegramChannel(String botToken, String allowedUsername, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) { + this(botToken, allowedUsername, new OkHttpTelegramClient(botToken), agent, channelRegistry, speechToTextService); } - TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry) { + TelegramChannel(String botToken, String allowedUsername, TelegramClient telegramClient, Agent agent, ChannelRegistry channelRegistry, SpeechToTextService speechToTextService) { this.botToken = botToken; this.allowedUsername = normalizeUsername(allowedUsername); 
this.telegramClient = telegramClient; this.agent = agent; this.channelRegistry = channelRegistry; + this.speechToTextService = speechToTextService; channelRegistry.registerChannel(this); log.info("Started Telegram integration"); } @@ -55,7 +63,7 @@ public LongPollingUpdateConsumer getUpdatesConsumer() { @Override public void consume(Update update) { - if (!(update.hasMessage() && update.getMessage().hasText())) return; + if (!update.hasMessage()) return; Message requestMessage = update.getMessage(); String userName = requestMessage.getFrom() == null ? null : requestMessage.getFrom().getUserName(); @@ -65,12 +73,29 @@ public void consume(Update update) { return; } - String messageText = requestMessage.getText(); - this.chatId = requestMessage.getChatId(); - this.messageThreadId = requestMessage.getMessageThreadId(); - channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText, chatId, messageThreadId)); - String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText); - sendMessage(chatId, messageThreadId, response); + if (requestMessage.hasText()) { + // Check text message + String messageText = requestMessage.getText(); + this.chatId = requestMessage.getChatId(); + this.messageThreadId = requestMessage.getMessageThreadId(); + channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), messageText, chatId, messageThreadId)); + String response = agent.respondTo(getConversationId(chatId, messageThreadId), messageText); + sendMessage(chatId, messageThreadId, response); + } else if (requestMessage.hasVoice()) { + // Check voice message & download it + log.info("Voice message received"); + try (InputStream voiceStream = downloadVoice(requestMessage)) { + String transcribedText = speechToTextService.transcribe(voiceStream); + log.info("Voice message transcribed: {}", transcribedText); + this.chatId = requestMessage.getChatId(); + this.messageThreadId = 
requestMessage.getMessageThreadId(); + channelRegistry.publishMessageReceivedEvent(new TelegramChannelMessageReceivedEvent(getName(), transcribedText, chatId, messageThreadId)); + String response = agent.respondTo(getConversationId(chatId, messageThreadId), transcribedText); + sendMessage(chatId, messageThreadId, response); + } catch (IOException | TelegramApiException e) { + log.error("Failed to process voice message", e); + } + } } @Override @@ -95,6 +120,14 @@ public void sendMessage(long chatId, Integer messageThreadId, String message) { } } + private InputStream downloadVoice(Message message) throws TelegramApiException, IOException { + String fileId = message.getVoice().getFileId(); + GetFile getFile = new GetFile(fileId); + String filePath = telegramClient.execute(getFile).getFilePath(); + String fileUrl = "https://api.telegram.org/file/bot" + botToken + "/" + filePath; + return URI.create(fileUrl).toURL().openStream(); + } + private boolean isAllowedUser(String userName) { String normalizedUserName = normalizeUsername(userName); return normalizedUserName != null && normalizedUserName.equalsIgnoreCase(allowedUsername); diff --git a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java index 76bb8c49..a007c610 100644 --- a/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java +++ b/plugins/telegram/src/main/java/ai/javaclaw/channels/telegram/TelegramChannelAutoConfiguration.java @@ -3,6 +3,7 @@ import ai.javaclaw.agent.Agent; import ai.javaclaw.channels.ChannelRegistry; +import ai.javaclaw.speech.SpeechToTextService; import org.springframework.beans.factory.annotation.Value; import org.springframework.boot.autoconfigure.AutoConfiguration; import org.springframework.boot.autoconfigure.condition.ConditionalOnMissingBean; @@ -19,7 +20,8 @@ public class 
TelegramChannelAutoConfiguration { public TelegramChannel telegramChannel(@Value("${agent.channels.telegram.token:null}") String botToken, @Value("${agent.channels.telegram.username:null}") String allowedUsername, Agent agent, - ChannelRegistry channelRegistry) { - return new TelegramChannel(botToken, allowedUsername, agent, channelRegistry); + ChannelRegistry channelRegistry, + SpeechToTextService speechToTextService) { + return new TelegramChannel(botToken, allowedUsername, agent, channelRegistry, speechToTextService); } } diff --git a/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java b/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java index adbcc525..e8b5c5f2 100644 --- a/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java +++ b/plugins/telegram/src/test/java/ai/javaclaw/channels/telegram/TelegramChannelTest.java @@ -2,6 +2,7 @@ import ai.javaclaw.agent.Agent; import ai.javaclaw.channels.ChannelRegistry; +import ai.javaclaw.speech.SpeechToTextService; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; @@ -31,6 +32,9 @@ class TelegramChannelTest { @Mock private Agent agent; + @Mock + private SpeechToTextService speechToTextService; + // ----------------------------------------------------------------------- // Ignored updates // ----------------------------------------------------------------------- @@ -47,13 +51,17 @@ void ignoresUpdatesWithoutMessage() { } @Test - void ignoresUpdatesWithoutText() { + void ignoresUpdatesWithoutTextOrVoice() { TelegramChannel channel = channel("allowed_user"); Update update = mock(Update.class); Message message = mock(Message.class); + User user = mock(User.class); when(update.hasMessage()).thenReturn(true); when(update.getMessage()).thenReturn(message); + when(message.getFrom()).thenReturn(user); + when(user.getUserName()).thenReturn("allowed_user"); 
+ when(message.hasText()).thenReturn(false); + when(message.hasVoice()).thenReturn(false); channel.consume(update); @@ -67,7 +75,6 @@ void ignoresMessagesFromNullUsername() { Message message = mock(Message.class); when(update.hasMessage()).thenReturn(true); when(update.getMessage()).thenReturn(message); - when(message.hasText()).thenReturn(true); when(message.getFrom()).thenReturn(null); channel.consume(update); @@ -195,7 +202,7 @@ void sendMessageDoesNothingWhenNoChatIdKnown() { // ----------------------------------------------------------------------- private TelegramChannel channel(String allowedUsername) { - return new TelegramChannel("token", allowedUsername, telegramClient, agent, new ChannelRegistry()); + return new TelegramChannel("token", allowedUsername, telegramClient, agent, new ChannelRegistry(), speechToTextService); } private Update updateFromUnknownUser(String username) { @@ -204,7 +211,6 @@ private Update updateFromUnknownUser(String username) { User user = mock(User.class); when(update.hasMessage()).thenReturn(true); when(update.getMessage()).thenReturn(message); - when(message.hasText()).thenReturn(true); when(message.getFrom()).thenReturn(user); when(user.getUserName()).thenReturn(username); return update; From 16edc98c127889551808a6d8d96bb916bfcea287 Mon Sep 17 00:00:00 2001 From: SoufianeBouaddis Date: Mon, 27 Apr 2026 20:16:32 +0100 Subject: [PATCH 2/3] Move MockSpeechToTextService.java to base/src/test/java/ai/javaclaw/speech and refactor OpenAiSpeechToTextService to delegate to Spring AI --- .../speech/OpenAiSpeechToTextService.java | 111 ------------------ .../speech/MockSpeechToTextService.java | 5 - .../openai/OpenAiSpeechToTextService.java | 44 +++++++ 3 files changed, 44 insertions(+), 116 deletions(-) delete mode 100644 base/src/main/java/ai/javaclaw/speech/OpenAiSpeechToTextService.java rename base/src/{main => test}/java/ai/javaclaw/speech/MockSpeechToTextService.java (51%) create mode 100644
providers/openai/src/main/java/ai/javaclaw/providers/openai/OpenAiSpeechToTextService.java diff --git a/base/src/main/java/ai/javaclaw/speech/OpenAiSpeechToTextService.java b/base/src/main/java/ai/javaclaw/speech/OpenAiSpeechToTextService.java deleted file mode 100644 index cb438c0a..00000000 --- a/base/src/main/java/ai/javaclaw/speech/OpenAiSpeechToTextService.java +++ /dev/null @@ -1,111 +0,0 @@ -package ai.javaclaw.speech; - -import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonParser; -import com.fasterxml.jackson.core.JsonToken; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.springframework.beans.factory.annotation.Value; -import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; -import org.springframework.stereotype.Service; - -import java.io.IOException; -import java.io.InputStream; -import java.net.URI; -import java.net.http.HttpClient; -import java.net.http.HttpRequest; -import java.net.http.HttpResponse; -import java.util.UUID; - -@Service -@ConditionalOnProperty(name = "speech.provider", havingValue = "openai") -public class OpenAiSpeechToTextService implements SpeechToTextService { - - private static final Logger LOGGER = LoggerFactory.getLogger(OpenAiSpeechToTextService.class); - private static final JsonFactory JSON_FACTORY = new JsonFactory(); - - private final HttpClient httpClient; - private final String apiKey; - private final String model; - private final String baseUrl; - - public OpenAiSpeechToTextService( - @Value("${spring.ai.openai.api-key}") String apiKey, - @Value("${speech.openai.model:whisper-1}") String model, - @Value("${speech.openai.base-url:https://api.openai.com/v1}") String baseUrl) { - this.apiKey = apiKey; - this.model = model; - this.baseUrl = baseUrl; - this.httpClient = HttpClient.newHttpClient(); - } - - @Override - public String transcribe(InputStream audioStream) { - LOGGER.info("Transcribing audio via OpenAI Whisper (model: {})", model); - - 
try { - byte[] audioBytes = audioStream.readAllBytes(); - String boundary = UUID.randomUUID().toString(); - byte[] body = buildMultipartBody(boundary, audioBytes); - - HttpRequest request = HttpRequest.newBuilder() - .uri(URI.create(baseUrl + "/audio/transcriptions")) - .header("Authorization", "Bearer " + apiKey) - .header("Content-Type", "multipart/form-data; boundary=" + boundary) - .POST(HttpRequest.BodyPublishers.ofByteArray(body)) - .build(); - - HttpResponse response = httpClient.send(request, HttpResponse.BodyHandlers.ofString()); - - if (response.statusCode() != 200) { - throw new SpeechToTextException("OpenAI API returned status " + response.statusCode() + ": " + response.body()); - } - - String text = extractTextField(response.body()); - if (text == null || text.isBlank()) { - throw new SpeechToTextException("OpenAI returned empty transcription"); - } - - LOGGER.info("OpenAI transcription completed successfully"); - return text.trim(); - - } catch (IOException | InterruptedException e) { - if (e instanceof InterruptedException) { - Thread.currentThread().interrupt(); - } - throw new SpeechToTextException("Failed to call OpenAI transcription API", e); - } - } - - private byte[] buildMultipartBody(String boundary, byte[] audioBytes) { - String crlf = "\r\n"; - - byte[] header = ("--" + boundary + crlf - + "Content-Disposition: form-data; name=\"file\"; filename=\"voice.ogg\"" + crlf - + "Content-Type: audio/ogg" + crlf + crlf).getBytes(); - - byte[] footer = (crlf + "--" + boundary + crlf - + "Content-Disposition: form-data; name=\"model\"" + crlf + crlf - + model + crlf - + "--" + boundary + "--" + crlf).getBytes(); - - byte[] body = new byte[header.length + audioBytes.length + footer.length]; - System.arraycopy(header, 0, body, 0, header.length); - System.arraycopy(audioBytes, 0, body, header.length, audioBytes.length); - System.arraycopy(footer, 0, body, header.length + audioBytes.length, footer.length); - - return body; - } - - private String 
extractTextField(String json) throws IOException { - try (JsonParser parser = JSON_FACTORY.createParser(json)) { - while (parser.nextToken() != null) { - if (parser.currentToken() == JsonToken.FIELD_NAME && "text".equals(parser.currentName())) { - parser.nextToken(); - return parser.getValueAsString(); - } - } - } - return null; - } -} diff --git a/base/src/main/java/ai/javaclaw/speech/MockSpeechToTextService.java b/base/src/test/java/ai/javaclaw/speech/MockSpeechToTextService.java similarity index 51% rename from base/src/main/java/ai/javaclaw/speech/MockSpeechToTextService.java rename to base/src/test/java/ai/javaclaw/speech/MockSpeechToTextService.java index 8d941062..f4dfcd02 100644 --- a/base/src/main/java/ai/javaclaw/speech/MockSpeechToTextService.java +++ b/base/src/test/java/ai/javaclaw/speech/MockSpeechToTextService.java @@ -1,12 +1,7 @@ package ai.javaclaw.speech; -import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; -import org.springframework.stereotype.Service; - import java.io.InputStream; -@Service -@ConditionalOnProperty(name = "speech.provider", havingValue = "mock", matchIfMissing = true) public class MockSpeechToTextService implements SpeechToTextService { @Override diff --git a/providers/openai/src/main/java/ai/javaclaw/providers/openai/OpenAiSpeechToTextService.java b/providers/openai/src/main/java/ai/javaclaw/providers/openai/OpenAiSpeechToTextService.java new file mode 100644 index 00000000..2d354b8d --- /dev/null +++ b/providers/openai/src/main/java/ai/javaclaw/providers/openai/OpenAiSpeechToTextService.java @@ -0,0 +1,44 @@ +package ai.javaclaw.providers.openai; + +import ai.javaclaw.speech.SpeechToTextException; +import ai.javaclaw.speech.SpeechToTextService; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.ai.audio.transcription.AudioTranscriptionPrompt; +import org.springframework.ai.audio.transcription.AudioTranscriptionResponse; +import 
org.springframework.ai.openai.OpenAiAudioTranscriptionModel; +import org.springframework.boot.autoconfigure.condition.ConditionalOnBean; +import org.springframework.core.io.InputStreamResource; +import org.springframework.stereotype.Service; + +import java.io.InputStream; + +@Service +@ConditionalOnBean(OpenAiAudioTranscriptionModel.class) +public class OpenAiSpeechToTextService implements SpeechToTextService { + + private static final Logger LOGGER = LoggerFactory.getLogger(OpenAiSpeechToTextService.class); + + private final OpenAiAudioTranscriptionModel transcriptionModel; + + public OpenAiSpeechToTextService(OpenAiAudioTranscriptionModel transcriptionModel) { + this.transcriptionModel = transcriptionModel; + } + + @Override + public String transcribe(InputStream audioStream) { + LOGGER.info("Transcribing audio via Spring AI OpenAI transcription"); + + AudioTranscriptionResponse response = transcriptionModel.call( + new AudioTranscriptionPrompt(new InputStreamResource(audioStream)) + ); + + String text = response.getResult().getOutput(); + if (text == null || text.isBlank()) { + throw new SpeechToTextException("OpenAI returned empty transcription"); + } + + LOGGER.info("OpenAI transcription completed successfully"); + return text.trim(); + } +} From 6b3d09ba234169b91ec8c2ed079799cbc7039fc1 Mon Sep 17 00:00:00 2001 From: SoufianeBouaddis Date: Mon, 27 Apr 2026 20:19:40 +0100 Subject: [PATCH 3/3] Clean .gitignore --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 82f6c083..f6fec4ab 100644 --- a/.gitignore +++ b/.gitignore @@ -44,4 +44,3 @@ workspace/ !workspace/AGENT.md !workspace/skills/skill-creator *.private* -workspace/conversations/* \ No newline at end of file