diff --git a/.changeset/gemini-tts-31-flash.md b/.changeset/gemini-tts-31-flash.md new file mode 100644 index 000000000..a62b4714f --- /dev/null +++ b/.changeset/gemini-tts-31-flash.md @@ -0,0 +1,5 @@ +--- +'@livekit/agents-plugin-google': patch +--- + +Update Gemini TTS to default to Gemini 3.1 Flash TTS preview and stream generated audio chunks. diff --git a/examples/src/google_gemini_tts.ts b/examples/src/google_gemini_tts.ts new file mode 100644 index 000000000..889110b78 --- /dev/null +++ b/examples/src/google_gemini_tts.ts @@ -0,0 +1,42 @@ +// SPDX-FileCopyrightText: 2026 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { type JobContext, ServerOptions, cli, defineAgent, voice } from '@livekit/agents'; +import * as deepgram from '@livekit/agents-plugin-deepgram'; +import * as google from '@livekit/agents-plugin-google'; +import { BackgroundVoiceCancellation } from '@livekit/noise-cancellation-node'; +import { fileURLToPath } from 'node:url'; + +class GeminiTTSAgent extends voice.Agent { + async onEnter() { + this.session.generateReply({ instructions: 'greet the user and introduce yourself' }); + } +} + +export default defineAgent({ + entry: async (ctx: JobContext) => { + const agent = new GeminiTTSAgent({ + instructions: 'Your name is Kelly. Respond briefly and concisely using voice conversation.', + }); + + const session = new voice.AgentSession({ + stt: new deepgram.STT(), + llm: new google.LLM({ model: 'gemini-2.5-flash' }), + tts: new google.beta.TTS({ + apiKey: process.env.GOOGLE_API_KEY, + voiceName: 'Kore', + model: 'gemini-3.1-flash-tts-preview', + }), + }); + + await session.start({ + agent, + room: ctx.room, + inputOptions: { + noiseCancellation: BackgroundVoiceCancellation(), + }, + }); + }, +}); + +cli.runApp(new ServerOptions({ agent: fileURLToPath(import.meta.url) })); diff --git a/plugins/google/src/beta/gemini_tts.test.ts b/plugins/google/src/beta/gemini_tts.test.ts index 01401897e..52354bd78 100644 --- a/plugins/google/src/beta/gemini_tts.test.ts +++ b/plugins/google/src/beta/gemini_tts.test.ts @@ -3,9 +3,65 @@ // SPDX-License-Identifier: Apache-2.0 import { STT } from '@livekit/agents-plugin-openai'; import { tts } from '@livekit/agents-plugins-test'; -import { describe } from 'vitest'; +import { describe, expect, it, vi } from 'vitest'; import { TTS } from './gemini_tts.js'; -describe.skip('Google Gemini TTS', async () => { - await tts(new TTS(), new STT()); +const { generateContentStream } = vi.hoisted(() => ({ + generateContentStream: vi.fn(), +})); + +vi.mock('@google/genai', () => ({ + GoogleGenAI: vi.fn(function GoogleGenAI() { + return { + models: { + generateContentStream, + }, + }; + }), +})); + +describe('Google Gemini TTS integration', () => { + it.skip('synthesizes with live providers', async () => { + await tts(new TTS(), new STT()); + }); }); + +describe('Google Gemini TTS', () => { + it('synthesizes audio from a streamed Gemini response', async () => { + const audioChunk = Buffer.alloc(4800); + + generateContentStream.mockImplementation(async function* () { + yield buildResponseChunk(audioChunk); + yield buildResponseChunk(audioChunk); + }); + + const stream = new TTS({ apiKey: 'test-api-key' }).synthesize('Hello world'); + let audioCount = 0; + + for await (const _frame of stream) { + audioCount += 1; + } + + expect(generateContentStream).toHaveBeenCalledOnce(); + expect(audioCount).toBeGreaterThan(0); + }); +}); + +function buildResponseChunk(data: Buffer) { + return { + candidates: [ + { + content: { + parts: [ + { + inlineData: { + data: data.toString('base64'), + mimeType: 'audio/pcm', + }, + }, + ], + }, + }, + ], + }; +} diff --git a/plugins/google/src/beta/gemini_tts.ts b/plugins/google/src/beta/gemini_tts.ts index 44a06e02b..0d778f0c0 100644 --- a/plugins/google/src/beta/gemini_tts.ts +++ b/plugins/google/src/beta/gemini_tts.ts @@ -49,7 +49,7 @@ export type GeminiVoices = | 'Sadaltager' | 'Sulafat'; -const DEFAULT_MODEL: GeminiTTSModels = 'gemini-2.5-flash-lite-preview-tts'; +const DEFAULT_MODEL: GeminiTTSModels = 'gemini-3.1-flash-tts-preview'; const DEFAULT_VOICE: GeminiVoices = 'Kore'; const DEFAULT_SAMPLE_RATE = 24000; // not configurable const NUM_CHANNELS = 1; @@ -234,6 +234,19 @@ export class ChunkedStream extends tts.ChunkedStream { ]; try { + let lastFrame: AudioFrame | undefined; + const sendLastFrame = (final: boolean) => { + if (lastFrame) { + this.queue.put({ + requestId, + frame: lastFrame, + segmentId: requestId, + final, + }); + lastFrame = undefined; + } + }; + const responseStream = await this.#tts.client.models.generateContentStream({ model: this.#tts.opts.model, contents, @@ -241,8 +254,18 @@ export class ChunkedStream extends tts.ChunkedStream { }); for await (const response of responseStream) { - await this.#processResponse(response, bstream, requestId); + await this.#processResponse(response, bstream, (frame) => { + sendLastFrame(false); + lastFrame = frame; + }); } + + for (const frame of bstream.flush()) { + sendLastFrame(false); + lastFrame = frame; + } + + sendLastFrame(true); } catch (error: unknown) { if (error instanceof Error && error.name === 'AbortError') { return; @@ -298,7 +321,7 @@ export class ChunkedStream extends tts.ChunkedStream { async #processResponse( response: types.GenerateContentResponse, bstream: AudioByteStream, - requestId: string, + onFrame: (frame: AudioFrame) => void, ) { if (!response.candidates || response.candidates.length === 0) { return; @@ -309,36 +332,15 @@ export class ChunkedStream extends tts.ChunkedStream { return; } - let lastFrame: AudioFrame | undefined; - const sendLastFrame = (final: boolean) => { - if (lastFrame) { - this.queue.put({ - requestId, - frame: lastFrame, - segmentId: requestId, - final, - }); - lastFrame = undefined; - } - }; - for (const part of candidate.content.parts) { if (part.inlineData?.data && part.inlineData.mimeType?.startsWith('audio/')) { const audioBuffer = Buffer.from(part.inlineData.data, 'base64'); for (const frame of bstream.write(audioBuffer)) { - sendLastFrame(false); - lastFrame = frame; + onFrame(frame); } } } - - for (const frame of bstream.flush()) { - sendLastFrame(false); - lastFrame = frame; - } - - sendLastFrame(true); } }