Skip to content

Commit 214dfe8

Browse files
zcoderr and claude committed
feat: add Gemini Live real-time voice chat via WebSocket relay
Add backend WebSocket relay that bridges iOS client to Gemini Live API for real-time voice conversation, replacing the turn-based STT→Chat→TTS voice flow with a single bidirectional audio stream. Backend: - POST /api/voice/session: creates session with JWT containing system instruction (Susan Miller voice persona + user astrology context) - WebSocket /api/voice/stream: authenticates via JWT, relays PCM audio between client and Gemini Live API - Voice-optimized prompt (no emoji, short conversational sentences) - MessageType.VoiceChat = 25 for tracking voice sessions Co-authored-by: Claude <noreply@anthropic.com>
1 parent 6fcbf5f commit 214dfe8

12 files changed

Lines changed: 1537 additions & 36 deletions

File tree

api/src/ai/voice-model.ts

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
import Config from '@blocklet/sdk/lib/config';
2+
3+
// ---------------------------------------------------------------------------
4+
// Gemini Live API configuration
5+
// Uses direct Google WebSocket API, not AIGNE Hub.
6+
// ---------------------------------------------------------------------------
7+
8+
// Default Gemini Live model, used when neither the blocklet preference nor the
// GEMINI_LIVE_MODEL env var is set.
const DEFAULT_LIVE_MODEL = 'gemini-3.1-flash-live-preview';
// Default prebuilt voice name for Gemini Live speech output.
const DEFAULT_VOICE = 'Aoede';
10+
11+
export function getGeminiApiKey(): string {
12+
const key =
13+
(Config.env.preferences?.gemini_api_key as string) ||
14+
process.env.GEMINI_API_KEY ||
15+
process.env.GOOGLE_API_KEY ||
16+
'';
17+
return key;
18+
}
19+
20+
export function getGeminiLiveModel(): string {
21+
return (Config.env.preferences?.gemini_live_model as string) || process.env.GEMINI_LIVE_MODEL || DEFAULT_LIVE_MODEL;
22+
}
23+
24+
export function getGeminiLiveVoice(): string {
25+
return (Config.env.preferences?.gemini_live_voice as string) || process.env.GEMINI_LIVE_VOICE || DEFAULT_VOICE;
26+
}
27+
28+
export function getGeminiLiveWsUrl(): string {
29+
const apiKey = getGeminiApiKey();
30+
return (
31+
'wss://generativelanguage.googleapis.com/ws/' +
32+
'google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent' +
33+
`?key=${apiKey}`
34+
);
35+
}
36+
37+
/** Maximum voice session duration in seconds (default 10 minutes) */
38+
export function getVoiceMaxDuration(): number {
39+
return Number(Config.env.preferences?.voice_max_duration || process.env.VOICE_MAX_DURATION || 600);
40+
}
41+
42+
/** Voice session token TTL in seconds (default 30 minutes) */
43+
export function getVoiceSessionTTL(): number {
44+
return Number(Config.env.preferences?.voice_session_ttl || process.env.VOICE_SESSION_TTL || 1800);
45+
}

api/src/index.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import express, { ErrorRequestHandler } from 'express';
66
import 'express-async-errors';
77
import Joi from 'joi';
88
import path from 'path';
9+
import { WebSocketServer } from 'ws';
910

1011
import { HttpError } from './libs/auth';
1112
import startBlogCron from './libs/cron-blog';
@@ -19,6 +20,7 @@ import aigneCompat from './routes/aigne-compat';
1920
// Mount daily routes at /daily/api to match the rewritten path
2021
import dailyRouter from './routes/daily';
2122
import routesV2 from './routes/v2';
23+
import { handleVoiceWebSocket } from './routes/voice/ws';
2224

2325
dotenv.config();
2426

@@ -79,6 +81,11 @@ export const server = app.listen(port, (err?: any) => {
7981
if (err) throw err;
8082
logger.info(`> ${name} v${version} ready on ${port}`);
8183

84+
// Mount WebSocket server for voice relay
85+
const wss = new WebSocketServer({ server, path: '/api/voice/stream' });
86+
wss.on('connection', handleVoiceWebSocket);
87+
logger.info('> Voice WebSocket server mounted on /api/voice/stream');
88+
8289
startBlogCron();
8390
startNotificationCron();
8491
startScribeCron();

api/src/routes/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,14 @@ import setting from './setting';
1515
import sns from './sns';
1616
import upload from './upload';
1717
import user from './user';
18+
import voice from './voice';
1819

1920
const router = Router();
2021

2122
router.use('/auth', auth);
2223
router.use('/user', user);
2324
router.use('/ai', ai);
25+
router.use('/voice', voice);
2426
router.use('/setting', setting);
2527
router.use('/upload', upload);
2628
router.use('/message', message);

api/src/routes/voice/index.ts

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import { user } from '@blocklet/sdk/lib/middlewares/user';
2+
import { Router } from 'express';
3+
import { sign } from 'jsonwebtoken';
4+
5+
import { getGeminiApiKey, getGeminiLiveVoice, getVoiceSessionTTL } from '../../ai/voice-model';
6+
import { UnauthorizedError } from '../../libs/auth';
7+
import { config } from '../../libs/env';
8+
import { getLanguage } from '../../libs/language';
9+
import logger from '../../libs/logger';
10+
import Message, { MessageRole, MessageType } from '../../store/models/message';
11+
import { buildVoiceSystemInstruction } from './system-prompt';
12+
13+
const router = Router();
14+
15+
// JWT secret for voice session tokens — reuse the VOICE_SESSION_SECRET env var
16+
// or fall back to a derived key from the Gemini API key
17+
function getSessionSecret(): string {
18+
return process.env.VOICE_SESSION_SECRET || `voice-session:${getGeminiApiKey()}`;
19+
}
20+
21+
// ---------------------------------------------------------------------------
22+
// POST /api/voice/session — Create a voice session
23+
// Returns a signed JWT containing the system instruction for the WebSocket relay.
24+
// ---------------------------------------------------------------------------
25+
26+
router.post('/session', user(), async (req, res) => {
27+
const { did: userId } = req.user ?? {};
28+
if (!userId) throw new UnauthorizedError();
29+
30+
const language = getLanguage(req);
31+
32+
// Check that Gemini API key is configured
33+
const apiKey = getGeminiApiKey();
34+
if (!apiKey) {
35+
res.status(503).json({ error: { message: 'Voice feature is not configured' } });
36+
return;
37+
}
38+
39+
// Rate limit check — voice sessions share the same daily quota as SessionChat
40+
const [remainingCount] = await Message.remainingQueryCount({ userId });
41+
if (typeof remainingCount === 'number' && remainingCount <= 0) {
42+
if (config.limitation.countMessageType.includes(MessageType.VoiceChat)) {
43+
res.status(429).json({ error: { message: 'Daily query limit reached' } });
44+
return;
45+
}
46+
}
47+
48+
// Build the voice system instruction with full user context
49+
const systemInstruction = await buildVoiceSystemInstruction({ userId, language });
50+
const voiceName = getGeminiLiveVoice();
51+
const ttl = getVoiceSessionTTL();
52+
53+
// Create a message record for this voice session
54+
await Message.bulkCreate([
55+
{ userId, type: MessageType.VoiceChat, role: MessageRole.user, content: '' },
56+
{ userId, type: MessageType.VoiceChat, role: MessageRole.ai, content: '' },
57+
]);
58+
59+
// Sign a session token
60+
const sessionToken = sign({ userId, systemInstruction, voiceName, language }, getSessionSecret(), { expiresIn: ttl });
61+
62+
logger.info('[voice] session created', { userId, language, voiceName, ttl });
63+
64+
res.json({
65+
sessionToken,
66+
expiresAt: Math.floor(Date.now() / 1000) + ttl,
67+
});
68+
});
69+
70+
export { getSessionSecret };
71+
export default router;

api/src/routes/voice/relay.ts

Lines changed: 207 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,207 @@
1+
import WebSocket from 'ws';
2+
3+
import { getGeminiLiveModel, getGeminiLiveWsUrl, getVoiceMaxDuration } from '../../ai/voice-model';
4+
import logger from '../../libs/logger';
5+
6+
/** Per-session context for one relay, decoded from the voice session JWT. */
interface VoiceSession {
  /** DID of the authenticated user who opened the session. */
  userId: string;
  /** System instruction text sent to Gemini in the setup message. */
  systemInstruction: string;
  /** Prebuilt Gemini voice name (e.g. 'Aoede') used for speech output. */
  voiceName: string;
  /** User language code carried in the session token. */
  language: string;
}
12+
13+
/**
14+
* GeminiLiveRelay — bridges a client WebSocket to the Gemini Live API.
15+
*
16+
* Protocol (client ↔ backend):
17+
* Client sends: { type: "audio", data: "<base64 PCM 16kHz>" }
18+
* { type: "end" }
19+
* Server sends: { type: "ready" }
20+
* { type: "audio", data: "<base64 PCM 24kHz>" }
21+
* { type: "turnComplete" }
22+
* { type: "error", message: "..." }
23+
* { type: "closed" }
24+
*/
25+
export class GeminiLiveRelay {
26+
private geminiWs: WebSocket | null = null;
27+
28+
private isSetupComplete = false;
29+
30+
private isClosed = false;
31+
32+
private maxDurationTimer: ReturnType<typeof setTimeout> | null = null;
33+
34+
constructor(
35+
private clientWs: WebSocket,
36+
private session: VoiceSession,
37+
) {}
38+
39+
/** Start the relay: connect to Gemini and wire up message handlers */
40+
start(): void {
41+
const wsUrl = getGeminiLiveWsUrl();
42+
if (!wsUrl || wsUrl.endsWith('key=')) {
43+
this.sendToClient({ type: 'error', message: 'Gemini API key not configured' });
44+
this.close();
45+
return;
46+
}
47+
48+
logger.info('[voice-relay] connecting to Gemini Live', { userId: this.session.userId });
49+
50+
this.geminiWs = new WebSocket(wsUrl);
51+
52+
this.geminiWs.on('open', () => {
53+
logger.info('[voice-relay] Gemini WS connected, sending setup');
54+
this.sendSetup();
55+
});
56+
57+
this.geminiWs.on('message', (data: WebSocket.Data) => {
58+
this.handleGeminiMessage(data);
59+
});
60+
61+
this.geminiWs.on('error', (error) => {
62+
logger.error('[voice-relay] Gemini WS error', error);
63+
this.sendToClient({ type: 'error', message: 'Connection to AI service failed' });
64+
this.close();
65+
});
66+
67+
this.geminiWs.on('close', (code, reason) => {
68+
logger.info('[voice-relay] Gemini WS closed', { code, reason: reason.toString() });
69+
if (!this.isClosed) {
70+
this.sendToClient({ type: 'closed' });
71+
this.close();
72+
}
73+
});
74+
75+
// Client message handler
76+
this.clientWs.on('message', (data: WebSocket.Data) => {
77+
this.handleClientMessage(data);
78+
});
79+
80+
this.clientWs.on('close', () => {
81+
logger.info('[voice-relay] client disconnected', { userId: this.session.userId });
82+
this.close();
83+
});
84+
85+
this.clientWs.on('error', (error) => {
86+
logger.error('[voice-relay] client WS error', error);
87+
this.close();
88+
});
89+
90+
// Max duration safety timer
91+
const maxDuration = getVoiceMaxDuration();
92+
this.maxDurationTimer = setTimeout(() => {
93+
logger.info('[voice-relay] max duration reached', { maxDuration, userId: this.session.userId });
94+
this.sendToClient({ type: 'closed' });
95+
this.close();
96+
}, maxDuration * 1000);
97+
}
98+
99+
/** Send the Gemini Live setup message with model config + system instruction */
100+
private sendSetup(): void {
101+
const model = getGeminiLiveModel();
102+
const setup = {
103+
setup: {
104+
model: `models/${model}`,
105+
generationConfig: {
106+
responseModalities: ['AUDIO'],
107+
speechConfig: {
108+
voiceConfig: {
109+
prebuiltVoiceConfig: { voiceName: this.session.voiceName },
110+
},
111+
},
112+
},
113+
systemInstruction: {
114+
parts: [{ text: this.session.systemInstruction }],
115+
},
116+
},
117+
};
118+
this.geminiWs?.send(JSON.stringify(setup));
119+
}
120+
121+
/** Handle messages from the client */
122+
private handleClientMessage(raw: WebSocket.Data): void {
123+
try {
124+
const msg = JSON.parse(raw.toString());
125+
126+
if (msg.type === 'audio' && this.isSetupComplete) {
127+
// Forward audio to Gemini in realtimeInput format
128+
this.geminiWs?.send(
129+
JSON.stringify({
130+
realtimeInput: {
131+
audio: {
132+
data: msg.data,
133+
mimeType: 'audio/pcm;rate=16000',
134+
},
135+
},
136+
}),
137+
);
138+
} else if (msg.type === 'end') {
139+
this.close();
140+
}
141+
} catch {
142+
// Ignore malformed messages
143+
}
144+
}
145+
146+
/** Handle messages from Gemini Live API */
147+
private handleGeminiMessage(raw: WebSocket.Data): void {
148+
try {
149+
const msg = JSON.parse(raw.toString());
150+
151+
// Setup complete
152+
if (msg.setupComplete !== undefined) {
153+
this.isSetupComplete = true;
154+
logger.info('[voice-relay] setup complete', { userId: this.session.userId });
155+
this.sendToClient({ type: 'ready' });
156+
return;
157+
}
158+
159+
const sc = msg.serverContent;
160+
if (!sc) return;
161+
162+
// Forward audio data
163+
if (sc.modelTurn?.parts) {
164+
for (const part of sc.modelTurn.parts) {
165+
const inline = part.inlineData;
166+
if (inline?.data) {
167+
this.sendToClient({ type: 'audio', data: inline.data });
168+
}
169+
}
170+
}
171+
172+
// Turn complete
173+
if (sc.turnComplete) {
174+
this.sendToClient({ type: 'turnComplete' });
175+
}
176+
} catch {
177+
// Ignore parse errors
178+
}
179+
}
180+
181+
/** Send a JSON message to the client WebSocket */
182+
private sendToClient(msg: object): void {
183+
if (this.clientWs.readyState === WebSocket.OPEN) {
184+
this.clientWs.send(JSON.stringify(msg));
185+
}
186+
}
187+
188+
/** Clean up both connections */
189+
close(): void {
190+
if (this.isClosed) return;
191+
this.isClosed = true;
192+
193+
if (this.maxDurationTimer) {
194+
clearTimeout(this.maxDurationTimer);
195+
this.maxDurationTimer = null;
196+
}
197+
198+
if (this.geminiWs && this.geminiWs.readyState !== WebSocket.CLOSED) {
199+
this.geminiWs.close();
200+
}
201+
this.geminiWs = null;
202+
203+
if (this.clientWs.readyState !== WebSocket.CLOSED) {
204+
this.clientWs.close();
205+
}
206+
}
207+
}

0 commit comments

Comments
 (0)