Skip to content

Commit 1233121

Browse files
authored
feat(speech): add prompt voice input (NeuralNomadsAI#249)
## Summary
- Add server-backed speech capabilities and transcription endpoints, plus UI settings for speech configuration.
- Add push-to-talk prompt voice input with microphone controls, transcription insertion, and browser capability gating.
- Keep prompt controls aligned by restoring right-side nav placement and moving the mic beside the expand control.
1 parent a950d47 commit 1233121

40 files changed

Lines changed: 1545 additions & 27 deletions

package-lock.json

Lines changed: 23 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/opencode-config/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,6 @@
44
"private": true,
55
"license": "MIT",
66
"dependencies": {
7-
"@opencode-ai/plugin": "1.2.24"
7+
"@opencode-ai/plugin": "1.2.14"
88
}
99
}

packages/server/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
"fastify": "^4.28.1",
3333
"fuzzysort": "^2.0.4",
3434
"node-forge": "^1.3.3",
35+
"openai": "^6.27.0",
3536
"pino": "^9.4.0",
3637
"undici": "^6.19.8",
3738
"yaml": "^2.4.2",

packages/server/src/api-types.ts

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,36 @@ export interface BinaryValidationResult {
207207
error?: string
208208
}
209209

210+
/**
 * One timed piece of transcribed text; `SpeechTranscriptionResponse.segments`
 * is an array of these.
 */
export interface SpeechSegment {
211+
/** Where the segment starts (the `Ms` suffix suggests milliseconds — confirm against the speech service). */
startMs: number
212+
/** Where the segment ends, in the same unit as `startMs`. */
endMs: number
213+
/** Text attributed to this segment. */
text: string
214+
}
215+
216+
/**
 * Describes the server's speech setup.
 * NOTE(review): presumably the payload of `GET /api/speech/capabilities` —
 * confirm against `SpeechService.getCapabilities()`, which is not shown here.
 */
export interface SpeechCapabilitiesResponse {
217+
available: boolean
218+
configured: boolean
219+
/** Provider identifier (free-form string; the set of valid values is not visible in this diff). */
provider: string
220+
/** Speech-to-text support flag. */
supportsStt: boolean
221+
/** Text-to-speech support flag. */
supportsTts: boolean
222+
/** Optional; omitted when no base URL applies. */
baseUrl?: string
223+
sttModel: string
224+
ttsModel: string
225+
ttsVoice: string
226+
}
227+
228+
/**
 * Result of a transcription request. Only `text` is required; the remaining
 * fields are optional.
 */
export interface SpeechTranscriptionResponse {
229+
/** The transcribed text. */
text: string
230+
language?: string
231+
/** Duration value (the `Ms` suffix suggests milliseconds — confirm against the speech service). */
durationMs?: number
232+
/** Optional per-segment breakdown of `text`. */
segments?: SpeechSegment[]
233+
}
234+
235+
/** Result of a synthesis request: audio carried as a string plus its MIME type. */
export interface SpeechSynthesisResponse {
236+
/** Audio payload (base64-encoded, per the field name). */
audioBase64: string
237+
/** MIME type describing `audioBase64`. */
mimeType: string
238+
}
239+
210240
export type WorkspaceEventType =
211241
| "workspace.created"
212242
| "workspace.started"

packages/server/src/index.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import { AuthManager, BOOTSTRAP_TOKEN_STDOUT_PREFIX, DEFAULT_AUTH_USERNAME } fro
2323
import { resolveHttpsOptions } from "./server/tls"
2424
import { resolveNetworkAddresses } from "./server/network-addresses"
2525
import { startDevReleaseMonitor } from "./releases/dev-release-monitor"
26+
import { SpeechService } from "./speech/service"
2627

2728
const require = createRequire(import.meta.url)
2829

@@ -304,6 +305,7 @@ async function main() {
304305
})
305306
const fileSystemBrowser = new FileSystemBrowser({ rootDir: options.rootDir, unrestricted: options.unrestrictedRoot })
306307
const instanceStore = new InstanceStore(configLocation.instancesDir)
308+
const speechService = new SpeechService(settings, logger.child({ component: "speech" }))
307309
const instanceEventBridge = new InstanceEventBridge({
308310
workspaceManager,
309311
eventBus,
@@ -388,6 +390,7 @@ async function main() {
388390
eventBus,
389391
serverMeta,
390392
instanceStore,
393+
speechService,
391394
authManager,
392395
uiStaticDir: uiResolution.uiStaticDir ?? DEFAULT_UI_STATIC_DIR,
393396
uiDevServerUrl: uiResolution.uiDevServerUrl,
@@ -408,6 +411,7 @@ async function main() {
408411
eventBus,
409412
serverMeta,
410413
instanceStore,
414+
speechService,
411415
authManager,
412416
uiStaticDir: uiResolution.uiStaticDir ?? DEFAULT_UI_STATIC_DIR,
413417
uiDevServerUrl: undefined,

packages/server/src/server/http-server.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,14 @@ import { registerStorageRoutes } from "./routes/storage"
2121
import { registerPluginRoutes } from "./routes/plugin"
2222
import { registerBackgroundProcessRoutes } from "./routes/background-processes"
2323
import { registerWorktreeRoutes } from "./routes/worktrees"
24+
import { registerSpeechRoutes } from "./routes/speech"
2425
import { ServerMeta } from "../api-types"
2526
import { InstanceStore } from "../storage/instance-store"
2627
import { BackgroundProcessManager } from "../background-processes/manager"
2728
import type { AuthManager } from "../auth/manager"
2829
import { registerAuthRoutes } from "./routes/auth"
2930
import { sendUnauthorized, wantsHtml } from "../auth/http-auth"
31+
import type { SpeechService } from "../speech/service"
3032

3133
interface HttpServerDeps {
3234
bindHost: string
@@ -41,6 +43,7 @@ interface HttpServerDeps {
4143
eventBus: EventBus
4244
serverMeta: ServerMeta
4345
instanceStore: InstanceStore
46+
speechService: SpeechService
4447
authManager: AuthManager
4548
uiStaticDir: string
4649
uiDevServerUrl?: string
@@ -252,6 +255,7 @@ export function createHttpServer(deps: HttpServerDeps) {
252255
eventBus: deps.eventBus,
253256
workspaceManager: deps.workspaceManager,
254257
})
258+
registerSpeechRoutes(app, { speechService: deps.speechService })
255259
registerPluginRoutes(app, { workspaceManager: deps.workspaceManager, eventBus: deps.eventBus, logger: proxyLogger })
256260
registerBackgroundProcessRoutes(app, { backgroundProcessManager })
257261
registerInstanceProxyRoutes(app, { workspaceManager: deps.workspaceManager, logger: proxyLogger })

packages/server/src/server/routes/settings.ts

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { z } from "zod"
33
import { probeBinaryVersion } from "../../workspaces/runtime"
44
import type { SettingsService } from "../../settings/service"
55
import type { Logger } from "../../logger"
6+
import { sanitizeConfigDoc, sanitizeConfigOwner } from "../../settings/public-config"
67

78
interface RouteDeps {
89
settings: SettingsService
@@ -20,23 +21,26 @@ function validateBinaryPath(binaryPath: string): { valid: boolean; version?: str
2021

2122
export function registerSettingsRoutes(app: FastifyInstance, deps: RouteDeps) {
2223
// Full-document access
23-
app.get("/api/storage/config", async () => deps.settings.getDoc("config"))
24+
app.get("/api/storage/config", async () => sanitizeConfigDoc(deps.settings.getDoc("config")))
2425
app.patch("/api/storage/config", async (request, reply) => {
2526
try {
26-
return deps.settings.mergePatchDoc("config", request.body ?? {})
27+
return sanitizeConfigDoc(deps.settings.mergePatchDoc("config", request.body ?? {}))
2728
} catch (error) {
2829
reply.code(400)
2930
return { error: error instanceof Error ? error.message : "Invalid patch" }
3031
}
3132
})
3233

3334
app.get<{ Params: { owner: string } }>("/api/storage/config/:owner", async (request) => {
34-
return deps.settings.getOwner("config", request.params.owner)
35+
return sanitizeConfigOwner(request.params.owner, deps.settings.getOwner("config", request.params.owner))
3536
})
3637

3738
app.patch<{ Params: { owner: string } }>("/api/storage/config/:owner", async (request, reply) => {
3839
try {
39-
return deps.settings.mergePatchOwner("config", request.params.owner, request.body ?? {})
40+
return sanitizeConfigOwner(
41+
request.params.owner,
42+
deps.settings.mergePatchOwner("config", request.params.owner, request.body ?? {}),
43+
)
4044
} catch (error) {
4145
reply.code(400)
4246
return { error: error instanceof Error ? error.message : "Invalid patch" }
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import type { FastifyInstance } from "fastify"
2+
import { z } from "zod"
3+
import type { SpeechService } from "../../speech/service"
4+
5+
/** Dependencies handed to `registerSpeechRoutes`. */
interface RouteDeps {
6+
/** Service the speech routes call for capabilities, transcription and synthesis. */
speechService: SpeechService
7+
}
8+
9+
/**
 * Validation schema for the `POST /api/speech/transcribe` request body.
 * `audioBase64` and `mimeType` must be non-empty strings; `filename`,
 * `language` and `prompt` are optional strings.
 */
const TranscribeBodySchema = z.object({
10+
audioBase64: z.string().min(1, "Audio payload is required"),
11+
mimeType: z.string().min(1, "Audio MIME type is required"),
12+
filename: z.string().optional(),
13+
language: z.string().optional(),
14+
prompt: z.string().optional(),
15+
})
16+
17+
/**
 * Validation schema for the `POST /api/speech/synthesize` request body.
 * `text` is trimmed before the non-empty check, so whitespace-only input is
 * rejected; `format` is optional and limited to the listed audio formats.
 */
const SynthesizeBodySchema = z.object({
18+
text: z.string().trim().min(1, "Text is required"),
19+
format: z.enum(["mp3", "wav", "opus"]).optional(),
20+
})
21+
22+
/**
 * Picks the HTTP status code for a failure raised while handling a speech request.
 *
 * - 400 when the error is a `z.ZodError` (request body failed validation).
 * - 503 when the error is an `Error` whose message contains "not configured"
 *   (case-insensitive).
 * - 502 for everything else, including thrown values that are not `Error`s.
 *
 * @param error - The value caught by the route handler.
 * @returns The status code to send with the error response.
 */
function getSpeechErrorStatus(error: unknown): number {
  const isValidationFailure = error instanceof z.ZodError
  if (isValidationFailure) return 400

  const isUnconfigured = error instanceof Error && /not configured/i.test(error.message)
  return isUnconfigured ? 503 : 502
}
31+
32+
/**
 * Builds the client-facing message for a failed speech request.
 *
 * For a `z.ZodError`, the individual issue messages are joined (each prefixed
 * with its field path when there is one) instead of returning
 * `error.message`, which for Zod errors is the JSON-serialized issue list.
 * For any other `Error`, its message is returned unless it is blank.
 *
 * @param error - The value caught by the route handler.
 * @param fallback - Message used when `error` carries no usable text.
 * @returns A non-empty human-readable message whenever `fallback` is non-empty.
 */
function getSpeechErrorMessage(error: unknown, fallback: string): string {
  if (error instanceof z.ZodError) {
    const details = error.issues
      .filter((issue) => issue.message.length > 0)
      .map((issue) => {
        // String() keeps numeric (array index) path parts joinable.
        const field = issue.path.map(String).join(".")
        return field ? `${field}: ${issue.message}` : issue.message
      })
    return details.length > 0 ? details.join("; ") : fallback
  }

  if (error instanceof Error && error.message.trim() !== "") {
    return error.message
  }
  return fallback
}
35+
36+
/**
 * Registers the speech HTTP endpoints on the given Fastify instance.
 *
 * - `GET  /api/speech/capabilities` returns `speechService.getCapabilities()`.
 * - `POST /api/speech/transcribe` validates the body with
 *   `TranscribeBodySchema` and forwards it to `speechService.transcribe`.
 * - `POST /api/speech/synthesize` validates the body with
 *   `SynthesizeBodySchema` and forwards it to `speechService.synthesize`.
 *
 * Any failure in the POST handlers (validation or service) is logged on the
 * request logger and answered with `{ error }` using the status chosen by
 * `getSpeechErrorStatus`.
 *
 * @param app - Fastify instance to attach the routes to.
 * @param deps - Route dependencies; only `speechService` is used.
 */
export function registerSpeechRoutes(app: FastifyInstance, deps: RouteDeps) {
  app.get("/api/speech/capabilities", async () => {
    return deps.speechService.getCapabilities()
  })

  app.post("/api/speech/transcribe", async (request, reply) => {
    try {
      // A missing body is validated as an empty object so the schema reports the missing fields.
      const payload = TranscribeBodySchema.parse(request.body ?? {})
      const transcription = await deps.speechService.transcribe(payload)
      return transcription
    } catch (cause) {
      request.log.error({ err: cause }, "Failed to transcribe audio")
      const status = getSpeechErrorStatus(cause)
      reply.code(status)
      return { error: getSpeechErrorMessage(cause, "Failed to transcribe audio") }
    }
  })

  app.post("/api/speech/synthesize", async (request, reply) => {
    try {
      const payload = SynthesizeBodySchema.parse(request.body ?? {})
      const synthesis = await deps.speechService.synthesize(payload)
      return synthesis
    } catch (cause) {
      request.log.error({ err: cause }, "Failed to synthesize audio")
      const status = getSpeechErrorStatus(cause)
      reply.code(status)
      return { error: getSpeechErrorMessage(cause, "Failed to synthesize audio") }
    }
  })
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import type { SettingsDoc } from "./yaml-doc-store"
2+
3+
/**
 * Type guard for "object-shaped" values: anything whose `typeof` is
 * `"object"`, except `null` and arrays.
 *
 * @param value - Value to inspect.
 * @returns `true` when `value` can be treated as a string-keyed record.
 */
function isPlainObject(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") {
    return false
  }
  return !Array.isArray(value)
}
6+
7+
/**
 * Returns a copy of the `server` settings that is safe to expose to clients:
 * the speech API key is removed and replaced by a boolean `hasApiKey` flag.
 *
 * The `apiKey` property is always dropped from the copy, whatever its type.
 * Previously it was removed only when it was a non-blank string, so a
 * non-string value (for example an unquoted numeric key in YAML) was passed
 * through to clients.
 *
 * `hasApiKey` is set to `true` when a non-blank string key is stored. When no
 * such key is present, an existing `hasApiKey` value is kept, so re-sanitizing
 * an already sanitized document leaves it unchanged; otherwise it is set to
 * `false`.
 *
 * The input is not mutated. When `speech` is missing or not an object, a
 * shallow copy of `value` is returned as-is.
 *
 * @param value - The `server` owner document.
 * @returns A shallow copy with a sanitized `speech` section.
 */
function sanitizeServerOwner(value: SettingsDoc): SettingsDoc {
  const next: SettingsDoc = { ...value }
  if (!isPlainObject(next.speech)) {
    return next
  }

  // Rest-destructuring copies every speech setting except the secret itself.
  const { apiKey, ...speech } = next.speech
  const hasStoredKey = typeof apiKey === "string" && apiKey.trim() !== ""

  if (hasStoredKey) {
    speech.hasApiKey = true
  } else if (!("hasApiKey" in speech)) {
    speech.hasApiKey = false
  }

  next.speech = speech
  return next
}
26+
27+
/**
 * Sanitizes a single owner's config document before it leaves the server.
 *
 * Only the `"server"` owner is passed through `sanitizeServerOwner`; every
 * other owner's document is returned unchanged (same reference).
 *
 * @param owner - Owner key of the config section.
 * @param value - That owner's config document.
 * @returns The document to expose for this owner.
 */
export function sanitizeConfigOwner(owner: string, value: SettingsDoc): SettingsDoc {
  return owner === "server" ? sanitizeServerOwner(value) : value
}
33+
34+
/**
 * Sanitizes a whole config document (all owners) before it leaves the server.
 *
 * Returns a shallow copy of `value`. When its `server` entry is an object,
 * that entry is replaced by the result of `sanitizeServerOwner`; all other
 * entries are carried over by reference.
 *
 * @param value - The full config document keyed by owner.
 * @returns A shallow copy with the `server` section sanitized.
 */
export function sanitizeConfigDoc(value: SettingsDoc): SettingsDoc {
  const serverSection = value.server
  if (!isPlainObject(serverSection)) {
    return { ...value }
  }
  return { ...value, server: sanitizeServerOwner(serverSection) }
}

packages/server/src/settings/service.ts

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import type { ConfigLocation } from "../config/location"
44
import { YamlDocStore, type SettingsDoc } from "./yaml-doc-store"
55
import { migrateSettingsLayout } from "./migrate"
66
import type { WorkspaceEventPayload } from "../api-types"
7+
import { sanitizeConfigOwner } from "./public-config"
78

89
export type DocKind = "config" | "state"
910

@@ -45,10 +46,11 @@ export class SettingsService {
4546
private publish(kind: DocKind, owner: string, value?: SettingsDoc) {
4647
if (!this.eventBus) return
4748
const type = kind === "config" ? "storage.configChanged" : "storage.stateChanged"
49+
const nextValue = value ?? this.getOwner(kind, owner)
4850
const payload: WorkspaceEventPayload = {
4951
type,
5052
owner,
51-
value: value ?? this.getOwner(kind, owner),
53+
value: kind === "config" ? sanitizeConfigOwner(owner, nextValue) : nextValue,
5254
} as any
5355
this.eventBus.publish(payload)
5456
}

0 commit comments

Comments
 (0)