Skip to content

Commit eaa4ea9

Browse files
committed
Add support for Voice Input
1 parent b366008 commit eaa4ea9

8 files changed

Lines changed: 534 additions & 120 deletions

File tree

llms/extensions/providers/mistral.py

Lines changed: 49 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,9 @@ class MistralTranscriptionGenerator(GeneratorBase):
4949
def __init__(self, **kwargs):
5050
super().__init__(**kwargs)
5151

52-
async def chat(self, chat, provider=None, context=None):
53-
headers = self.get_headers(provider, chat)
52+
async def transcribe(self, file_bytes, filename, model=None, headers=None, context=None):
53+
model = model or "voxtral-mini-latest"
54+
headers = headers or self.get_headers()
5455
# Remove Content-Type to allow aiohttp to set it for FormData
5556
if "Content-Type" in headers:
5657
del headers["Content-Type"]
@@ -60,6 +61,35 @@ async def chat(self, chat, provider=None, context=None):
6061
token = headers["Authorization"].replace("Bearer ", "")
6162
headers["x-api-key"] = token
6263

64+
# Prepare FormData
65+
data = aiohttp.FormData()
66+
data.add_field("model", model)
67+
data.add_field(
68+
"file", file_bytes, filename=filename, content_type=mimetypes.guess_type(filename)[0] or "audio/mpeg"
69+
)
70+
71+
ctx.log(f"POST {self.api_url} model={model} file={filename} ({len(file_bytes)} bytes)")
72+
73+
async with aiohttp.ClientSession() as session, session.post(
74+
self.api_url, headers=headers, data=data
75+
) as response:
76+
text = await response.text()
77+
if response.status != 200:
78+
raise Exception(f"Mistral API Error {response.status}: {text}")
79+
80+
if context:
81+
context["providerResponse"] = text
82+
83+
try:
84+
result = json.loads(text)
85+
except Exception:
86+
result = {"text": text} # Fallback
87+
88+
return result
89+
90+
async def chat(self, chat, provider=None, context=None):
91+
headers = self.get_headers(provider, chat)
92+
6393
model = provider.provider_model(chat["model"]) or chat["model"] or "voxtral-mini-latest"
6494
# Replace internal alias with actual model name
6595
if model == "voxtral-mini-transcription":
@@ -110,50 +140,28 @@ async def chat(self, chat, provider=None, context=None):
110140
except Exception as e:
111141
raise Exception(f"Failed to decode audio data: {e}") from e
112142

113-
# Prepare FormData
114-
data = aiohttp.FormData()
115-
data.add_field("model", model)
116-
data.add_field(
117-
"file", file_bytes, filename=filename, content_type=mimetypes.guess_type(filename)[0] or "audio/mpeg"
118-
)
119-
120-
ctx.log(f"POST {self.api_url} model={model} file={filename} ({len(file_bytes)} bytes)")
143+
result = await self.transcribe(file_bytes, filename, model=model, headers=headers, context=context)
144+
transcription = result.get("text", "")
121145

122-
async with aiohttp.ClientSession() as session, session.post(
123-
self.api_url, headers=headers, data=data
124-
) as response:
125-
text = await response.text()
126-
if response.status != 200:
127-
raise Exception(f"Mistral API Error {response.status}: {text}")
128-
129-
context["providerResponse"] = text
130-
131-
try:
132-
result = json.loads(text)
133-
except Exception:
134-
result = {"text": text} # Fallback
135-
136-
transcription = result.get("text", "")
137-
138-
ret = {
139-
"choices": [
140-
{
141-
"message": {
142-
"role": "assistant",
143-
"content": transcription,
144-
}
146+
ret = {
147+
"choices": [
148+
{
149+
"message": {
150+
"role": "assistant",
151+
"content": transcription,
145152
}
146-
],
147-
"created": result.get("created", int(time.time())),
148-
}
153+
}
154+
],
155+
"created": result.get("created", int(time.time())),
156+
}
149157

150-
if "model" in result:
151-
ret["model"] = result["model"]
158+
if "model" in result:
159+
ret["model"] = result["model"]
152160

153-
if "usage" in result:
154-
ret["usage"] = result["usage"]
161+
if "usage" in result:
162+
ret["usage"] = result["usage"]
155163

156-
return ret
164+
return ret
157165

158166
class MistralProvider(OpenAiCompatible):
159167
sdk = "@ai-sdk/mistral"

llms/extensions/voice/README.md

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
# Voice Input Extension
2+
3+
Adds voice-to-text transcription to the chat interface via a microphone button or keyboard shortcut.
4+
5+
## Configuration
6+
7+
Set the `LLMS_VOICE` environment variable to configure which transcription modes are available and in what priority order:
8+
9+
```bash
10+
export LLMS_VOICE="voxtype,transcribe,voxtral-mini-latest"
11+
```
12+
13+
The extension tries each mode in order and uses the first one that's available. The default order is `voxtype,transcribe,voxtral-mini-latest`.
14+
15+
## Available Modes
16+
17+
### voxtype
18+
19+
Uses the [voxtype.io](https://voxtype.io) CLI tool for local transcription.
20+
21+
**Requirements:**
22+
- `voxtype` must be installed and on your `$PATH`
23+
- `ffmpeg` must be installed for audio format conversion
24+
25+
### transcribe
26+
27+
Uses a custom `transcribe` executable for flexible local transcription. This lets you integrate any speech-to-text tool.
28+
29+
**Requirements:**
30+
- A `transcribe` executable on your `$PATH` that takes the path to a WAV audio file as its first argument and writes the transcribed text to stdout
31+
- `ffmpeg` must be installed for audio format conversion
32+
33+
**Interface:**
34+
```bash
35+
transcribe recording.wav > transcript.txt
36+
```
37+
38+
See [Creating a transcribe Script](#creating-a-transcribe-script) for implementation examples.
39+
40+
### voxtral-mini-latest
41+
42+
Uses [Mistral's Voxtral model](https://docs.mistral.ai/models/voxtral-mini-transcribe-26-02) for cloud-based transcription.
43+
44+
**Requirements:**
45+
- Mistral provider must be enabled in your configuration
46+
- `MISTRAL_API_KEY` environment variable must be set
47+
48+
**Pricing:** ~$0.003/minute
49+
50+
## Usage
51+
52+
### Microphone Button
53+
54+
Click the microphone icon in the chat input area to start recording. Click again to stop and transcribe.
55+
56+
### Keyboard Shortcut
57+
58+
**Alt+D** toggles voice recording with two modes:
59+
60+
- **Tap (< 500ms):** Toggle mode — starts recording, press again to stop
61+
- **Hold (≥ 500ms):** Push-to-talk — records while held, stops when released
62+
63+
The transcribed text is appended to the current message input.
64+
65+
---
66+
67+
## Creating a transcribe Script
68+
69+
### Using OpenAI Whisper
70+
71+
Create a script using [uvx](https://github.com/astral-sh/uv) and [openai-whisper](https://github.com/openai/whisper):
72+
73+
```bash
74+
#!/usr/bin/env bash
75+
uvx --from openai-whisper whisper "$1" --model base.en --output_format txt --output_dir /tmp >/dev/null 2>&1
76+
77+
BASENAME=$(basename "${1%.*}")
78+
cat "/tmp/${BASENAME}.txt"
79+
rm -f "/tmp/${BASENAME}.txt"
80+
```
81+
82+
### Using Whisper.cpp
83+
84+
[whisper.cpp](https://github.com/ggml-org/whisper.cpp) provides a faster, dependency-free C++ implementation.
85+
86+
**Setup:**
87+
88+
```bash
89+
git clone https://github.com/ggml-org/whisper.cpp.git
90+
cd whisper.cpp
91+
92+
# Download a model
93+
sh ./models/download-ggml-model.sh base.en
94+
95+
# Build
96+
cmake -B build
97+
cmake --build build -j --config Release
98+
99+
# Test
100+
./build/bin/whisper-cli -f samples/jfk.wav
101+
```
102+
103+
**Create the transcribe script:**
104+
105+
```bash
106+
#!/usr/bin/env bash
107+
SCRIPT_DIR="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"
108+
MODEL="$SCRIPT_DIR/models/ggml-base.en.bin"
109+
CLI="$SCRIPT_DIR/build/bin/whisper-cli"
110+
TMPFILE=$(mktemp /tmp/whisper-XXXXXX)
111+
112+
trap 'rm -f "$TMPFILE" "${TMPFILE}.txt"' EXIT
113+
114+
"$CLI" -m "$MODEL" -otxt -f "$1" -of "$TMPFILE" >/dev/null 2>&1
115+
116+
cat "${TMPFILE}.txt"
117+
```
118+
119+
### Installation
120+
121+
Make the script executable and add it to your `$PATH`:
122+
123+
```bash
124+
chmod +x ./transcribe
125+
sudo ln -s "$(pwd)/transcribe" /usr/local/bin/transcribe
126+
```

llms/extensions/voice/__init__.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
import os
2+
import re
3+
import shutil
4+
5+
from aiohttp import web
6+
7+
LLMS_VOICE = os.getenv("LLMS_VOICE", "voxtype,transcribe,voxtral-mini-latest")
8+
9+
10+
def install(ctx):
    """Pick a voice transcription backend and register ``POST /transcribe``.

    The comma-separated ``LLMS_VOICE`` setting is scanned in order and the
    first usable option becomes the active mode:

    * ``voxtype`` / ``transcribe`` - local CLI tools; each needs its own
      executable *and* ``ffmpeg`` on ``$PATH``.
    * ``voxtral*`` - Mistral cloud transcription; needs the ``mistral``
      provider enabled in ``ctx.config`` and ``MISTRAL_API_KEY`` set.

    If no option is usable the extension marks itself disabled
    (``ctx.disabled = True``) and registers nothing.
    """
    # Tolerate whitespace and empty entries, e.g. LLMS_VOICE="voxtype, transcribe,".
    voice_options = [opt.strip() for opt in LLMS_VOICE.split(",") if opt.strip()]
    has_ffmpeg = bool(shutil.which("ffmpeg"))
    mode = None

    for opt in voice_options:
        if opt in ("voxtype", "transcribe"):
            if not shutil.which(opt):
                ctx.dbg(f"Cannot use {opt} - {opt} not installed")
            elif not has_ffmpeg:
                # Checked inside the loop so that a missing ffmpeg falls
                # through to the next configured option instead of
                # disabling voice input altogether.
                ctx.dbg(f"Cannot use {opt} - ffmpeg not installed")
            else:
                mode = opt
                break
        elif opt.startswith("voxtral"):
            mistral = ctx.config.get("providers", {}).get("mistral")
            if not mistral or not mistral.get("enabled") or not os.getenv("MISTRAL_API_KEY"):
                ctx.dbg(f"Cannot use {opt} - Mistral not enabled")
            else:
                mode = opt
                break
        else:
            ctx.dbg(f"Ignoring unknown LLMS_VOICE option: {opt}")

    if not mode:
        ctx.disabled = True
        return

    ctx.log(f"Using {mode} for voice")

    # Compiled once at install time rather than on every request.
    ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")

    async def transcribe_audio(request):
        """
        Transcribe an uploaded audio file with the selected backend.
        POST /transcribe  (multipart form, audio in the "file" field)

        Responds with JSON containing at least "text" and "mode".
        """
        # Get audio data from request
        data = await request.post()
        audio_file = data.get("file")

        # A plain (non-upload) form field arrives as a str and has no .file.
        if not audio_file or not hasattr(audio_file, "file"):
            raise Exception("No audio file provided")

        # Read audio data
        audio_bytes = audio_file.file.read()

        # Any "voxtral*" option may have been selected above, so match on the
        # prefix and forward the configured model name to the provider.
        if mode.startswith("voxtral"):
            mistral = ctx.get_provider("mistral")
            result = await mistral.transcription.transcribe(audio_bytes, audio_file.filename, model=mode)
            result["mode"] = mode
            return web.json_response(result)

        # Local backends: save the upload to a temporary file, convert, transcribe.
        import tempfile
        from pathlib import Path

        suffix = Path(audio_file.filename).suffix
        temp_input_path = None
        temp_wav_path = None

        try:
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_input:
                # Record the path first so the file is cleaned up even if the write fails.
                temp_input_path = temp_input.name
                temp_input.write(audio_bytes)

            # Convert to 16kHz mono 16-bit PCM WAV using ffmpeg
            temp_wav_path = temp_input_path + ".wav"
            converted = ctx.run_command(
                ["ffmpeg", "-i", temp_input_path, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", temp_wav_path, "-y"]
            )
            # Without this check a failed conversion surfaces later as a
            # confusing "file not found" style error from the transcriber.
            if converted.returncode != 0:
                raise Exception(converted.stderr)

            if mode == "transcribe":
                result = ctx.run_command(["transcribe", temp_wav_path])

                if result.returncode != 0:
                    raise Exception(result.stderr)

                text = result.stdout.decode("utf-8").strip()
                return web.json_response({"text": text, "mode": mode})

            # Run voxtype to transcribe
            result = ctx.run_command(["voxtype", "transcribe", temp_wav_path])

            if result.returncode != 0:
                raise Exception(result.stderr)

            # Extract transcription - take the last non-empty line that isn't a log
            output_lines = []
            for line in result.stdout.decode("utf-8").strip().split("\n"):
                clean_line = ansi_escape.sub("", line).strip()
                if clean_line and not clean_line.startswith("[") and "INFO" not in clean_line:
                    output_lines.append(clean_line)

            transcription = output_lines[-1] if output_lines else ""

        finally:
            # Clean up both temporary files, whichever of them were created.
            for path in (temp_input_path, temp_wav_path):
                if path and os.path.exists(path):
                    os.remove(path)

        return web.json_response({"text": transcription, "mode": mode})

    ctx.add_post("/transcribe", transcribe_audio)
116+
117+
118+
__install__ = install

0 commit comments

Comments
 (0)