ServiceStack
diff --git a/llms/extensions/providers/mistral.py b/llms/extensions/providers/mistral.py
Lines changed: 49 additions & 41 deletions
diff --git a/llms/extensions/voice/README.md b/llms/extensions/voice/README.md
Lines changed: 126 additions & 0 deletions
diff --git a/llms/extensions/voice/__init__.py b/llms/extensions/voice/__init__.py
Lines changed: 118 additions & 0 deletions
@@ -49,8 +49,9 @@ class MistralTranscriptionGenerator(GeneratorBase):
         def __init__(self, **kwargs):
             super().__init__(**kwargs)
 
-        async def chat(self, chat, provider=None, context=None):
-            headers = self.get_headers(provider, chat)
+        async def transcribe(self, file_bytes, filename, model=None, headers=None, context=None):
+            model = model or "voxtral-mini-latest"
+            headers = headers or self.get_headers()
             # Remove Content-Type to allow aiohttp to set it for FormData
             if "Content-Type" in headers:
                 del headers["Content-Type"]
@@ -60,6 +61,35 @@ async def chat(self, chat, provider=None, context=None):
                 token = headers["Authorization"].replace("Bearer ", "")
                 headers["x-api-key"] = token
 
+            # Prepare FormData
+            data = aiohttp.FormData()
+            data.add_field("model", model)
+            data.add_field(
+                "file", file_bytes, filename=filename, content_type=mimetypes.guess_type(filename)[0] or "audio/mpeg"
+            )
+
+            ctx.log(f"POST {self.api_url} model={model} file={filename} ({len(file_bytes)} bytes)")
+
+            async with aiohttp.ClientSession() as session, session.post(
+                self.api_url, headers=headers, data=data
+            ) as response:
+                text = await response.text()
+                if response.status != 200:
+                    raise Exception(f"Mistral API Error {response.status}: {text}")
+
+                if context:
+                    context["providerResponse"] = text
+
+                try:
+                    result = json.loads(text)
+                except Exception:
+                    result = {"text": text}  # Fallback
+
+                return result
+
+        async def chat(self, chat, provider=None, context=None):
+            headers = self.get_headers(provider, chat)
+
             model = provider.provider_model(chat["model"]) or chat["model"] or "voxtral-mini-latest"
             # Replace internal alias with actual model name
             if model == "voxtral-mini-transcription":
@@ -110,50 +140,28 @@ async def chat(self, chat, provider=None, context=None):
             except Exception as e:
                 raise Exception(f"Failed to decode audio data: {e}") from e
 
-            # Prepare FormData
-            data = aiohttp.FormData()
-            data.add_field("model", model)
-            data.add_field(
-                "file", file_bytes, filename=filename, content_type=mimetypes.guess_type(filename)[0] or "audio/mpeg"
-            )
-
-            ctx.log(f"POST {self.api_url} model={model} file={filename} ({len(file_bytes)} bytes)")
+            result = await self.transcribe(file_bytes, filename, model=model, headers=headers, context=context)
+            transcription = result.get("text", "")
 
-            async with aiohttp.ClientSession() as session, session.post(
-                self.api_url, headers=headers, data=data
-            ) as response:
-                text = await response.text()
-                if response.status != 200:
-                    raise Exception(f"Mistral API Error {response.status}: {text}")
-
-                context["providerResponse"] = text
-
-                try:
-                    result = json.loads(text)
-                except Exception:
-                    result = {"text": text}  # Fallback
-
-                transcription = result.get("text", "")
-
-                ret = {
-                    "choices": [
-                        {
-                            "message": {
-                                "role": "assistant",
-                                "content": transcription,
-                            }
+            ret = {
+                "choices": [
+                    {
+                        "message": {
+                            "role": "assistant",
+                            "content": transcription,
                         }
-                    ],
-                    "created": result.get("created", int(time.time())),
-                }
+                    }
+                ],
+                "created": result.get("created", int(time.time())),
+            }
 
-                if "model" in result:
-                    ret["model"] = result["model"]
+            if "model" in result:
+                ret["model"] = result["model"]
 
-                if "usage" in result:
-                    ret["usage"] = result["usage"]
+            if "usage" in result:
+                ret["usage"] = result["usage"]
 
-                return ret
+            return ret
 
     class MistralProvider(OpenAiCompatible):
         sdk = "@ai-sdk/mistral"
 
@@ -0,0 +1,126 @@
+# Voice Input Extension
+
+Adds voice-to-text transcription to the chat interface via a microphone button or keyboard shortcut.
+
+## Configuration
+
+Set the `LLMS_VOICE` environment variable to configure which transcription modes are available and in what priority order:
+
+```bash
+export LLMS_VOICE="voxtype,transcribe,voxtral-mini-latest"
+```
+
+The extension tries each mode in order and uses the first one whose requirements are met. If none of the configured modes is available, the voice extension is disabled. The default order is `voxtype,transcribe,voxtral-mini-latest`.
+
+## Available Modes
+
+### voxtype
+
+Uses the [voxtype.io](https://voxtype.io) CLI tool for local transcription.
+
+**Requirements:**
+- `voxtype` must be installed and on your `$PATH`
+- `ffmpeg` must be installed for audio format conversion
+
+### transcribe
+
+Uses a custom `transcribe` executable for flexible local transcription. This lets you integrate any speech-to-text tool.
+
+**Requirements:**
+- A `transcribe` executable on your `$PATH` that takes the path to a WAV audio file (16 kHz, mono, 16-bit PCM) as its first argument and writes the transcript to stdout
+- `ffmpeg` must be installed for audio format conversion
+
+**Interface:**
+```bash
+transcribe recording.wav > transcript.txt
+```
+
+See [Creating a transcribe Script](#creating-a-transcribe-script) for implementation examples.
+
+### voxtral-mini-latest
+
+Uses [Mistral's Voxtral model](https://docs.mistral.ai/models/voxtral-mini-transcribe-26-02) for cloud-based transcription.
+
+**Requirements:**
+- Mistral provider must be enabled in your configuration
+- `MISTRAL_API_KEY` environment variable must be set
+
+**Pricing:** ~$0.003/minute
+
+## Usage
+
+### Microphone Button
+
+Click the microphone icon in the chat input area to start recording. Click again to stop and transcribe.
+
+### Keyboard Shortcut
+
+**Alt+D** toggles voice recording with two modes:
+
+- **Tap (< 500ms):** Toggle mode — starts recording, press again to stop
+- **Hold (≥ 500ms):** Push-to-talk — records while held, stops when released
+
+The transcribed text is appended to the current message input.
+
+---
+
+## Creating a transcribe Script
+
+### Using OpenAI Whisper
+
+Create a script using [uvx](https://github.com/astral-sh/uv) and [openai-whisper](https://github.com/openai/whisper):
+
+```bash
+#!/usr/bin/env bash
+uvx --from openai-whisper whisper "$1" --model base.en --output_format txt --output_dir /tmp >/dev/null 2>&1
+
+BASENAME=$(basename "${1%.*}")
+cat "/tmp/${BASENAME}.txt"
+rm -f "/tmp/${BASENAME}.txt"
+```
+
+### Using Whisper.cpp
+
+[whisper.cpp](https://github.com/ggml-org/whisper.cpp) provides a faster, dependency-free C++ implementation.
+
+**Setup:**
+
+```bash
+git clone https://github.com/ggml-org/whisper.cpp.git
+cd whisper.cpp
+
+# Download a model
+sh ./models/download-ggml-model.sh base.en
+
+# Build
+cmake -B build
+cmake --build build -j --config Release
+
+# Test
+./build/bin/whisper-cli -f samples/jfk.wav
+```
+
+**Create the transcribe script:**
+
+```bash
+#!/usr/bin/env bash
+SCRIPT_DIR="$(cd "$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")" && pwd)"
+MODEL="$SCRIPT_DIR/models/ggml-base.en.bin"
+CLI="$SCRIPT_DIR/build/bin/whisper-cli"
+TMPFILE=$(mktemp /tmp/whisper-XXXXXX)
+
+trap 'rm -f "$TMPFILE" "${TMPFILE}.txt"' EXIT
+
+"$CLI" -m "$MODEL" -otxt -f "$1" -of "$TMPFILE" >/dev/null 2>&1
+
+cat "${TMPFILE}.txt"
+```
+
+### Installation
+
+Make the script executable and add it to your `$PATH`:
+
+```bash
+chmod +x ./transcribe
+sudo ln -s $(pwd)/transcribe /usr/local/bin/transcribe
+```
@@ -0,0 +1,118 @@
+import os
+import re
+import shutil
+
+from aiohttp import web
+
+# Comma-separated voice modes in priority order. Read once at import time from the
+# LLMS_VOICE environment variable, falling back to the default order below.
+LLMS_VOICE = os.getenv("LLMS_VOICE", "voxtype,transcribe,voxtral-mini-latest")
+
+
+def install(ctx):
+    voice_options = LLMS_VOICE.split(",")
+    mode = None
+
+    for opt in voice_options:
+        if opt == "voxtype":
+            if not shutil.which("voxtype"):
+                ctx.dbg(f"Cannot use {opt} - voxtype not installed")
+            else:
+                mode = opt
+                break
+        if opt == "transcribe":
+            if not shutil.which("transcribe"):
+                ctx.dbg(f"Cannot use {opt} - transcribe not installed")
+            else:
+                mode = opt
+                break
+        elif opt.startswith("voxtral"):
+            mistral = ctx.config.get("providers", {}).get("mistral")
+            if not mistral or not mistral.get("enabled") or not os.getenv("MISTRAL_API_KEY"):
+                ctx.dbg(f"Cannot use {opt} - Mistral not enabled")
+            else:
+                mode = opt
+                break
+
+    if (mode == "transcribe" or mode == "voxtype") and not shutil.which("ffmpeg"):
+        ctx.dbg(f"Cannot use {mode} - ffmpeg not installed")
+        mode = None
+
+    if not mode:
+        ctx.disabled = True
+        return
+
+    ctx.log(f"Using {mode} for voice")
+
+    async def transcribe_audio(request):
+        """
+        Transcribe audio using Voxtral
+        POST /transcribe
+        """
+        # Get audio data from request
+        data = await request.post()
+        audio_file = data.get("file")
+
+        if not audio_file:
+            raise Exception("No audio file provided")
+
+        # Read audio data
+        audio_bytes = audio_file.file.read()
+
+        if mode == "voxtral-mini-latest":
+            mistral = ctx.get_provider("mistral")
+            result = await mistral.transcription.transcribe(audio_bytes, audio_file.filename)
+            result["mode"] = mode
+            return web.json_response(result)
+
+        # Save to temporary file for voxtype
+        import tempfile
+        from pathlib import Path
+
+        suffix = Path(audio_file.filename).suffix
+        with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_input:
+            temp_input.write(audio_bytes)
+            temp_input_path = temp_input.name
+
+        # Convert to 16kHz WAV using ffmpeg
+        temp_wav_path = temp_input_path + ".wav"
+
+        try:
+            ctx.run_command(
+                ["ffmpeg", "-i", temp_input_path, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", temp_wav_path, "-y"]
+            )
+
+            if mode == "transcribe":
+                result = ctx.run_command(["transcribe", temp_wav_path])
+
+                if result.returncode != 0:
+                    raise Exception(result.stderr)
+
+                text = result.stdout.decode("utf-8").strip()
+                return web.json_response({"text": text, "mode": mode})
+
+            # Run voxtype to transcribe
+            result = ctx.run_command(["voxtype", "transcribe", temp_wav_path])
+
+            ansi_escape = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
+
+            # Extract transcription - take the last non-empty line that isn't a log
+            output_lines = []
+            for line in result.stdout.decode("utf-8").strip().split("\n"):
+                clean_line = ansi_escape.sub("", line).strip()
+                if clean_line and not clean_line.startswith("[") and "INFO" not in clean_line:
+                    output_lines.append(clean_line)
+
+            transcription = output_lines[-1] if output_lines else ""
+
+        finally:
+            # Clean up
+            if os.path.exists(temp_input_path):
+                os.remove(temp_input_path)
+            if os.path.exists(temp_wav_path):
+                os.remove(temp_wav_path)
+
+        return web.json_response({"text": transcription, "mode": mode})
+
+    ctx.add_post("/transcribe", transcribe_audio)
+
+
+# Alias install under the name __install__ (presumably the entry point the extension loader looks up - confirm).
+__install__ = install