livekit · tamerrkanak · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
@@ -0,0 +1,79 @@
+import logging
+import os
+
+from dotenv import load_dotenv
+
+from livekit.agents import (
+    Agent,
+    AgentServer,
+    AgentSession,
+    JobContext,
+    JobProcess,
+    MetricsCollectedEvent,
+    cli,
+    inference,
+    metrics,
+    room_io,
+)
+from livekit.plugins import silero, voxcpm
+from livekit.plugins.turn_detector.multilingual import MultilingualModel
+
+logger = logging.getLogger("voxcpm-agent")
+
+load_dotenv()
+
+
+class VoxcpmAssistant(Agent):
+    def __init__(self) -> None:
+        super().__init__(
+            instructions=(
+                "You are a helpful voice assistant powered by VoxCPM2 on vLLM-Omni. "
+                "Keep responses concise and conversational."
+            ),
+        )
+
+    async def on_enter(self) -> None:
+        self.session.generate_reply(
+            instructions="Greet the user briefly and mention that you are using VoxCPM2 TTS."
+        )
+
+
+server = AgentServer()
+
+
+def prewarm(proc: JobProcess) -> None:
+    proc.userdata["vad"] = silero.VAD.load()
+
+
+server.setup_fnc = prewarm
+
+
+@server.rtc_session()
+async def entrypoint(ctx: JobContext) -> None:
+    ctx.log_context_fields = {"room": ctx.room.name}
+
+    session = AgentSession(
+        stt=inference.STT("deepgram/nova-3", language="multi"),
+        llm=inference.LLM("openai/gpt-4.1-mini"),
+        tts=voxcpm.TTS(
+            base_url=os.getenv("VLLM_OMNI_URL", "http://127.0.0.1:8800/v1"),
+            model=os.getenv("VLLM_OMNI_MODEL", "openbmb/VoxCPM2"),
+            voice=os.getenv("VOXCPM_VOICE", "default"),
+        ),
+        vad=ctx.proc.userdata["vad"],
+        turn_detection=MultilingualModel(),
+    )
+
+    @session.on("metrics_collected")
+    def _on_metrics_collected(ev: MetricsCollectedEvent) -> None:
+        metrics.log_metrics(ev.metrics)
+
+    await session.start(
+        agent=VoxcpmAssistant(),
+        room=ctx.room,
+        room_options=room_io.RoomOptions(),
+    )
+
+
+if __name__ == "__main__":
+    cli.run_app(server)
@@ -124,6 +124,7 @@ tavus = ["livekit-plugins-tavus>=1.6.0"]
 trugen = ["livekit-plugins-trugen>=1.6.0"]
 turn-detector = ["livekit-plugins-turn-detector>=1.6.0"]
 ultravox = ["livekit-plugins-ultravox>=1.6.0"]
+voxcpm = ["livekit-plugins-voxcpm>=1.6.0"]
 upliftai = ["livekit-plugins-upliftai>=1.6.0"]
 gradium = ["livekit-plugins-gradium>=1.6.0"]
 xai = ["livekit-plugins-xai>=1.6.0"]

@@ -0,0 +1,73 @@
+# livekit-plugins-voxcpm
+
+LiveKit Agents plugin for [VoxCPM2](https://huggingface.co/openbmb/VoxCPM2) served through [vLLM-Omni](https://github.com/vllm-project/vllm-omni).
+
+## Requirements
+
+- A running vLLM-Omni server with VoxCPM2 loaded, exposing the OpenAI-compatible Speech API.
+- Python >= 3.10
+
+Start a server:
+
+```bash
+vllm serve openbmb/VoxCPM2 --omni --host 0.0.0.0 --port 8800
+```
+
+## Install
+
+```bash
+pip install livekit-plugins-voxcpm
+```
+
+Or from the monorepo workspace:
+
+```bash
+uv sync --all-extras --dev
+```
+
+## Usage
+
+```python
+from livekit.plugins import voxcpm
+
+tts = voxcpm.TTS(
+    base_url="http://127.0.0.1:8800/v1",
+    model="openbmb/VoxCPM2",
+    voice="default",
+)
+```
+
+### Environment variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `VLLM_OMNI_URL` | `http://127.0.0.1:8800/v1` | Base URL of the vLLM-Omni OpenAI-compatible API |
+| `VLLM_OMNI_MODEL` | `openbmb/VoxCPM2` | Model id |
+| `VOXCPM_VOICE` | `default` | Preset or uploaded voice name |
+| `VLLM_API_KEY` | unset | Optional bearer token |
+
+### Voice cloning
+
+Pass a reference clip at construction time, or set `voice` to the name of a voice already uploaded to the server with `POST /v1/audio/voices`:
+
+```python
+tts = voxcpm.TTS(
+    voice="my_speaker",
+    ref_audio="/path/to/reference.wav",
+    ref_text="Transcript of the reference clip.",
+)
+```
+
+Voice design prefixes such as `(A warm female voice)Hello!` are passed through as plain text to the backend.
+
+## API surface
+
+- `synthesize(text)` uses HTTP streaming PCM (`POST /v1/audio/speech`).
+- `stream()` uses the WebSocket endpoint (`/v1/audio/speech/stream`) for low-latency agent pipelines.
+
+Output is mono 16-bit PCM at **48 kHz**.
+
+## Links
+
+- [vLLM-Omni Speech API](https://docs.vllm.ai/projects/vllm-omni/en/latest/serving/speech_api/)
+- [VoxCPM vLLM-Omni deployment guide](https://voxcpm.readthedocs.io/en/latest/deployment/vllm_omni.html)
@@ -0,0 +1,40 @@
+# Copyright 2023 LiveKit, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""VoxCPM2 plugin for LiveKit Agents via vLLM-Omni."""
+
+from .tts import TTS, ChunkedStream, SynthesizeStream
+from .version import __version__
+
+__all__ = ["TTS", "ChunkedStream", "SynthesizeStream", "__version__"]
+
+from livekit.agents import Plugin
+
+from .log import logger
+
+
+class VoxcpmPlugin(Plugin):
+    """Plugin descriptor for the VoxCPM package."""
+
+    def __init__(self) -> None:
+        # Pass this module's name, version, package and logger to the base Plugin.
+        super().__init__(__name__, __version__, __package__, logger)
+
+
+Plugin.register_plugin(VoxcpmPlugin())
+
+# Mark every module-level name that is not listed in __all__ as False in the
+# __pdoc__ override mapping (presumably so pdoc omits them — confirm).
+_module = dir()
+NOT_IN_ALL = [m for m in _module if m not in __all__]
+
+__pdoc__ = {}
+
+for n in NOT_IN_ALL:
+    __pdoc__[n] = False
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+import base64
+from pathlib import Path
+
+
+def encode_audio_file(path: Path) -> str:
+    ext = path.suffix.lower().lstrip(".")
+    mime = {
+        "wav": "audio/wav",
+        "mp3": "audio/mpeg",
+        "flac": "audio/flac",
+        "ogg": "audio/ogg",
+    }.get(ext, "audio/wav")
+    data = base64.b64encode(path.read_bytes()).decode("ascii")
+    return f"data:{mime};base64,{data}"
+
+
+def normalize_ref_audio(ref_audio: str | Path) -> str:
+    if isinstance(ref_audio, Path):
+        return encode_audio_file(ref_audio)
+    value = str(ref_audio)
+    if value.startswith("data:") or value.startswith("http://") or value.startswith("https://"):
+        return value
+    return encode_audio_file(Path(value))
@@ -0,0 +1,3 @@
+import logging
+
+logger = logging.getLogger("livekit.plugins.voxcpm")
@@ -0,0 +1,9 @@
+from typing import Literal
+
+# Model ids: the known VoxCPM2 id, or any other string.
+TTSModels = Literal["openbmb/VoxCPM2"] | str
+# Voice names: the "default" voice, or any other string.
+TTSVoices = Literal["default"] | str
+
+DEFAULT_MODEL: TTSModels = "openbmb/VoxCPM2"
+DEFAULT_VOICE: TTSVoices = "default"
+# Audio format constants: 48 kHz sample rate, one (mono) channel.
+DEFAULT_SAMPLE_RATE = 48_000
+NUM_CHANNELS = 1
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		import logging

		logger = logging.getLogger("livekit.plugins.voxcpm")