Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions examples/voice_agents/voxcpm_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import logging
import os

from dotenv import load_dotenv

from livekit.agents import (
Agent,
AgentServer,
AgentSession,
JobContext,
JobProcess,
MetricsCollectedEvent,
cli,
inference,
metrics,
room_io,
)
from livekit.plugins import silero, voxcpm
from livekit.plugins.turn_detector.multilingual import MultilingualModel

logger = logging.getLogger("voxcpm-agent")

load_dotenv()


class VoxcpmAssistant(Agent):
    """Example voice agent whose replies are spoken through VoxCPM2 TTS."""

    def __init__(self) -> None:
        """Initialise the base agent with a short conversational system prompt."""
        system_prompt = (
            "You are a helpful voice assistant powered by VoxCPM2 on vLLM-Omni. "
            "Keep responses concise and conversational."
        )
        super().__init__(instructions=system_prompt)

    async def on_enter(self) -> None:
        """Ask the session to generate a brief greeting that mentions VoxCPM2 TTS.

        NOTE(review): presumably invoked by the framework when this agent
        becomes active in a session -- confirm against the Agent base class.
        """
        greeting_prompt = "Greet the user briefly and mention that you are using VoxCPM2 TTS."
        self.session.generate_reply(instructions=greeting_prompt)


# Module-level server instance: `prewarm` is attached to it as the setup
# function and `entrypoint` is registered on it via `@server.rtc_session()`.
server = AgentServer()


def prewarm(proc: JobProcess) -> None:
    """Load the Silero VAD model and stash it in the process userdata.

    The model is stored under the ``"vad"`` key, which is the key the session
    entrypoint reads from ``ctx.proc.userdata``.
    """
    vad_model = silero.VAD.load()
    proc.userdata["vad"] = vad_model


# Register `prewarm` as the server's setup function so the VAD model is
# available in `proc.userdata` by the time a session reads it.
server.setup_fnc = prewarm


@server.rtc_session()
async def entrypoint(ctx: JobContext) -> None:
    """Build the STT -> LLM -> VoxCPM TTS pipeline and start the agent session.

    The TTS endpoint, model id and voice are read from the ``VLLM_OMNI_URL``,
    ``VLLM_OMNI_MODEL`` and ``VOXCPM_VOICE`` environment variables, each with a
    local-development default. The VAD model is taken from the process
    userdata under the ``"vad"`` key.
    """
    ctx.log_context_fields = {"room": ctx.room.name}

    # Components are constructed in the same order the session receives them.
    speech_to_text = inference.STT("deepgram/nova-3", language="multi")
    language_model = inference.LLM("openai/gpt-4.1-mini")
    text_to_speech = voxcpm.TTS(
        base_url=os.getenv("VLLM_OMNI_URL", "http://127.0.0.1:8800/v1"),
        model=os.getenv("VLLM_OMNI_MODEL", "openbmb/VoxCPM2"),
        voice=os.getenv("VOXCPM_VOICE", "default"),
    )

    session = AgentSession(
        stt=speech_to_text,
        llm=language_model,
        tts=text_to_speech,
        vad=ctx.proc.userdata["vad"],
        turn_detection=MultilingualModel(),
    )

    @session.on("metrics_collected")
    def _on_metrics_collected(ev: MetricsCollectedEvent) -> None:
        # Forward every collected metrics payload to the metrics logger.
        metrics.log_metrics(ev.metrics)

    await session.start(
        agent=VoxcpmAssistant(),
        room=ctx.room,
        room_options=room_io.RoomOptions(),
    )


# Script entry point: hand the module-level server to the LiveKit CLI runner.
if __name__ == "__main__":
    cli.run_app(server)
1 change: 1 addition & 0 deletions livekit-agents/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ tavus = ["livekit-plugins-tavus>=1.6.0"]
trugen = ["livekit-plugins-trugen>=1.6.0"]
turn-detector = ["livekit-plugins-turn-detector>=1.6.0"]
ultravox = ["livekit-plugins-ultravox>=1.6.0"]
voxcpm = ["livekit-plugins-voxcpm>=1.6.0"]
upliftai = ["livekit-plugins-upliftai>=1.6.0"]
gradium = ["livekit-plugins-gradium>=1.6.0"]
xai = ["livekit-plugins-xai>=1.6.0"]
Expand Down
73 changes: 73 additions & 0 deletions livekit-plugins/livekit-plugins-voxcpm/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
# livekit-plugins-voxcpm

LiveKit Agents plugin for [VoxCPM2](https://huggingface.co/openbmb/VoxCPM2) served through [vLLM-Omni](https://github.com/vllm-project/vllm-omni).

## Requirements

- A running vLLM-Omni server with VoxCPM2 loaded, exposing the OpenAI-compatible Speech API.
- Python >= 3.10

Start a server:

```bash
vllm serve openbmb/VoxCPM2 --omni --host 0.0.0.0 --port 8800
```

## Install

```bash
pip install livekit-plugins-voxcpm
```

Or from the monorepo workspace:

```bash
uv sync --all-extras --dev
```

## Usage

```python
from livekit.plugins import voxcpm

tts = voxcpm.TTS(
base_url="http://127.0.0.1:8800/v1",
model="openbmb/VoxCPM2",
voice="default",
)
```

### Environment variables

| Variable | Default | Description |
|----------|---------|-------------|
| `VLLM_OMNI_URL` | `http://127.0.0.1:8800/v1` | Base URL of the vLLM-Omni OpenAI-compatible API |
| `VLLM_OMNI_MODEL` | `openbmb/VoxCPM2` | Model id |
| `VOXCPM_VOICE` | `default` | Preset or uploaded voice name |
| `VLLM_API_KEY` | unset | Optional bearer token |

### Voice cloning

Pass a reference clip at construction time, or refer by name to a voice that was previously uploaded to the server via `POST /v1/audio/voices`:

```python
tts = voxcpm.TTS(
voice="my_speaker",
ref_audio="/path/to/reference.wav",
ref_text="Transcript of the reference clip.",
)
```

Voice design prefixes such as `(A warm female voice)Hello!` are passed through as plain text to the backend.

## API surface

- `synthesize(text)` uses HTTP streaming PCM (`POST /v1/audio/speech`).
- `stream()` uses the WebSocket endpoint (`/v1/audio/speech/stream`) for low-latency agent pipelines.

Output is mono 16-bit PCM at **48 kHz**.

## Links

- [vLLM-Omni Speech API](https://docs.vllm.ai/projects/vllm-omni/en/latest/serving/speech_api/)
- [VoxCPM vLLM-Omni deployment guide](https://voxcpm.readthedocs.io/en/latest/deployment/vllm_omni.html)
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Copyright 2023 LiveKit, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""VoxCPM2 plugin for LiveKit Agents via vLLM-Omni."""

from .tts import TTS, ChunkedStream, SynthesizeStream
from .version import __version__

__all__ = ["TTS", "ChunkedStream", "SynthesizeStream", "__version__"]

from livekit.agents import Plugin

from .log import logger


class VoxcpmPlugin(Plugin):
    """Plugin descriptor for the VoxCPM package."""

    def __init__(self) -> None:
        """Pass this package's name, version, package and logger to the base class."""
        super().__init__(
            __name__,
            __version__,
            __package__,
            logger,
        )


# Register the plugin as a side effect of importing this package.
Plugin.register_plugin(VoxcpmPlugin())

# Collect every module-level name that is not part of the public API.
# `dir()` is evaluated before `_module` is bound, so the snapshot does not
# include the helper names defined below.
_module = dir()
NOT_IN_ALL = [m for m in _module if m not in __all__]

# Mark each non-public name as False in `__pdoc__`; pdoc uses this mapping to
# exclude names from the generated documentation.
__pdoc__ = {}

for n in NOT_IN_ALL:
    __pdoc__[n] = False
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from __future__ import annotations

import base64
from pathlib import Path


def encode_audio_file(path: Path) -> str:
    """Read an audio file and return its contents as a base64 ``data:`` URI.

    The MIME type is chosen from the file extension (case-insensitive);
    unrecognised or missing extensions fall back to ``audio/wav``.

    Args:
        path: Location of the audio file to read.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.
    """
    mime_by_suffix = {
        ".wav": "audio/wav",
        ".mp3": "audio/mpeg",
        ".flac": "audio/flac",
        ".ogg": "audio/ogg",
    }
    mime = mime_by_suffix.get(path.suffix.lower(), "audio/wav")
    raw = path.read_bytes()
    payload = base64.b64encode(raw).decode("ascii")
    return "data:" + mime + ";base64," + payload


def normalize_ref_audio(ref_audio: str | Path) -> str:
if isinstance(ref_audio, Path):
return encode_audio_file(ref_audio)
value = str(ref_audio)
if value.startswith("data:") or value.startswith("http://") or value.startswith("https://"):
return value
return encode_audio_file(Path(value))
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import logging

# Shared logger for the VoxCPM plugin package.
logger = logging.getLogger("livekit.plugins.voxcpm")
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from typing import Literal

# Model identifier: the known VoxCPM2 id, or any other string.
TTSModels = Literal["openbmb/VoxCPM2"] | str
# Voice name: "default", or any other string.
TTSVoices = Literal["default"] | str

# Default model id and voice name.
DEFAULT_MODEL: TTSModels = "openbmb/VoxCPM2"
DEFAULT_VOICE: TTSVoices = "default"
# Audio format constants: 48 kHz sample rate, one (mono) channel.
DEFAULT_SAMPLE_RATE = 48_000
NUM_CHANNELS = 1
Empty file.
Loading