-
Notifications
You must be signed in to change notification settings - Fork 3.2k
feat(plugins): add FunASR self-hosted STT plugin #6129
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| # LiveKit Plugins FunASR | ||
|
|
||
| Self-hosted speech-to-text for LiveKit Agents using [FunASR](https://github.com/modelscope/FunASR) — SenseVoice, Paraformer, Fun-ASR-Nano. Runs **locally, no cloud API**, strong on Chinese and 50+ languages. | ||
|
|
||
| ## Install | ||
| ```bash | ||
| pip install livekit-plugins-funasr | ||
| ``` | ||
|
|
||
| ## Usage | ||
| ```python | ||
| from livekit.plugins import funasr | ||
|
|
||
| # ModelScope (default hub="ms") | ||
| stt = funasr.STT(model="iic/SenseVoiceSmall", device="cuda") | ||
|
|
||
| # HuggingFace | ||
| stt = funasr.STT(model="FunAudioLLM/SenseVoiceSmall", hub="hf", device="cuda") | ||
| ``` | ||
|
|
||
| Non-streaming STT; LiveKit wraps it with a VAD `StreamAdapter` for real-time agents. |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,15 @@ | ||
| """FunASR plugin for LiveKit Agents — self-hosted speech-to-text (SenseVoice / Paraformer / Fun-ASR-Nano).""" | ||
| from livekit.agents import Plugin | ||
| from .log import logger | ||
| from .stt import STT | ||
| from .version import __version__ | ||
|
Check failure on line 5 in livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/__init__.py
|
||
|
|
||
| __all__ = ["STT", "__version__"] | ||
|
|
||
|
|
||
| class FunASRPlugin(Plugin): | ||
| def __init__(self) -> None: | ||
| super().__init__(__name__, __version__, __package__, logger) | ||
|
|
||
|
|
||
| Plugin.register_plugin(FunASRPlugin()) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| import logging | ||
|
Check failure on line 1 in livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/log.py
|
||
| logger = logging.getLogger("livekit.plugins.funasr") | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,105 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import asyncio | ||
| import io | ||
| from dataclasses import dataclass | ||
|
|
||
| import numpy as np | ||
|
Check failure on line 7 in livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/stt.py
|
||
|
|
||
| from livekit import rtc | ||
| from livekit.agents import ( | ||
| DEFAULT_API_CONNECT_OPTIONS, | ||
| APIConnectionError, | ||
| APIConnectOptions, | ||
| stt, | ||
| ) | ||
| from livekit.agents.stt import SpeechEventType, STTCapabilities | ||
| from livekit.agents.types import NOT_GIVEN, NotGivenOr | ||
| from livekit.agents.utils import AudioBuffer, is_given | ||
|
|
||
| from .log import logger | ||
|
|
||
| _DEFAULT_MODEL = "iic/SenseVoiceSmall" | ||
| _TARGET_SR = 16000 | ||
|
|
||
|
|
||
| @dataclass | ||
| class _STTOptions: | ||
| model: str = _DEFAULT_MODEL | ||
| language: str = "auto" | ||
| device: str = "cpu" | ||
| hub: str = "ms" | ||
| use_itn: bool = True | ||
|
|
||
|
|
||
| class STT(stt.STT): | ||
| """FunASR self-hosted speech-to-text. | ||
|
|
||
| Runs FunASR models (SenseVoice, Paraformer, Fun-ASR-Nano) locally — no cloud | ||
| API. Non-streaming; LiveKit wraps it with a VAD StreamAdapter for agents. | ||
| """ | ||
|
|
||
| def __init__( | ||
| self, | ||
| *, | ||
| model: str = _DEFAULT_MODEL, | ||
| language: str = "auto", | ||
| device: str = "cpu", | ||
| hub: str = "ms", | ||
| use_itn: bool = True, | ||
| vad_model: str | None = "fsmn-vad", | ||
| ) -> None: | ||
| super().__init__(capabilities=STTCapabilities(streaming=False, interim_results=False)) | ||
| self._opts = _STTOptions(model=model, language=language, device=device, hub=hub, use_itn=use_itn) | ||
| self._vad_model = vad_model | ||
| self._model = None | ||
|
|
||
| def _ensure_model(self): | ||
| if self._model is None: | ||
| from funasr import AutoModel | ||
|
|
||
| kwargs = dict(model=self._opts.model, device=self._opts.device, hub=self._opts.hub, disable_update=True) | ||
|
Check failure on line 61 in livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/stt.py
|
||
| if self._vad_model: | ||
| kwargs.update(vad_model=self._vad_model, vad_kwargs={"max_single_segment_time": 30000}) | ||
| logger.info("loading FunASR model %s on %s", self._opts.model, self._opts.device) | ||
| self._model = AutoModel(**kwargs) | ||
| return self._model | ||
|
Comment on lines
+57
to
+66
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🔴 No thread-safety for lazy model init and concurrent inference in thread executor
Prompt for agents. Was this helpful? React with 👍 or 👎 to provide feedback. |
||
|
|
||
| async def _recognize_impl( | ||
| self, | ||
| buffer: AudioBuffer, | ||
| *, | ||
| language: NotGivenOr[str] = NOT_GIVEN, | ||
| conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS, | ||
| ) -> stt.SpeechEvent: | ||
| lang = language if is_given(language) else self._opts.language | ||
| wav_bytes = rtc.combine_audio_frames(buffer).to_wav_bytes() | ||
|
|
||
| def _run() -> str: | ||
| import soundfile as sf | ||
| from funasr.utils.postprocess_utils import rich_transcription_postprocess | ||
|
Check failure on line 80 in livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/stt.py
|
||
|
|
||
| model = self._ensure_model() | ||
| audio, sr = sf.read(io.BytesIO(wav_bytes), dtype="float32") | ||
| if audio.ndim > 1: | ||
| audio = audio.mean(axis=1) | ||
| if sr != _TARGET_SR: | ||
| import librosa | ||
|
|
||
| audio = librosa.resample(audio, orig_sr=sr, target_sr=_TARGET_SR) | ||
| gen_kwargs = dict(input=audio, cache={}, use_itn=self._opts.use_itn, batch_size_s=300) | ||
|
Check failure on line 90 in livekit-plugins/livekit-plugins-funasr/livekit/plugins/funasr/stt.py
|
||
| if "SenseVoice" in self._opts.model or (lang and lang != "auto"): | ||
| gen_kwargs["language"] = lang | ||
| res = model.generate(**gen_kwargs) | ||
| text = res[0]["text"] if res else "" | ||
| return rich_transcription_postprocess(text) | ||
|
|
||
| try: | ||
| text = await asyncio.get_event_loop().run_in_executor(None, _run) | ||
| except Exception as e: # noqa: BLE001 | ||
| raise APIConnectionError() from e | ||
|
Comment on lines
+97
to
+100
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🚩 Broad exception catch masks local errors as retriable APIConnectionError. At … [remainder of the comment body lost in extraction]. Was this helpful? React with 👍 or 👎 to provide feedback.
Comment on lines
+99
to
+100
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🟡 Blanket … [inline code and rest of the title lost in extraction]. At lines 99-100, every exception (including … [remainder of the comment body lost in extraction]. Prompt for agents. Was this helpful? React with 👍 or 👎 to provide feedback. |
||
|
|
||
| return stt.SpeechEvent( | ||
| type=SpeechEventType.FINAL_TRANSCRIPT, | ||
| alternatives=[stt.SpeechData(text=text, language=str(lang))], | ||
| ) | ||
|
Comment on lines
+102
to
+105
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 🚩 Language … [rest of the title lost in extraction]. When using the default SenseVoice model with default language … [inline code and remainder of the comment body lost in extraction]. Was this helpful? React with 👍 or 👎 to provide feedback. |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| __version__ = "0.1.0" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,36 @@ | ||
| [build-system] | ||
| requires = ["hatchling"] | ||
| build-backend = "hatchling.build" | ||
|
|
||
| [project] | ||
| name = "livekit-plugins-funasr" | ||
| dynamic = ["version"] | ||
| description = "FunASR (SenseVoice / Paraformer / Fun-ASR-Nano) self-hosted STT plugin for LiveKit Agents" | ||
| readme = "README.md" | ||
| license = "Apache-2.0" | ||
| requires-python = ">=3.10.0" | ||
| authors = [{ name = "LiveKit", email = "hello@livekit.io" }] | ||
| keywords = ["voice", "ai", "realtime", "audio", "livekit", "funasr", "speech-to-text", "asr"] | ||
| classifiers = [ | ||
| "Intended Audience :: Developers", | ||
| "License :: OSI Approved :: Apache Software License", | ||
| "Topic :: Multimedia :: Sound/Audio", | ||
| "Topic :: Scientific/Engineering :: Artificial Intelligence", | ||
| "Programming Language :: Python :: 3", | ||
| "Programming Language :: Python :: 3 :: Only", | ||
| ] | ||
| dependencies = ["livekit-agents>=1.6.0", "funasr>=1.1.0", "soundfile", "librosa"] | ||
|
|
||
| [project.urls] | ||
| Documentation = "https://docs.livekit.io" | ||
| Website = "https://livekit.io/" | ||
| Source = "https://github.com/livekit/agents" | ||
|
|
||
| [tool.hatch.version] | ||
| path = "livekit/plugins/funasr/version.py" | ||
|
|
||
| [tool.hatch.build.targets.wheel] | ||
| packages = ["livekit"] | ||
|
|
||
| [tool.hatch.build.targets.sdist] | ||
| include = ["/livekit"] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🚩 Missing model and provider property overrides
The base `STT` class at `livekit-agents/livekit/agents/stt/stt.py:161-182` defines `model` and `provider` properties that return `"unknown"` by default, with docstrings explicitly stating plugins should override them. Other STT plugins (deepgram at `stt.py:199-203`, openai at `stt.py:199-203`, fal at `stt.py:47-52`) all override these properties. This FunASR plugin does not, meaning metrics emitted by the base class (at `stt.py:220-221`) will report `model_name="unknown"` and `model_provider="unknown"`, reducing observability. This is not a correctness bug but an incomplete integration.
Was this helpful? React with 👍 or 👎 to provide feedback.