Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def __post_init__(self) -> None:
class STTOptions:
"""Configuration options for Soniox Speech-to-Text service."""

model: str = "stt-rt-v4"
model: str = "stt-rt-v5"

language_hints: list[str] | None = None
language_hints_strict: bool = False
Expand All @@ -119,17 +119,25 @@ class STTOptions:
enable_speaker_diarization: bool = False
enable_language_identification: bool = True

max_endpoint_delay_ms: int = 500
max_endpoint_delay_ms: int = 2000

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚩 Breaking default change: max_endpoint_delay_ms 500 → 2000

The default max_endpoint_delay_ms changed from 500 to 2000. This is a 4× increase in the maximum endpoint detection delay, meaning existing users who rely on the default may see speech finalized up to 1.5 s later in the worst case. While this appears intentional for the v5 model, it is a behavioral breaking change for any caller that constructs STTOptions() without explicitly setting this field. The livekit-plugins-inworld plugin references soniox/stt-rt-v4 in comments (livekit-plugins/livekit-plugins-inworld/livekit/plugins/inworld/stt.py:55) — that plugin may also need updating if it depends on these defaults.

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mihafabcic-soniox could you provide more context on why you changed the default here?

"""Maximum delay in milliseconds between speech cessation and endpoint detection.
Range: 500–3000.
See: https://soniox.com/docs/stt/rt/endpoint-detection"""

endpoint_sensitivity: float | None = None
"""How readily the model emits speech endpoints. Range: -1.0 to 1.0.
Higher values make endpoints more likely (finalize sooner); lower values make them
less likely. Leave as None to use the server-side default.
Introduced in the Soniox v5 model; earlier models reject it."""

client_reference_id: str | None = None
translation: TranslationConfig | None = None

# Validate the range-restricted options; raises ValueError on the first violation found.
# NOTE(review): indentation is absent in this diff excerpt — confirm against the real file.
def __post_init__(self) -> None:
# max_endpoint_delay_ms must lie in the inclusive range 500..3000.
if not (500 <= self.max_endpoint_delay_ms <= 3000):
raise ValueError("max_endpoint_delay_ms must be between 500 and 3000")
# endpoint_sensitivity is optional: None skips the check, any other value
# must lie in the inclusive range -1.0..1.0.
if self.endpoint_sensitivity is not None and not (-1.0 <= self.endpoint_sensitivity <= 1.0):
raise ValueError("endpoint_sensitivity must be between -1.0 and 1.0")


class STT(stt.STT):
Expand Down Expand Up @@ -261,6 +269,8 @@ async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
"client_reference_id": self._stt._params.client_reference_id,
}
config["max_endpoint_delay_ms"] = self._stt._params.max_endpoint_delay_ms
if self._stt._params.endpoint_sensitivity is not None:
config["endpoint_sensitivity"] = self._stt._params.endpoint_sensitivity
if self._stt._params.translation is not None:
tr = self._stt._params.translation
translation_dict: dict[str, Any] = {"type": tr.type}
Expand Down