Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
)
from livekit.agents.utils import is_given
from livekit.plugins.openai import LLM as OpenAILLM
from livekit.plugins.openai.llm import ReasoningFormat

from .models import CerebrasChatModels

Expand Down Expand Up @@ -112,6 +113,7 @@ def __init__(
parallel_tool_calls: NotGivenOr[bool] = NOT_GIVEN,
tool_choice: NotGivenOr[ToolChoice] = NOT_GIVEN,
reasoning_effort: NotGivenOr[ReasoningEffort] = NOT_GIVEN,
reasoning_format: NotGivenOr[ReasoningFormat] = NOT_GIVEN,
safety_identifier: NotGivenOr[str] = NOT_GIVEN,
prompt_cache_key: NotGivenOr[str] = NOT_GIVEN,
top_p: NotGivenOr[float] = NOT_GIVEN,
Expand All @@ -126,6 +128,10 @@ def __init__(
``api_key`` must be set to your Cerebras API key, either using the argument or by setting
the ``CEREBRAS_API_KEY`` environment variable.

``reasoning_format`` controls how reasoning models (e.g. ``gpt-oss-120b``) return their
thinking tokens. Set it to ``"hidden"`` or ``"parsed"`` to keep the model's internal
monologue out of the spoken message content.

When ``gzip_compression`` is True (default), request payloads are gzip-compressed,
which can reduce TTFT for requests with large prompts.

Expand Down Expand Up @@ -167,6 +173,7 @@ def __init__(
parallel_tool_calls=parallel_tool_calls,
tool_choice=tool_choice,
reasoning_effort=reasoning_effort,
reasoning_format=reasoning_format,
safety_identifier=safety_identifier,
prompt_cache_key=prompt_cache_key,
top_p=top_p,
Expand Down
Comment thread
devin-ai-integration[bot] marked this conversation as resolved.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚩 Caller-provided extra_body in extra_kwargs is silently overridden by opts.extra_body

Pre-existing behavior: if a caller passes extra_kwargs={"extra_body": {...}} to chat() AND the LLM was constructed with an extra_body option, lines 975-980 first apply extra_kwargs then unconditionally overwrite extra["extra_body"] with self._opts.extra_body. This means the caller's extra_body is silently lost. This is not introduced by this PR (it's pre-existing), but the new reasoning_format feature makes it more likely users will interact with extra_body indirectly. Currently no callers in the codebase appear to hit this conflict, but it could surprise external users.

(Refers to lines 975-980)

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@

Verbosity = Literal["low", "medium", "high"]
PromptCacheRetention = Literal["in_memory", "24h"]
ReasoningFormat = Literal["parsed", "raw", "hidden"]


@dataclass
Expand All @@ -78,6 +79,7 @@ class _LLMOptions:
max_completion_tokens: NotGivenOr[int]
service_tier: NotGivenOr[str]
reasoning_effort: NotGivenOr[ReasoningEffort]
reasoning_format: NotGivenOr[ReasoningFormat]
verbosity: NotGivenOr[Verbosity]
prompt_cache_retention: NotGivenOr[PromptCacheRetention]
extra_body: NotGivenOr[dict[str, Any]]
Expand Down Expand Up @@ -107,6 +109,7 @@ def __init__(
max_retries: NotGivenOr[int] = NOT_GIVEN,
service_tier: NotGivenOr[str] = NOT_GIVEN,
reasoning_effort: NotGivenOr[ReasoningEffort] = NOT_GIVEN,
reasoning_format: NotGivenOr[ReasoningFormat] = NOT_GIVEN,
verbosity: NotGivenOr[Verbosity] = NOT_GIVEN,
prompt_cache_retention: NotGivenOr[PromptCacheRetention] = NOT_GIVEN,
extra_body: NotGivenOr[dict[str, Any]] = NOT_GIVEN,
Expand All @@ -120,6 +123,12 @@ def __init__(

``api_key`` must be set to your OpenAI API key, either using the argument or by setting the
``OPENAI_API_KEY`` environment variable.

``reasoning_format`` controls how reasoning models (e.g. ``gpt-oss-120b`` on Cerebras,
or Grok on xAI) return their thinking tokens. Set it to ``"hidden"`` or ``"parsed"`` to
keep the model's internal monologue out of the message content so it isn't spoken by the
TTS pipeline. This is forwarded as a request body field and is only honored by providers
that support it.
"""
super().__init__()

Expand All @@ -140,6 +149,7 @@ def __init__(
max_completion_tokens=max_completion_tokens,
service_tier=service_tier,
reasoning_effort=reasoning_effort,
reasoning_format=reasoning_format,
safety_identifier=safety_identifier,
prompt_cache_key=prompt_cache_key,
top_p=top_p,
Expand Down Expand Up @@ -267,6 +277,7 @@ def with_cerebras(
parallel_tool_calls: NotGivenOr[bool] = NOT_GIVEN,
tool_choice: NotGivenOr[ToolChoice] = NOT_GIVEN,
reasoning_effort: NotGivenOr[ReasoningEffort] = NOT_GIVEN,
reasoning_format: NotGivenOr[ReasoningFormat] = NOT_GIVEN,
safety_identifier: NotGivenOr[str] = NOT_GIVEN,
prompt_cache_key: NotGivenOr[str] = NOT_GIVEN,
top_p: NotGivenOr[float] = NOT_GIVEN,
Expand All @@ -276,6 +287,10 @@ def with_cerebras(

``api_key`` must be set to your Cerebras API key, either using the argument or by setting
the ``CEREBRAS_API_KEY`` environment variable.

``reasoning_format`` controls how Cerebras reasoning models (e.g. ``gpt-oss-120b``) return
their thinking tokens; set it to ``"hidden"`` or ``"parsed"`` to keep reasoning out of the
spoken message content.
"""

api_key = api_key or os.environ.get("CEREBRAS_API_KEY")
Expand All @@ -294,6 +309,7 @@ def with_cerebras(
parallel_tool_calls=parallel_tool_calls,
tool_choice=tool_choice,
reasoning_effort=reasoning_effort,
reasoning_format=reasoning_format,
safety_identifier=safety_identifier,
prompt_cache_key=prompt_cache_key,
top_p=top_p,
Expand Down Expand Up @@ -401,6 +417,7 @@ def with_x_ai(
parallel_tool_calls: NotGivenOr[bool] = NOT_GIVEN,
tool_choice: ToolChoice = "auto",
reasoning_effort: NotGivenOr[ReasoningEffort] = NOT_GIVEN,
reasoning_format: NotGivenOr[ReasoningFormat] = NOT_GIVEN,
safety_identifier: NotGivenOr[str] = NOT_GIVEN,
prompt_cache_key: NotGivenOr[str] = NOT_GIVEN,
top_p: NotGivenOr[float] = NOT_GIVEN,
Expand All @@ -410,6 +427,9 @@ def with_x_ai(

``api_key`` must be set to your XAI API key, either using the argument or by setting
the ``XAI_API_KEY`` environment variable.

``reasoning_format`` controls how Grok reasoning models return their thinking tokens; set
it to ``"hidden"`` or ``"parsed"`` to keep reasoning out of the spoken message content.
"""
api_key = api_key or os.environ.get("XAI_API_KEY")
if api_key is None:
Expand All @@ -428,6 +448,7 @@ def with_x_ai(
tool_choice=tool_choice,
# TODO(long): add provider fmt for grok
reasoning_effort=reasoning_effort,
reasoning_format=reasoning_format,
safety_identifier=safety_identifier,
prompt_cache_key=prompt_cache_key,
top_p=top_p,
Expand Down Expand Up @@ -982,6 +1003,13 @@ def chat(
if is_given(self._opts.reasoning_effort):
extra["reasoning_effort"] = self._opts.reasoning_effort

if is_given(self._opts.reasoning_format):
# reasoning_format is a provider-specific body field (Cerebras/xAI), so it has to be
# forwarded via extra_body rather than as a top-level OpenAI SDK argument.
extra_body = dict(extra.get("extra_body") or {})
extra_body["reasoning_format"] = self._opts.reasoning_format
extra["extra_body"] = extra_body

if is_given(self._opts.safety_identifier):
extra["safety_identifier"] = self._opts.safety_identifier

Expand Down
82 changes: 82 additions & 0 deletions tests/test_plugin_reasoning_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
from __future__ import annotations

import pytest

from livekit.agents.llm import ChatContext
from livekit.plugins.cerebras import LLM as CerebrasLLM
from livekit.plugins.openai import LLM as OpenAILLM

pytestmark = pytest.mark.unit


def _chat_ctx() -> ChatContext:
    """Build a fresh ``ChatContext`` holding a single user message (``"hi"``)."""
    ctx = ChatContext()
    ctx.add_message(role="user", content="hi")
    return ctx


@pytest.mark.asyncio
async def test_cerebras_reasoning_format_in_request() -> None:
    """A ``reasoning_format`` given to the Cerebras LLM shows up in the stream's ``extra_body``."""
    cerebras_llm = CerebrasLLM(
        model="gpt-oss-120b",
        api_key="test-key",
        reasoning_format="hidden",
        gzip_compression=False,
        msgpack_encoding=False,
    )
    llm_stream = cerebras_llm.chat(chat_ctx=_chat_ctx())
    try:
        # Inspect the kwargs recorded on the stream rather than issuing a real request.
        request_kwargs = llm_stream._extra_kwargs
        body = request_kwargs.get("extra_body", {})
        assert body.get("reasoning_format") == "hidden"
    finally:
        # Always close the stream, even when the assertion fails.
        await llm_stream.aclose()


@pytest.mark.asyncio
async def test_cerebras_reasoning_format_omitted_by_default() -> None:
    """Without a ``reasoning_format`` option, the stream's ``extra_body`` has no such key."""
    cerebras_llm = CerebrasLLM(
        model="gpt-oss-120b",
        api_key="test-key",
        gzip_compression=False,
        msgpack_encoding=False,
    )
    llm_stream = cerebras_llm.chat(chat_ctx=_chat_ctx())
    try:
        # A missing ``extra_body`` entry is treated the same as an empty one.
        request_kwargs = llm_stream._extra_kwargs
        body = request_kwargs.get("extra_body", {})
        assert "reasoning_format" not in body
    finally:
        # Always close the stream, even when the assertion fails.
        await llm_stream.aclose()


@pytest.mark.asyncio
async def test_openai_with_cerebras_reasoning_format_in_request() -> None:
    """``LLM.with_cerebras`` carries ``reasoning_format`` into the stream's ``extra_body``."""
    openai_llm = OpenAILLM.with_cerebras(
        model="gpt-oss-120b",
        api_key="test-key",
        reasoning_format="hidden",
    )
    llm_stream = openai_llm.chat(chat_ctx=_chat_ctx())
    try:
        # Inspect the kwargs recorded on the stream rather than issuing a real request.
        request_kwargs = llm_stream._extra_kwargs
        body = request_kwargs.get("extra_body", {})
        assert body.get("reasoning_format") == "hidden"
    finally:
        # Always close the stream, even when the assertion fails.
        await llm_stream.aclose()


@pytest.mark.asyncio
async def test_xai_reasoning_format_in_request() -> None:
    """``LLM.with_x_ai`` carries ``reasoning_format`` into the stream's ``extra_body``."""
    xai_llm = OpenAILLM.with_x_ai(
        model="grok-4-1-fast-reasoning",
        api_key="test-key",
        reasoning_format="parsed",
    )
    llm_stream = xai_llm.chat(chat_ctx=_chat_ctx())
    try:
        # Inspect the kwargs recorded on the stream rather than issuing a real request.
        request_kwargs = llm_stream._extra_kwargs
        body = request_kwargs.get("extra_body", {})
        assert body.get("reasoning_format") == "parsed"
    finally:
        # Always close the stream, even when the assertion fails.
        await llm_stream.aclose()