Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions livekit-agents/livekit/agents/voice/agent_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -2411,6 +2411,7 @@ def _on_first_frame(fut: asyncio.Future[float] | asyncio.Future[None]) -> None:
except BaseException:
return

speech_handle._mark_generation_playout_started()
self._session._update_agent_state(
"speaking",
start_time=started_speaking_at,
Expand Down Expand Up @@ -2805,6 +2806,7 @@ def _on_first_frame(
)
self._session._early_assistant_metrics = early_metrics

speech_handle._mark_generation_playout_started()
self._session._update_agent_state(
"speaking",
start_time=started_speaking_at,
Expand Down Expand Up @@ -3297,6 +3299,7 @@ def _on_first_frame(
except BaseException:
return

speech_handle._mark_generation_playout_started()
self._session._update_agent_state(
"speaking",
start_time=started_speaking_at,
Expand Down Expand Up @@ -3754,6 +3757,22 @@ def _on_false_interruption() -> None:
self._paused_speech = None
return

if not self._paused_speech.handle._generation_playout_started:
# The current generation step never started audio playout (e.g. a
# silent tool-call step). The pause was recorded preemptively, so
# there is nothing to "resume" — undo the preemptive pause without
# emitting a false-interruption resume that would otherwise leak
# stale playout state into a later step.
if (
self._session.options.interruption["resume_false_interruption"]
and (audio_output := self._session.output.audio)
and audio_output.can_pause
):
audio_output.resume()
self._paused_speech = None
self._false_interruption_timer = None
return

resumed = False
if (
self._session.options.interruption["resume_false_interruption"]
Expand Down
24 changes: 24 additions & 0 deletions livekit-agents/livekit/agents/voice/speech_handle.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,18 @@ def __init__(
self._num_steps = 1
self._agent_turn_context: otel_context.Context | None = None

# per generation-step playout state.
#
# ``SpeechHandle`` is the public whole-turn handle, but a multi-step turn
# (e.g. a silent tool-call step followed by a tool-reply step) reuses the
# same handle across steps. ``_playout_started`` is scoped to the current
# step: it becomes ``True`` once the step produces audio playout and is
# reset on every step advance (see ``_authorize_generation``). It lets the
# runtime tell whether the *current* step is actually playing audio, so
# pause/false-interruption bookkeeping captured during a silent step does
# not leak into a later step.
self._playout_started = False

self._interrupt_timeout_handle: asyncio.TimerHandle | None = None

self._item_added_callbacks: set[Callable[[llm.ChatItem], None]] = set()
Expand Down Expand Up @@ -246,9 +258,21 @@ def _item_added(self, items: Sequence[llm.ChatItem]) -> None:

self._chat_items.append(item)

@property
def _generation_playout_started(self) -> bool:
    """Whether the current generation step has started audio playout.

    Read-only view of ``self._playout_started``. The flag is set by
    ``_mark_generation_playout_started`` and cleared again by
    ``_authorize_generation``, so the value always describes the step that is
    currently authorized rather than the handle as a whole.

    Returns:
        ``True`` once playout has been marked as started for the current
        step, ``False`` otherwise.
    """
    return self._playout_started

def _mark_generation_playout_started(self) -> None:
    """Record that the current generation step has started audio playout.

    Sets ``self._playout_started`` to ``True``. The flag stays set until
    ``_authorize_generation`` resets it for the next step.
    """
    self._playout_started = True

def _authorize_generation(self) -> None:
    """Authorize a new generation step on this handle.

    Appends a fresh ``asyncio.Future`` to ``self._generations``, clears the
    per-step playout flag, and then sets ``self._authorize_event``.
    """
    # one future is tracked per authorized generation step
    fut = asyncio.Future[None]()
    self._generations.append(fut)
    # a new generation step starts with no playout; pause/false-interruption
    # state captured during the previous step must not be treated as belonging
    # to this one.
    self._playout_started = False
    # signal last, after the per-step state above has been reset
    self._authorize_event.set()

def _clear_authorization(self) -> None:
Expand Down
73 changes: 73 additions & 0 deletions tests/test_agent_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -1398,6 +1398,79 @@ async def test_silent_tool_call_pause_state_does_not_leak_into_tool_reply() -> N
assert false_interruption_events[-1].resumed is True


async def test_silent_tool_call_step_does_not_emit_false_interruption_resume() -> None:
    """Check that a silent tool-call step never reports a false-interruption resume.

    One ``SpeechHandle`` serves every generation step of a turn. Scenario:

    1. The user asks a question.
    2. The LLM answers with a tool call only (no text, hence no audio) and the
       step stays active for two seconds.
    3. A VAD-only utterance begins and ends inside that silent step, so the
       preemptive pause is recorded and the false-interruption timer fires
       while the silent step is still the current one.
    4. The tool-reply step then produces the spoken answer.

    Because nothing was played out during the silent step, the runtime is
    expected to drop the pause quietly rather than announce a resume; the
    tool-reply step starts speaking on its own.
    """
    speed = 1
    false_interruption_timeout = 0.3 / speed
    drain_delay = 1.5 / speed

    actions = FakeActions()

    # (1) the user's question
    actions.add_user_speech(0.1, 0.2, "What's the weather in Tokyo?", stt_delay=0.05)

    # (2) silent tool-call step: empty content, long enough for the
    # false-interruption timer to expire before the step completes
    weather_call = FunctionToolCall(
        name="get_weather",
        arguments='{"location": "Tokyo"}',
        call_id="1",
    )
    actions.add_llm(content="", tool_calls=[weather_call], ttft=0.05, duration=2.0)

    # (3) VAD-only speech (empty transcript) fully contained in the silent step
    actions.add_user_speech(0.85, 1.05, "", stt_delay=0.05)

    # (4) spoken tool reply
    reply_text = "The weather in Tokyo is sunny today."
    actions.add_llm(content=reply_text, input=reply_text, ttft=0.0, duration=0.0)
    actions.add_tts(0.5, ttfb=0.0, duration=0.0)

    session = create_session(
        actions,
        speed_factor=speed,
        can_pause_audio=True,
        turn_handling={
            "interruption": {"false_interruption_timeout": false_interruption_timeout}
        },
    )
    agent = MyAgent()

    state_changes: list[AgentStateChangedEvent] = []
    false_interruptions: list[AgentFalseInterruptionEvent] = []
    session.on("agent_state_changed", state_changes.append)
    session.on("agent_false_interruption", false_interruptions.append)

    await asyncio.wait_for(
        run_session(session, agent, drain_delay=drain_delay),
        timeout=SESSION_TIMEOUT,
    )

    # The tool reply must still be heard: the agent goes directly from
    # "thinking" to "speaking" once the tool-reply audio begins.
    observed_transitions = {(ev.old_state, ev.new_state) for ev in state_changes}
    assert ("thinking", "speaking") in observed_transitions

    # Without the per-step playout guard, the timer firing during the silent
    # step produced a resume event although no audio had been played. With the
    # guard in place, no false-interruption event may be flagged as resumed.
    resumed_events = [ev for ev in false_interruptions if ev.resumed]
    assert not resumed_events


class FlushMultiSegmentAgent(Agent):
"""Agent whose llm_node flushes the reply into two segments via FlushSentinel."""

Expand Down