livekit · anshulkulhari7 · Jun 15, 2026
@@ -2411,6 +2411,7 @@ def _on_first_frame(fut: asyncio.Future[float] | asyncio.Future[None]) -> None:
             except BaseException:
                 return
 
+            speech_handle._mark_generation_playout_started()
             self._session._update_agent_state(
                 "speaking",
                 start_time=started_speaking_at,
@@ -2805,6 +2806,7 @@ def _on_first_frame(
                 )
             self._session._early_assistant_metrics = early_metrics
 
+            speech_handle._mark_generation_playout_started()
             self._session._update_agent_state(
                 "speaking",
                 start_time=started_speaking_at,
@@ -3297,6 +3299,7 @@ def _on_first_frame(
             except BaseException:
                 return
 
+            speech_handle._mark_generation_playout_started()
             self._session._update_agent_state(
                 "speaking",
                 start_time=started_speaking_at,
@@ -3754,6 +3757,22 @@ def _on_false_interruption() -> None:
                 self._paused_speech = None
                 return
 
+            if not self._paused_speech.handle._generation_playout_started:
+                # The current generation step never started audio playout (e.g. a
+                # silent tool-call step), so the pause was only recorded preemptively
+                # and there is no speech to resume. Un-pause the audio output, clear
+                # the pause bookkeeping and return without reporting a false-interruption
+                # resume, which would otherwise leak stale playout state into a later step.
+                if (
+                    self._session.options.interruption["resume_false_interruption"]
+                    and (audio_output := self._session.output.audio)
+                    and audio_output.can_pause
+                ):
+                    audio_output.resume()
+                self._paused_speech = None
+                self._false_interruption_timer = None
+                return
+
             resumed = False
             if (
                 self._session.options.interruption["resume_false_interruption"]

@@ -50,6 +50,18 @@ def __init__(
         self._num_steps = 1
         self._agent_turn_context: otel_context.Context | None = None
 
+        # per-generation-step playout state.
+        #
+        # ``SpeechHandle`` is the public whole-turn handle, but a multi-step turn
+        # (e.g. a silent tool-call step followed by a tool-reply step) reuses the
+        # same handle across steps. ``_playout_started`` is scoped to the current
+        # step: it becomes ``True`` once the step produces audio playout (see
+        # ``_mark_generation_playout_started``) and is reset on every step advance
+        # (see ``_authorize_generation``). It lets the runtime tell whether the
+        # *current* step is actually playing audio, so pause/false-interruption
+        # bookkeeping captured during a silent step does not leak into a later step.
+        self._playout_started = False
+
         self._interrupt_timeout_handle: asyncio.TimerHandle | None = None
 
         self._item_added_callbacks: set[Callable[[llm.ChatItem], None]] = set()
@@ -246,9 +258,21 @@ def _item_added(self, items: Sequence[llm.ChatItem]) -> None:
 
             self._chat_items.append(item)
 
+    @property
+    def _generation_playout_started(self) -> bool:
+        """Whether the current generation step has started audio playout.
+
+        Returns ``_playout_started``, which ``_mark_generation_playout_started``
+        sets to ``True`` and ``_authorize_generation`` resets to ``False``.
+        """
+        return self._playout_started
+
+    def _mark_generation_playout_started(self) -> None:
+        """Record that the current generation step has started audio playout."""
+        self._playout_started = True
+
     def _authorize_generation(self) -> None:
+        """Register a new generation future and signal authorization.
+
+        Appends a fresh future to ``_generations``, resets ``_playout_started``
+        to ``False`` and sets ``_authorize_event``.
+        """
         fut = asyncio.Future[None]()
         self._generations.append(fut)
+        # a new generation step starts with no playout; pause/false-interruption
+        # state captured during the previous step must not be treated as belonging
+        # to this one.
+        self._playout_started = False
         self._authorize_event.set()
 
     def _clear_authorization(self) -> None:

@@ -1398,6 +1398,79 @@ async def test_silent_tool_call_pause_state_does_not_leak_into_tool_reply() -> N
     assert false_interruption_events[-1].resumed is True
 
 
+async def test_silent_tool_call_step_does_not_emit_false_interruption_resume() -> None:
+    """A silent tool-call step must not emit a false-interruption resume.
+
+    ``SpeechHandle`` is reused across the generation steps of a single turn. When
+    a silent tool-call step (no audio playout) records a preemptive pause and the
+    false-interruption timer fires *while that step is still active*, the runtime
+    must not report a false-interruption *resume*: nothing was ever played out for
+    this step, so resuming would leak stale per-step playout state. The pause is
+    undone silently and the later tool-reply step resumes playout on its own.
+    """
+    speed = 1
+    actions = FakeActions()
+    actions.add_user_speech(0.1, 0.2, "What's the weather in Tokyo?", stt_delay=0.05)
+
+    # Silent tool-call step: no spoken preamble/audio. The step stays active long
+    # enough that the false-interruption timer fires before it finishes.
+    actions.add_llm(
+        content="",
+        tool_calls=[
+            FunctionToolCall(
+                name="get_weather",
+                arguments='{"location": "Tokyo"}',
+                call_id="1",
+            )
+        ],
+        ttft=0.05,
+        duration=2.0,
+    )
+
+    # VAD-only speech that starts *and ends* during the silent tool-call step,
+    # so the false-interruption timer is scheduled and fires within that step.
+    actions.add_user_speech(0.85, 1.05, "", stt_delay=0.05)
+
+    actions.add_llm(
+        content="The weather in Tokyo is sunny today.",
+        input="The weather in Tokyo is sunny today.",
+        ttft=0.0,
+        duration=0.0,
+    )
+    actions.add_tts(0.5, ttfb=0.0, duration=0.0)
+
+    session = create_session(
+        actions,
+        speed_factor=speed,
+        can_pause_audio=True,
+        turn_handling={"interruption": {"false_interruption_timeout": 0.3 / speed}},
+    )
+    agent = MyAgent()
+
+    agent_state_events: list[AgentStateChangedEvent] = []
+    false_interruption_events: list[AgentFalseInterruptionEvent] = []
+
+    session.on("agent_state_changed", agent_state_events.append)
+    session.on("agent_false_interruption", false_interruption_events.append)
+
+    await asyncio.wait_for(
+        run_session(session, agent, drain_delay=1.5 / speed),
+        timeout=SESSION_TIMEOUT,
+    )
+
+    transitions = [(ev.old_state, ev.new_state) for ev in agent_state_events]
+
+    # The tool reply still plays out: the agent state goes straight from
+    # "thinking" to "speaking" when the tool-reply audio starts.
+    assert ("thinking", "speaking") in transitions
+
+    # Regression guard: the false-interruption timer firing during the silent
+    # tool-call step must not produce a resume event, because no audio ever
+    # played for that step (the current generation step never started playout).
+    # NOTE(review): this assertion also passes vacuously when no
+    # false-interruption event is emitted at all.
+    assert not any(ev.resumed for ev in false_interruption_events)
+
+
 class FlushMultiSegmentAgent(Agent):
     """Agent whose llm_node flushes the reply into two segments via FlushSentinel."""