Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion livekit-agents/livekit/agents/voice/agent_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -1727,8 +1727,16 @@ def _on_input_speech_stopped(self, ev: llm.InputSpeechStoppedEvent) -> None:
)

def _on_input_audio_transcription_completed(self, ev: llm.InputTranscriptionCompleted) -> None:
# `item_id` is threaded through so every interim/final transcript of the
# same utterance carries the same id — consumers that need per-utterance
# dedup (e.g. "render a `user-speech-received` placeholder exactly once")
# can correlate via the provider-agnostic event surface instead of
# dropping into provider-specific `openai_server_event_received`, which
# isn't portable across realtime backends (#6109).
self._session._user_input_transcribed(
UserInputTranscribedEvent(transcript=ev.transcript, is_final=ev.is_final)
UserInputTranscribedEvent(
transcript=ev.transcript, is_final=ev.is_final, item_id=ev.item_id
)
)
Comment on lines +1737 to 1740

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚩 Remote session transport does not forward item_id

The _on_user_input_transcribed handler in livekit-agents/livekit/agents/voice/remote_session.py:466-474 constructs the protobuf UserInputTranscribed message with only transcript and is_final — the new item_id field is not forwarded. This means remote session consumers won't receive the item_id for dedup purposes. This is not a regression (the protobuf schema would need a separate update), but it limits the utility of the feature for remote/distributed deployments.

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.


if ev.is_final:
Expand Down
7 changes: 7 additions & 0 deletions livekit-agents/livekit/agents/voice/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,13 @@ class UserInputTranscribedEvent(BaseModel):
is_final: bool
speaker_id: str | None = None
language: LanguageCode | None = None
item_id: str | None = None
"""Stable id identifying the user utterance this transcript belongs to. On
realtime models, every interim and final ``UserInputTranscribedEvent`` for
a single utterance shares the same ``item_id``, so consumers can dedup
interim transcripts and react exactly once per utterance using the
provider-agnostic event surface. ``None`` on STT paths where no upstream
item id exists. See https://github.com/livekit/agents/issues/6109."""
created_at: float = Field(default_factory=time.time)


Expand Down
70 changes: 70 additions & 0 deletions tests/test_user_input_transcribed_event.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Unit tests for the `item_id` field on ``UserInputTranscribedEvent``.

Pins the contract that the realtime-path transcription event surface exposes
the upstream ``InputTranscriptionCompleted.item_id`` so consumers can dedup
interim transcripts of a single utterance without dropping into
provider-specific raw events.

Regression target: https://github.com/livekit/agents/issues/6109
"""

from __future__ import annotations

import pytest

from livekit.agents.llm.realtime import InputTranscriptionCompleted
from livekit.agents.voice.events import UserInputTranscribedEvent

pytestmark = pytest.mark.unit


def test_user_input_transcribed_event_carries_item_id() -> None:
    """An ``item_id`` passed to the constructor is readable back from the event."""
    expected_id = "item_abc123"
    event = UserInputTranscribedEvent(
        item_id=expected_id,
        transcript="hello world",
        is_final=False,
    )
    assert event.item_id == expected_id


def test_user_input_transcribed_event_item_id_defaults_to_none() -> None:
    """Omitting ``item_id`` at construction leaves the attribute as ``None``.

    Callers that have no upstream item id can therefore build the event
    from ``transcript`` and ``is_final`` alone.
    """
    event = UserInputTranscribedEvent(
        is_final=True,
        transcript="hello world",
    )
    assert event.item_id is None


def test_user_input_transcribed_event_serialises_item_id() -> None:
    """The dict returned by ``model_dump()`` carries the event's ``item_id``.

    This matters for any consumer that serialises the event before
    handing it on.
    """
    payload = UserInputTranscribedEvent(
        transcript="hi",
        is_final=True,
        item_id="item_xyz",
    ).model_dump()
    assert payload["item_id"] == "item_xyz"


def test_input_transcription_completed_item_id_can_thread_to_event() -> None:
    """An ``InputTranscriptionCompleted.item_id`` can be copied onto the public event.

    Builds the upstream realtime event, then constructs a
    ``UserInputTranscribedEvent`` from its ``transcript``, ``is_final`` and
    ``item_id`` fields by hand. Only the field-level contract is checked
    here; the activity's own plumbing is not exercised.
    """
    source_event = InputTranscriptionCompleted(
        item_id="item_realtime_42",
        transcript="hello world",
        is_final=True,
    )

    forwarded = UserInputTranscribedEvent(
        item_id=source_event.item_id,
        transcript=source_event.transcript,
        is_final=source_event.is_final,
    )

    # Same two comparisons as the chained form `a == b == c`, in the same order.
    assert forwarded.item_id == source_event.item_id
    assert source_event.item_id == "item_realtime_42"
Loading