Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion livekit-agents/livekit/agents/voice/agent_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -1728,7 +1728,11 @@ def _on_input_speech_stopped(self, ev: llm.InputSpeechStoppedEvent) -> None:

def _on_input_audio_transcription_completed(self, ev: llm.InputTranscriptionCompleted) -> None:
self._session._user_input_transcribed(
UserInputTranscribedEvent(transcript=ev.transcript, is_final=ev.is_final)
UserInputTranscribedEvent(
transcript=ev.transcript,
is_final=ev.is_final,
item_id=ev.item_id,
)
)

if ev.is_final:
Expand Down
2 changes: 2 additions & 0 deletions livekit-agents/livekit/agents/voice/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,8 @@ class UserInputTranscribedEvent(BaseModel):
type: Literal["user_input_transcribed"] = "user_input_transcribed"
transcript: str
is_final: bool
item_id: str | None = None
"""Provider-specific ID for the transcribed input item, when available."""
speaker_id: str | None = None
language: LanguageCode | None = None
created_at: float = Field(default_factory=time.time)
Expand Down
29 changes: 26 additions & 3 deletions tests/test_agent_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,7 @@
inference,
vad,
)
from livekit.agents.llm import (
FunctionToolCall,
)
from livekit.agents.llm import FunctionToolCall, InputTranscriptionCompleted
from livekit.agents.llm.chat_context import ChatContext, ChatMessage
from livekit.agents.stt import SpeechData, SpeechEvent, SpeechEventType
from livekit.agents.utils import aio
Expand Down Expand Up @@ -92,6 +90,31 @@ async def on_user_turn_completed(self, turn_ctx: ChatContext, new_message: ChatM
SESSION_TIMEOUT = 60.0


def test_realtime_user_input_transcription_preserves_item_id() -> None:
    """Check that the realtime transcription handler forwards ``item_id``.

    A bare ``AgentActivity`` is paired with a stub session that records every
    event passed to ``_user_input_transcribed``. After feeding one
    ``InputTranscriptionCompleted`` through
    ``_on_input_audio_transcription_completed``, exactly one event must have
    been recorded, carrying the same transcript, finality flag and item id.
    """
    received: list[UserInputTranscribedEvent] = []

    class _RecordingSession:
        # Stand-in exposing only the session hook this test observes.
        def _user_input_transcribed(self, ev: UserInputTranscribedEvent) -> None:
            received.append(ev)

    # Allocate the activity without running __init__; `_session` is the only
    # attribute this test sets on it.
    activity = object.__new__(AgentActivity)
    activity._session = _RecordingSession()

    # NOTE(review): is_final=False presumably keeps the handler out of its
    # final-transcript path, which may need state this bare instance lacks -
    # confirm against the handler.
    completed = InputTranscriptionCompleted(
        item_id="item_123",
        transcript="hello",
        is_final=False,
    )
    AgentActivity._on_input_audio_transcription_completed(activity, completed)

    assert len(received) == 1
    forwarded = received[0]
    assert forwarded.transcript == "hello"
    assert forwarded.is_final is False
    assert forwarded.item_id == "item_123"


async def test_events_and_metrics() -> None:
speed = 1
actions = FakeActions()
Expand Down