Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions livekit-agents/livekit/agents/llm/chat_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,14 @@ class MetricsReport(TypedDict, total=False):
User `ChatMessage` only
"""

first_transcript_after_eos_delay: float
"""Time between the end of speech (VAD or STT EOS) and the first transcript event
(interim or final) received after it. Unlike `transcription_delay` (time to the
*final* transcript), this measures the latency to the provider's *first* result.

User `ChatMessage` only
"""

on_user_turn_completed_delay: float
"""Time taken to invoke the developer's `Agent.on_user_turn_completed` callback.

Expand Down
10 changes: 10 additions & 0 deletions livekit-agents/livekit/agents/metrics/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,16 @@ class EOUMetrics(_BaseMetrics):
Set to 0.0 if the end of speech was not detected.
"""

first_transcript_after_eos_delay: float = 0.0
"""Time between the end of speech (VAD or STT EOS) and the first transcript event
(interim or final) received after it.

Unlike ``transcription_delay`` (which measures the time to the *final* transcript),
this captures how quickly the provider returns its *first* result after the user
stops speaking, which is useful for comparing provider latency.
Set to 0.0 if the end of speech was not detected or no transcript event was
recorded after it.
"""

on_user_turn_completed_delay: float
"""Time taken to invoke the user's `Agent.on_user_turn_completed` callback."""

Expand Down
3 changes: 3 additions & 0 deletions livekit-agents/livekit/agents/metrics/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ def log_metrics(metrics: AgentMetrics, *, logger: logging.Logger | None = None)
| {
"end_of_utterance_delay": round(metrics.end_of_utterance_delay, 2),
"transcription_delay": round(metrics.transcription_delay, 2),
"first_transcript_after_eos_delay": round(
metrics.first_transcript_after_eos_delay, 2
),
},
)
elif isinstance(metrics, STTMetrics):
Expand Down
1 change: 1 addition & 0 deletions livekit-agents/livekit/agents/telemetry/trace_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
ATTR_TRANSCRIPT_CONFIDENCE = "lk.transcript_confidence"
ATTR_TRANSCRIPTION_DELAY = "lk.transcription_delay"
ATTR_END_OF_TURN_DELAY = "lk.end_of_turn_delay"
ATTR_FIRST_TRANSCRIPT_AFTER_EOS_DELAY = "lk.first_transcript_after_eos_delay"

# metrics
ATTR_LLM_METRICS = "lk.llm_metrics"
Expand Down
6 changes: 6 additions & 0 deletions livekit-agents/livekit/agents/voice/agent_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -2221,6 +2221,7 @@ async def _user_turn_completed_task(
timestamp=time.time(),
end_of_utterance_delay=info.end_of_turn_delay or 0.0,
transcription_delay=info.transcription_delay or 0.0,
first_transcript_after_eos_delay=info.first_transcript_after_eos_delay or 0.0,
on_user_turn_completed_delay=on_user_turn_completed_delay,
speech_id=speech_handle.id,
metadata=metadata,
Expand Down Expand Up @@ -3899,6 +3900,11 @@ def _init_metrics_from_end_of_turn(self, info: _EndOfTurnInfo) -> llm.MetricsRep
if info.end_of_turn_delay is not None:
metrics_report["end_of_turn_delay"] = info.end_of_turn_delay

if info.first_transcript_after_eos_delay is not None:
metrics_report["first_transcript_after_eos_delay"] = (
info.first_transcript_after_eos_delay
)

return metrics_report

# move them to the end to avoid shadowing the same named modules for mypy
Expand Down
56 changes: 56 additions & 0 deletions livekit-agents/livekit/agents/voice/audio_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ class _EndOfTurnInfo:
stopped_speaking_at: float | None
transcription_delay: float | None
end_of_turn_delay: float | None
first_transcript_after_eos_delay: float | None


@dataclass
Expand Down Expand Up @@ -175,6 +176,12 @@ def __init__(
self._last_speaking_time: float | None = None
self._speech_start_time: float | None = None

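# anchors for the first_transcript_after_eos_delay metric: the time the user's
# speech ended (VAD or STT EOS) and the time of the first transcript event that
# arrives after that EOS. Every end-of-speech re-arms the measurement (see
# `_mark_end_of_speech`); both anchors are cleared by `clear_user_turn` and by
# `_bounce_eou_task` when no newer end-of-speech has been recorded.
# NOTE(review): an explicit reset on START_OF_SPEECH is not visible here — confirm.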
self._end_of_speech_time: float | None = None
self._first_transcript_after_eos_time: float | None = None

# used for manual commit_user_turn
self._final_transcript_received = asyncio.Event()
self._final_transcript_confidence: list[float] = []
Expand Down Expand Up @@ -723,6 +730,8 @@ def clear_user_turn(self) -> None:
self._last_final_transcript_time = None
self._speech_start_time = None
self._last_speaking_time = None
self._end_of_speech_time = None
self._first_transcript_after_eos_time = None
self._vad_speech_started = False
self._user_turn_committed = False
self._turn_tracker = _UserTurnTracker()
Expand Down Expand Up @@ -835,6 +844,23 @@ def current_transcript(self) -> str:
return self._audio_transcript + " " + self._audio_interim_transcript
return self._audio_transcript

def _mark_end_of_speech(self) -> None:
    """Stamp an end-of-speech anchor for the ``first_transcript_after_eos`` metric.

    Stores the current wall-clock time as the end-of-speech time and discards any
    previously recorded first-transcript time, so the next transcript event starts
    a fresh measurement from this point.
    """
    now = time.time()
    # drop the stale first-transcript stamp before publishing the new anchor
    self._first_transcript_after_eos_time = None
    self._end_of_speech_time = now

def _mark_first_transcript_after_eos(self) -> None:
    """Stamp the arrival time of the first transcript event seen after end of speech.

    Does nothing until an end-of-speech time has been recorded, and does nothing once
    a first-transcript time is already stored, so later transcript events never move
    the anchor. The stored time is what ``first_transcript_after_eos_delay`` is
    computed from.
    """
    if self._end_of_speech_time is None:
        # no end-of-speech observed yet: nothing to measure against
        return
    if self._first_transcript_after_eos_time is not None:
        # already stamped for the current end-of-speech
        return
    self._first_transcript_after_eos_time = time.time()

async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
# Collect provider-known STT ids for this user turn. The actual attribute
# is written once when the user_turn span ends (see _on_end_of_turn), to
Expand Down Expand Up @@ -904,6 +930,8 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
if not transcript:
return

self._mark_first_transcript_after_eos()

self._hooks.on_final_transcript(
ev,
speaking=self._speaking
Expand Down Expand Up @@ -971,6 +999,8 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
if not transcript:
return

self._mark_first_transcript_after_eos()

logger.debug(
"received user preflight transcript",
extra={"user_transcript": transcript, "language": self._last_language},
Expand Down Expand Up @@ -1003,6 +1033,8 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
if self._vad or self._turn_detection_mode == "stt"
else None,
)
if ev.alternatives and ev.alternatives[0].text:
self._mark_first_transcript_after_eos()
self._audio_interim_transcript = ev.alternatives[0].text

elif ev.type == stt.SpeechEventType.END_OF_SPEECH and self._turn_detection_mode == "stt":
Expand Down Expand Up @@ -1031,6 +1063,7 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
)

self._speaking = False
self._mark_end_of_speech()
self._user_turn_committed = True
if not self._vad or self._last_speaking_time is None:
# vad disabled or missed a speech, use stt timestamp
Expand Down Expand Up @@ -1089,6 +1122,7 @@ async def _on_vad_event(self, ev: vad.VADEvent) -> None:

self._vad_speech_started = False
self._speaking = False
self._mark_end_of_speech()

if self._vad_base_turn_detection or (
self._turn_detection_mode == "stt" and self._user_turn_committed
Expand Down Expand Up @@ -1127,6 +1161,8 @@ async def _bounce_eou_task(
last_speaking_time: float | None = None,
last_final_transcript_time: float | None = None,
speech_start_time: float | None = None,
end_of_speech_time: float | None = None,
first_transcript_after_eos_time: float | None = None,
) -> None:
endpointing_delay = self._endpointing.min_delay
user_turn_span = self._ensure_user_turn_span()
Expand Down Expand Up @@ -1201,6 +1237,7 @@ async def _bounce_eou_task(
stopped_speaking_at = None
transcription_delay = None
end_of_turn_delay = None
first_transcript_after_eos_delay = None

# sometimes, we can't calculate the metrics because VAD was unreliable.
# in this case, we just ignore the calculation, it's better than providing likely wrong values
Expand All @@ -1214,6 +1251,13 @@ async def _bounce_eou_task(
transcription_delay = max(last_final_transcript_time - last_speaking_time, 0)
end_of_turn_delay = time.time() - last_speaking_time

# first transcript after end-of-speech: independent of the VAD speaking
# anchors above, only needs the EOS time and the first post-EOS transcript.
if end_of_speech_time is not None and first_transcript_after_eos_time is not None:
first_transcript_after_eos_delay = max(
first_transcript_after_eos_time - end_of_speech_time, 0
)

committed = self._hooks.on_end_of_turn(
_EndOfTurnInfo(
skip_reply=skip_reply,
Expand All @@ -1223,6 +1267,7 @@ async def _bounce_eou_task(
end_of_turn_delay=end_of_turn_delay,
started_speaking_at=started_speaking_at,
stopped_speaking_at=stopped_speaking_at,
first_transcript_after_eos_delay=first_transcript_after_eos_delay,
)
)
if committed:
Expand All @@ -1232,6 +1277,9 @@ async def _bounce_eou_task(
trace_types.ATTR_TRANSCRIPT_CONFIDENCE: confidence_avg,
trace_types.ATTR_TRANSCRIPTION_DELAY: transcription_delay or 0,
trace_types.ATTR_END_OF_TURN_DELAY: end_of_turn_delay or 0,
trace_types.ATTR_FIRST_TRANSCRIPT_AFTER_EOS_DELAY: (
first_transcript_after_eos_delay or 0
),
}
)
if self._stt_request_ids:
Expand All @@ -1254,6 +1302,12 @@ async def _bounce_eou_task(
self._vad_speech_started = False
self._last_speaking_time = None

# only reset the EOS anchors if no newer end-of-speech happened in
# the meantime (mirrors the last_speaking_time guard above)
if self._end_of_speech_time == end_of_speech_time:
self._end_of_speech_time = None
self._first_transcript_after_eos_time = None

self._user_turn_committed = False

if self._end_of_turn_task is not None:
Expand All @@ -1266,6 +1320,8 @@ async def _bounce_eou_task(
self._last_speaking_time,
self._last_final_transcript_time,
self._user_turn_start,
self._end_of_speech_time,
self._first_transcript_after_eos_time,
)
)

Expand Down
Loading