livekit · anshulkulhari7 · Jun 15, 2026
@@ -274,6 +274,14 @@ class MetricsReport(TypedDict, total=False):
     User `ChatMessage` only
     """
 
+    first_transcript_after_eos_delay: float
+    """Time between the end of speech (VAD or STT EOS) and the first transcript event
+    (interim or final) received after it. Unlike `transcription_delay` (time to the
+    *final* transcript), this measures the latency to the provider's *first* result.
+
+    User `ChatMessage` only
+    """
+
     on_user_turn_completed_delay: float
     """Time taken to invoke the developer's `Agent.on_user_turn_completed` callback.
 

@@ -104,6 +104,16 @@ class EOUMetrics(_BaseMetrics):
     Set to 0.0 if the end of speech was not detected.
     """
 
+    first_transcript_after_eos_delay: float = 0.0
+    """Time between the end of speech (VAD or STT EOS) and the first transcript event
+    (interim or final) received after it.
+
+    Unlike ``transcription_delay`` (which measures the time to the *final* transcript),
+    this captures how quickly the provider returns its *first* result after the user
+    stops speaking, which is useful for comparing provider latency.
+    Set to 0.0 if the end of speech was not detected or no transcript arrived after it.
+    """
+
     on_user_turn_completed_delay: float
     """Time taken to invoke the user's `Agent.on_user_turn_completed` callback."""
 

@@ -82,6 +82,9 @@ def log_metrics(metrics: AgentMetrics, *, logger: logging.Logger | None = None)
             | {
                 "end_of_utterance_delay": round(metrics.end_of_utterance_delay, 2),
                 "transcription_delay": round(metrics.transcription_delay, 2),
+                "first_transcript_after_eos_delay": round(
+                    metrics.first_transcript_after_eos_delay, 2
+                ),
             },
         )
     elif isinstance(metrics, STTMetrics):

@@ -59,6 +59,7 @@
 ATTR_TRANSCRIPT_CONFIDENCE = "lk.transcript_confidence"
 ATTR_TRANSCRIPTION_DELAY = "lk.transcription_delay"
 ATTR_END_OF_TURN_DELAY = "lk.end_of_turn_delay"
+ATTR_FIRST_TRANSCRIPT_AFTER_EOS_DELAY = "lk.first_transcript_after_eos_delay"
 
 # metrics
 ATTR_LLM_METRICS = "lk.llm_metrics"

@@ -2221,6 +2221,7 @@ async def _user_turn_completed_task(
             timestamp=time.time(),
             end_of_utterance_delay=info.end_of_turn_delay or 0.0,
             transcription_delay=info.transcription_delay or 0.0,
+            first_transcript_after_eos_delay=info.first_transcript_after_eos_delay or 0.0,
             on_user_turn_completed_delay=on_user_turn_completed_delay,
             speech_id=speech_handle.id,
             metadata=metadata,
@@ -3899,6 +3900,11 @@ def _init_metrics_from_end_of_turn(self, info: _EndOfTurnInfo) -> llm.MetricsRep
         if info.end_of_turn_delay is not None:
             metrics_report["end_of_turn_delay"] = info.end_of_turn_delay
 
+        if info.first_transcript_after_eos_delay is not None:
+            metrics_report["first_transcript_after_eos_delay"] = (
+                info.first_transcript_after_eos_delay
+            )
+
         return metrics_report
 
     # move them to the end to avoid shadowing the same named modules for mypy

@@ -55,6 +55,7 @@ class _EndOfTurnInfo:
     stopped_speaking_at: float | None
     transcription_delay: float | None
     end_of_turn_delay: float | None
+    first_transcript_after_eos_delay: float | None
 
 
 @dataclass
@@ -175,6 +176,12 @@ def __init__(
         self._last_speaking_time: float | None = None
         self._speech_start_time: float | None = None
 
+        # anchors for the first_transcript_after_eos_delay metric: end-of-speech time (VAD
+        # or STT EOS) and the time of the first transcript event after it. Re-armed on each
+        # EOS; reset by clear_user_turn() and by the end-of-turn task (unless a newer EOS came).
+        self._end_of_speech_time: float | None = None
+        self._first_transcript_after_eos_time: float | None = None
+
         # used for manual commit_user_turn
         self._final_transcript_received = asyncio.Event()
         self._final_transcript_confidence: list[float] = []
@@ -723,6 +730,8 @@ def clear_user_turn(self) -> None:
         self._last_final_transcript_time = None
         self._speech_start_time = None
         self._last_speaking_time = None
+        self._end_of_speech_time = None
+        self._first_transcript_after_eos_time = None
         self._vad_speech_started = False
         self._user_turn_committed = False
         self._turn_tracker = _UserTurnTracker()
@@ -835,6 +844,23 @@ def current_transcript(self) -> str:
             return self._audio_transcript + " " + self._audio_interim_transcript
         return self._audio_transcript
 
+    def _mark_end_of_speech(self) -> None:
+        """Record an end-of-speech (VAD or STT EOS) for the ``first_transcript_after_eos``
+        metric and arm a fresh first-transcript measurement from this point.
+        """
+        self._end_of_speech_time = time.time()
+        self._first_transcript_after_eos_time = None
+
+    def _mark_first_transcript_after_eos(self) -> None:
+        """Record the arrival time of the first transcript event after end of speech.
+
+        Used to compute ``first_transcript_after_eos_delay``. The anchor is set at most once
+        per end-of-speech: it is a no-op until an EOS has been recorded, and later transcripts
+        do not move it until ``_mark_end_of_speech`` re-arms the measurement.
+        """
+        if self._end_of_speech_time is not None and self._first_transcript_after_eos_time is None:
+            self._first_transcript_after_eos_time = time.time()
+
     async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
         # Collect provider-known STT ids for this user turn. The actual attribute
         # is written once when the user_turn span ends (see _on_end_of_turn), to
@@ -904,6 +930,8 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
             if not transcript:
                 return
 
+            self._mark_first_transcript_after_eos()
+
             self._hooks.on_final_transcript(
                 ev,
                 speaking=self._speaking
@@ -971,6 +999,8 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
             if not transcript:
                 return
 
+            self._mark_first_transcript_after_eos()
+
             logger.debug(
                 "received user preflight transcript",
                 extra={"user_transcript": transcript, "language": self._last_language},
@@ -1003,6 +1033,8 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
                 if self._vad or self._turn_detection_mode == "stt"
                 else None,
             )
+            if ev.alternatives and ev.alternatives[0].text:
+                self._mark_first_transcript_after_eos()
             self._audio_interim_transcript = ev.alternatives[0].text
 
         elif ev.type == stt.SpeechEventType.END_OF_SPEECH and self._turn_detection_mode == "stt":
@@ -1031,6 +1063,7 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
                     )
 
             self._speaking = False
+            self._mark_end_of_speech()
             self._user_turn_committed = True
             if not self._vad or self._last_speaking_time is None:
                 # vad disabled or missed a speech, use stt timestamp
@@ -1089,6 +1122,7 @@ async def _on_vad_event(self, ev: vad.VADEvent) -> None:
 
             self._vad_speech_started = False
             self._speaking = False
+            self._mark_end_of_speech()
 
             if self._vad_base_turn_detection or (
                 self._turn_detection_mode == "stt" and self._user_turn_committed
@@ -1127,6 +1161,8 @@ async def _bounce_eou_task(
             last_speaking_time: float | None = None,
             last_final_transcript_time: float | None = None,
             speech_start_time: float | None = None,
+            end_of_speech_time: float | None = None,
+            first_transcript_after_eos_time: float | None = None,
         ) -> None:
             endpointing_delay = self._endpointing.min_delay
             user_turn_span = self._ensure_user_turn_span()
@@ -1201,6 +1237,7 @@ async def _bounce_eou_task(
             stopped_speaking_at = None
             transcription_delay = None
             end_of_turn_delay = None
+            first_transcript_after_eos_delay = None
 
             # sometimes, we can't calculate the metrics because VAD was unreliable.
             # in this case, we just ignore the calculation, it's better than providing likely wrong values
@@ -1214,6 +1251,13 @@ async def _bounce_eou_task(
                 transcription_delay = max(last_final_transcript_time - last_speaking_time, 0)
                 end_of_turn_delay = time.time() - last_speaking_time
 
+            # first transcript after end-of-speech: independent of the VAD speaking
+            # anchors above, only needs the EOS time and the first post-EOS transcript.
+            if end_of_speech_time is not None and first_transcript_after_eos_time is not None:
+                first_transcript_after_eos_delay = max(
+                    first_transcript_after_eos_time - end_of_speech_time, 0
+                )
+
             committed = self._hooks.on_end_of_turn(
                 _EndOfTurnInfo(
                     skip_reply=skip_reply,
@@ -1223,6 +1267,7 @@ async def _bounce_eou_task(
                     end_of_turn_delay=end_of_turn_delay,
                     started_speaking_at=started_speaking_at,
                     stopped_speaking_at=stopped_speaking_at,
+                    first_transcript_after_eos_delay=first_transcript_after_eos_delay,
                 )
             )
             if committed:
@@ -1232,6 +1277,9 @@ async def _bounce_eou_task(
                         trace_types.ATTR_TRANSCRIPT_CONFIDENCE: confidence_avg,
                         trace_types.ATTR_TRANSCRIPTION_DELAY: transcription_delay or 0,
                         trace_types.ATTR_END_OF_TURN_DELAY: end_of_turn_delay or 0,
+                        trace_types.ATTR_FIRST_TRANSCRIPT_AFTER_EOS_DELAY: (
+                            first_transcript_after_eos_delay or 0
+                        ),
                     }
                 )
                 if self._stt_request_ids:
@@ -1254,6 +1302,12 @@ async def _bounce_eou_task(
                     self._vad_speech_started = False
                     self._last_speaking_time = None
 
+                # only reset the EOS anchors if no newer end-of-speech happened in
+                # the meantime (mirrors the last_speaking_time guard above)
+                if self._end_of_speech_time == end_of_speech_time:
+                    self._end_of_speech_time = None
+                    self._first_transcript_after_eos_time = None
+
             self._user_turn_committed = False
 
         if self._end_of_turn_task is not None:
@@ -1266,6 +1320,8 @@ async def _bounce_eou_task(
                 self._last_speaking_time,
                 self._last_final_transcript_time,
                 self._user_turn_start,
+                self._end_of_speech_time,
+                self._first_transcript_after_eos_time,
             )
         )