Merge pull request #266 from askui/feat/conversation_duration

philipph-askui · web-flow · commit 76ff93c06456 · 2026-04-15T13:56:44.000+02:00
feat: add per-conversation duration to Html reports
diff --git a/src/askui/agent_base.py b/src/askui/agent_base.py
@@ -10,7 +10,7 @@
 from typing_extensions import Self
 
 from askui.agent_settings import AgentSettings
-from askui.callbacks import ConversationCallback, UsageTrackingCallback
+from askui.callbacks import ConversationCallback, ConversationStatisticsCallback
 from askui.container import telemetry
 from askui.locators.locators import Locator
 from askui.models.shared.agent_message_param import MessageParam
@@ -78,7 +78,7 @@ def __init__(
         speakers = Speakers()
         _callbacks = list(callbacks or [])
         _callbacks.append(
-            UsageTrackingCallback(
+            ConversationStatisticsCallback(
                 reporter=self._reporter,
                 pricing=self._vlm_provider.pricing,
             )
diff --git a/src/askui/callbacks/__init__.py b/src/askui/callbacks/__init__.py
@@ -1,7 +1,7 @@
 from .conversation_callback import ConversationCallback
-from .usage_tracking_callback import UsageTrackingCallback
+from .conversation_statistics_callback import ConversationStatisticsCallback
 
 __all__ = [
     "ConversationCallback",
-    "UsageTrackingCallback",
+    "ConversationStatisticsCallback",
 ]
diff --git a/src/askui/callbacks/conversation_statistics_callback.py b/src/askui/callbacks/conversation_statistics_callback.py
@@ -1,7 +1,13 @@
-"""Callback for tracking token usage and reporting usage summaries."""
+"""Callback for tracking per-conversation statistics (token usage, timing).
+
+Emits a `UsageSummary` (with per-conversation and per-step breakdowns,
+including start/end timestamps for each conversation) to a reporter when the
+conversation ends.
+"""
 
 from __future__ import annotations
 
+from datetime import datetime, timezone
 from typing import TYPE_CHECKING
 
 from opentelemetry import trace
@@ -172,15 +178,35 @@ class StepUsageSummary(UsageSummary):
 
 
 class ConversationUsageSummary(UsageSummary):
-    """Usage summary for one conversation including per-step breakdown."""
+    """Usage summary for one conversation including per-step breakdown.
+
+    Args:
+        conversation_index (int): 1-based index of the conversation within the
+            current agent lifecycle.
+        conversation_id (str): Unique identifier of the conversation.
+        step_summaries (list[StepUsageSummary]): Per-step usage summaries.
+        started_at (datetime | None): UTC timestamp captured at
+            `on_conversation_start`. `None` if timing was not tracked.
+        ended_at (datetime | None): UTC timestamp captured at
+            `on_conversation_end`. `None` if timing was not tracked.
+    """
 
     conversation_index: int
     conversation_id: str
     step_summaries: list[StepUsageSummary] = Field(default_factory=list)
+    started_at: datetime | None = None
+    ended_at: datetime | None = None
+
 
+class ConversationStatisticsCallback(ConversationCallback):
+    """Tracks per-conversation statistics (token usage per step and wall-clock
+    timing) and reports a summary at conversation end.
 
-class UsageTrackingCallback(ConversationCallback):
-    """Tracks token usage per step and reports a summary at conversation end.
+    The reported `UsageSummary` contains, for each conversation, the raw
+    ``started_at`` and ``ended_at`` UTC timestamps alongside token usage.
+    Downstream consumers (e.g. `SimpleHtmlReporter`) are responsible for
+    deriving human-readable durations from those timestamps so the raw values
+    remain available for other uses.
 
     Args:
         reporter: Reporter to write the final usage summary to.
@@ -199,12 +225,14 @@ def __init__(
         self._per_conversation_summaries: list[ConversationUsageSummary] = []
         self._per_step_summaries: list[StepUsageSummary] = []
         self._conversation_index: int = 0
+        self._conversation_started_at: datetime | None = None
 
     @override
     def on_conversation_start(self, conversation: Conversation) -> None:
         self._per_conversation_usage = UsageSummary.create_from(self._summary)
         self._per_step_summaries = []
         self._conversation_index += 1
+        self._conversation_started_at = datetime.now(tz=timezone.utc)
 
     @override
     def on_step_end(
@@ -237,9 +265,12 @@ def on_conversation_end(self, conversation: Conversation) -> None:
         generated_steps: list[StepUsageSummary] = [
             step_summary.generate() for step_summary in self._per_step_summaries
         ]
+        ended_at = datetime.now(tz=timezone.utc)
         conversation_summary = self._create_conversation_summary(
             conversation=conversation,
             generated_step_summaries=generated_steps,
+            started_at=self._conversation_started_at,
+            ended_at=ended_at,
         )
         self._per_conversation_summaries.append(conversation_summary)
         self._summary.per_conversation_summaries = list(
@@ -275,11 +306,15 @@ def _create_conversation_summary(
         self,
         conversation: Conversation,
         generated_step_summaries: list[StepUsageSummary],
+        started_at: datetime | None = None,
+        ended_at: datetime | None = None,
     ) -> ConversationUsageSummary:
         conversation_summary = ConversationUsageSummary(
             conversation_index=self._conversation_index,
             conversation_id=conversation.conversation_id,
             step_summaries=generated_step_summaries,
+            started_at=started_at,
+            ended_at=ended_at,
             input_tokens=self._per_conversation_usage.input_tokens,
             output_tokens=self._per_conversation_usage.output_tokens,
             cache_creation_input_tokens=(
diff --git a/src/askui/reporting.py b/src/askui/reporting.py
@@ -21,7 +21,10 @@
 if TYPE_CHECKING:
     from PIL import Image
 
-    from askui.callbacks.usage_tracking_callback import UsageSummary
+    from askui.callbacks.conversation_statistics_callback import (
+        ConversationUsageSummary,
+        UsageSummary,
+    )
 
 
 def normalize_to_pil_images(
@@ -37,6 +40,27 @@ def normalize_to_pil_images(
     return [image]
 
 
+def _format_duration(seconds: float) -> str:
+    """Format a duration given in seconds as ``HH:MM:SS`` or
+    ``HH:MM:SS.mmm`` for sub-second precision.
+
+    Used by `SimpleHtmlReporter` to render both the overall execution time and
+    per-conversation durations consistently.
+    """
+    total_seconds = max(float(seconds), 0.0)
+    whole_seconds = int(total_seconds)
+    millis = int(round((total_seconds - whole_seconds) * 1000))
+    if millis == 1000:
+        whole_seconds += 1
+        millis = 0
+    hours, remainder = divmod(whole_seconds, 3600)
+    minutes, secs = divmod(remainder, 60)
+    base = f"{hours:02d}:{minutes:02d}:{secs:02d}"
+    if whole_seconds == 0 and millis > 0:
+        return f"{base}.{millis:03d}"
+    return base
+
+
 def truncate_base64_images(content: Any) -> Any:
     """Replace base64 image data with a placeholder to keep reports readable.
 
@@ -1003,13 +1027,17 @@ def generate(self) -> None:
                         </p>
                         <div class="usage-breakdown-list">
                             {% for conversation_usage in usage_summary.per_conversation_summaries %}
+                            {% set conversation_duration = format_conversation_duration(conversation_usage) %}
                             <details class="usage-breakdown-item">
                                 <summary>
                                     <span class="usage-breakdown-title">
                                         Conversation #{{ conversation_usage.conversation_index }}
                                     </span>
                                     <span class="usage-breakdown-meta">
                                         {{ conversation_usage.step_summaries | length }} step(s),
+                                        {% if conversation_duration is not none %}
+                                            Duration: {{ conversation_duration }},
+                                        {% endif %}
                                         Input {{ "{:,}".format(conversation_usage.input_tokens or 0) }},
                                         Output {{ "{:,}".format(conversation_usage.output_tokens or 0) }},
                                         Cache Create {{ "{:,}".format(conversation_usage.cache_creation_input_tokens or 0) }},
@@ -1026,6 +1054,9 @@ def generate(self) -> None:
                                     <table class="nested-table">
                                         <tr>
                                             <th>Conversation ID</th>
+                                            {% if conversation_duration is not none %}
+                                            <th>Duration</th>
+                                            {% endif %}
                                             <th>Input Tokens</th>
                                             <th>Output Tokens</th>
                                             <th>Cache Create</th>
@@ -1036,6 +1067,9 @@ def generate(self) -> None:
                                         </tr>
                                         <tr class="system">
                                             <td class="mono">{{ conversation_usage.conversation_id }}</td>
+                                            {% if conversation_duration is not none %}
+                                            <td>{{ conversation_duration }}</td>
+                                            {% endif %}
                                             <td>{{ "{:,}".format(conversation_usage.input_tokens or 0) }}</td>
                                             <td>{{ "{:,}".format(conversation_usage.output_tokens or 0) }}</td>
                                             <td>{{ "{:,}".format(conversation_usage.cache_creation_input_tokens or 0) }}</td>
@@ -1141,10 +1175,28 @@ def generate(self) -> None:
         end_time = datetime.now(tz=timezone.utc)
         execution_time_formatted: str | None = None
         if self._start_time is not None:
-            total_secs = int((end_time - self._start_time).total_seconds())
-            hours, remainder = divmod(total_secs, 3600)
-            minutes, secs = divmod(remainder, 60)
-            execution_time_formatted = f"{hours:02d}:{minutes:02d}:{secs:02d}"
+            execution_time_formatted = _format_duration(
+                (end_time - self._start_time).total_seconds()
+            )
+
+        def _format_conversation_duration(
+            conversation_usage: "ConversationUsageSummary",
+        ) -> str | None:
+            """Derive the formatted conversation duration from stored timestamps.
+
+            Returns ``None`` if either ``started_at`` or ``ended_at`` is missing
+            so the template can skip rendering.
+            """
+            if (
+                conversation_usage.started_at is None
+                or conversation_usage.ended_at is None
+            ):
+                return None
+            return _format_duration(
+                (
+                    conversation_usage.ended_at - conversation_usage.started_at
+                ).total_seconds()
+            )
 
         html = template.render(
             timestamp=end_time,
@@ -1153,6 +1205,7 @@ def generate(self) -> None:
             usage_summary=self.usage_summary,
             cache_original_usage=self.cache_original_usage,
             execution_time_formatted=execution_time_formatted,
+            format_conversation_duration=_format_conversation_duration,
         )
 
         report_path = (
diff --git a/tests/unit/model_providers/test_model_pricing.py b/tests/unit/model_providers/test_model_pricing.py
@@ -4,9 +4,9 @@
 
 import pytest
 
-from askui.callbacks.usage_tracking_callback import (
+from askui.callbacks.conversation_statistics_callback import (
+    ConversationStatisticsCallback,
     UsageSummary,
-    UsageTrackingCallback,
 )
 from askui.models.shared.agent_message_param import UsageParam
 from askui.speaker.speaker import SpeakerResult
@@ -98,12 +98,12 @@ def _assert_close(
     assert abs(actual - expected) <= tolerance
 
 
-class TestUsageTrackingCallbackCost:
+class TestConversationStatisticsCallbackCost:
     def _make_callback(
         self, pricing: ModelPricing | None = None
-    ) -> tuple[UsageTrackingCallback, MagicMock]:
+    ) -> tuple[ConversationStatisticsCallback, MagicMock]:
         reporter = MagicMock()
-        callback = UsageTrackingCallback(reporter=reporter, pricing=pricing)
+        callback = ConversationStatisticsCallback(reporter=reporter, pricing=pricing)
         return callback, reporter
 
     @pytest.mark.parametrize(
@@ -245,6 +245,9 @@ def test_tracks_per_step_per_conversation_and_total_usage(self) -> None:
         assert per_conversation_summary.output_tokens == 30
         _assert_close(per_conversation_summary.total_cost, 0.0009)
         assert len(per_conversation_summary.step_summaries) == 2
+        assert per_conversation_summary.started_at is not None
+        assert per_conversation_summary.ended_at is not None
+        assert per_conversation_summary.ended_at >= per_conversation_summary.started_at
 
         first_step = per_conversation_summary.step_summaries[0]
         assert first_step.step_index == 0
@@ -301,6 +304,12 @@ def test_accumulates_multiple_conversations(self) -> None:
         assert len(summary.per_conversation_summaries) == 2
         assert summary.per_conversation_summaries[0].conversation_id == "conversation-1"
         assert summary.per_conversation_summaries[1].conversation_id == "conversation-2"
+        for per_conversation_summary in summary.per_conversation_summaries:
+            assert per_conversation_summary.started_at is not None
+            assert per_conversation_summary.ended_at is not None
+            assert (
+                per_conversation_summary.ended_at >= per_conversation_summary.started_at
+            )
 
     def test_includes_cache_costs_from_provider_pricing(self) -> None:
         pricing = ModelPricing(

Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`from .conversation_callback import ConversationCallback`
`2`		`-from .usage_tracking_callback import UsageTrackingCallback`
	`2`	`+from .conversation_statistics_callback import ConversationStatisticsCallback`
`3`	`3`
`4`	`4`	`__all__ = [`
`5`	`5`	`"ConversationCallback",`
`6`		`- "UsageTrackingCallback",`
	`6`	`+ "ConversationStatisticsCallback",`
`7`	`7`	`]`