@@ -2,14 +2,307 @@ package evaluation
22
33import (
44 "cmp"
5+ "context"
56 "encoding/json"
67 "fmt"
78 "os"
89 "path/filepath"
10+ "strings"
11+ "time"
912
13+ "github.com/docker/cagent/pkg/chat"
1014 "github.com/docker/cagent/pkg/session"
15+ "github.com/docker/cagent/pkg/tools"
1116)
1217
18+ // SaveRunSessions saves all eval sessions to a SQLite database file.
19+ // The database follows the same schema as the main session store,
20+ // allowing the sessions to be loaded and inspected using standard session tools.
21+ func SaveRunSessions (ctx context.Context , run * EvalRun , outputDir string ) (string , error ) {
22+ dbPath := filepath .Join (outputDir , run .Name + ".db" )
23+
24+ // Create output directory if needed
25+ if err := os .MkdirAll (outputDir , 0o755 ); err != nil {
26+ return "" , fmt .Errorf ("creating output directory: %w" , err )
27+ }
28+
29+ // Create a new SQLite session store for this eval run
30+ store , err := session .NewSQLiteSessionStore (dbPath )
31+ if err != nil {
32+ return "" , fmt .Errorf ("creating session store: %w" , err )
33+ }
34+ defer func () {
35+ if closer , ok := store .(interface { Close () error }); ok {
36+ _ = closer .Close ()
37+ }
38+ }()
39+
40+ // Save each result's session to the database
41+ for i := range run .Results {
42+ result := & run .Results [i ]
43+ if result .Session == nil {
44+ continue
45+ }
46+
47+ if err := store .AddSession (ctx , result .Session ); err != nil {
48+ return "" , fmt .Errorf ("saving session for %q: %w" , result .Title , err )
49+ }
50+ }
51+
52+ return dbPath , nil
53+ }
54+
55+ // SessionFromEvents reconstructs a session from raw container output events.
56+ // This parses the JSON events emitted by cagent --json and builds a session
57+ // with the conversation history.
58+ func SessionFromEvents (events []map [string ]any , title , question string ) * session.Session {
59+ sess := session .New (
60+ session .WithTitle (title ),
61+ session .WithToolsApproved (true ),
62+ )
63+
64+ // Add the user question as the first message
65+ if question != "" {
66+ sess .AddMessage (session .UserMessage (question ))
67+ }
68+
69+ // Track current assistant message being built
70+ var currentContent strings.Builder
71+ var currentReasoningContent strings.Builder
72+ var currentToolCalls []tools.ToolCall
73+ var currentToolDefinitions []tools.Tool
74+ var currentAgentName string
75+ var currentModel string
76+ var currentUsage * chat.Usage
77+ var currentCost float64
78+
79+ // Helper to flush current assistant message
80+ flushAssistantMessage := func () {
81+ if currentContent .Len () > 0 || currentReasoningContent .Len () > 0 || len (currentToolCalls ) > 0 {
82+ msg := & session.Message {
83+ AgentName : currentAgentName ,
84+ Message : chat.Message {
85+ Role : chat .MessageRoleAssistant ,
86+ Content : currentContent .String (),
87+ ReasoningContent : currentReasoningContent .String (),
88+ ToolCalls : currentToolCalls ,
89+ ToolDefinitions : currentToolDefinitions ,
90+ CreatedAt : time .Now ().Format (time .RFC3339 ),
91+ Model : currentModel ,
92+ Usage : currentUsage ,
93+ Cost : currentCost ,
94+ },
95+ }
96+ sess .AddMessage (msg )
97+ currentContent .Reset ()
98+ currentReasoningContent .Reset ()
99+ currentToolCalls = nil
100+ currentToolDefinitions = nil
101+ currentModel = ""
102+ currentUsage = nil
103+ currentCost = 0
104+ }
105+ }
106+
107+ for _ , event := range events {
108+ eventType , _ := event ["type" ].(string )
109+
110+ switch eventType {
111+ case "agent_choice" :
112+ // Accumulate agent response content
113+ if content , ok := event ["content" ].(string ); ok {
114+ currentContent .WriteString (content )
115+ }
116+ if agentName , ok := event ["agent_name" ].(string ); ok && agentName != "" {
117+ currentAgentName = agentName
118+ }
119+
120+ case "agent_choice_reasoning" :
121+ // Accumulate reasoning content (for models like DeepSeek, Claude with extended thinking)
122+ if content , ok := event ["content" ].(string ); ok {
123+ currentReasoningContent .WriteString (content )
124+ }
125+ if agentName , ok := event ["agent_name" ].(string ); ok && agentName != "" {
126+ currentAgentName = agentName
127+ }
128+
129+ case "tool_call" :
130+ // Parse tool call and add to current message
131+ if tc , ok := event ["tool_call" ].(map [string ]any ); ok {
132+ toolCall := parseToolCall (tc )
133+ currentToolCalls = append (currentToolCalls , toolCall )
134+ }
135+ // Parse tool definition if present
136+ if td , ok := event ["tool_definition" ].(map [string ]any ); ok {
137+ toolDef := parseToolDefinition (td )
138+ currentToolDefinitions = append (currentToolDefinitions , toolDef )
139+ } else {
140+ // Add empty tool definition to maintain index alignment with tool calls
141+ currentToolDefinitions = append (currentToolDefinitions , tools.Tool {})
142+ }
143+ if agentName , ok := event ["agent_name" ].(string ); ok && agentName != "" {
144+ currentAgentName = agentName
145+ }
146+
147+ case "tool_call_response" :
148+ // Flush any pending assistant message before adding tool response
149+ flushAssistantMessage ()
150+
151+ // Add tool response message
152+ if tc , ok := event ["tool_call" ].(map [string ]any ); ok {
153+ toolCallID , _ := tc ["id" ].(string )
154+ response , _ := event ["response" ].(string )
155+
156+ msg := & session.Message {
157+ Message : chat.Message {
158+ Role : chat .MessageRoleTool ,
159+ Content : response ,
160+ ToolCallID : toolCallID ,
161+ CreatedAt : time .Now ().Format (time .RFC3339 ),
162+ },
163+ }
164+ sess .AddMessage (msg )
165+ }
166+
167+ case "token_usage" :
168+ // Update session token usage
169+ if usage , ok := event ["usage" ].(map [string ]any ); ok {
170+ if inputTokens , ok := usage ["input_tokens" ].(float64 ); ok {
171+ sess .InputTokens = int64 (inputTokens )
172+ }
173+ if outputTokens , ok := usage ["output_tokens" ].(float64 ); ok {
174+ sess .OutputTokens = int64 (outputTokens )
175+ }
176+ if cost , ok := usage ["cost" ].(float64 ); ok {
177+ sess .Cost = cost
178+ }
179+ // Extract per-message usage if available
180+ if lastMsg , ok := usage ["last_message" ].(map [string ]any ); ok {
181+ currentUsage = parseMessageUsage (lastMsg )
182+ if model , ok := lastMsg ["Model" ].(string ); ok {
183+ currentModel = model
184+ }
185+ if msgCost , ok := lastMsg ["Cost" ].(float64 ); ok {
186+ currentCost = msgCost
187+ }
188+ }
189+ }
190+
191+ case "error" :
192+ // Flush any pending assistant message before adding error
193+ flushAssistantMessage ()
194+
195+ // Add error as a system message so it's visible in the session
196+ if errorMsg , ok := event ["error" ].(string ); ok && errorMsg != "" {
197+ msg := & session.Message {
198+ Message : chat.Message {
199+ Role : chat .MessageRoleSystem ,
200+ Content : "Error: " + errorMsg ,
201+ CreatedAt : time .Now ().Format (time .RFC3339 ),
202+ },
203+ }
204+ sess .AddMessage (msg )
205+ }
206+
207+ case "session_title" :
208+ // Update session title if provided (may override the one from eval config)
209+ if eventTitle , ok := event ["title" ].(string ); ok && eventTitle != "" {
210+ sess .Title = eventTitle
211+ }
212+
213+ case "stream_stopped" :
214+ // Flush final assistant message
215+ flushAssistantMessage ()
216+ }
217+ }
218+
219+ // Flush any remaining content
220+ flushAssistantMessage ()
221+
222+ return sess
223+ }
224+
225+ // parseToolCall converts a map representation of a tool call to tools.ToolCall
226+ func parseToolCall (tc map [string ]any ) tools.ToolCall {
227+ toolCall := tools.ToolCall {}
228+
229+ if id , ok := tc ["id" ].(string ); ok {
230+ toolCall .ID = id
231+ }
232+ if typ , ok := tc ["type" ].(string ); ok {
233+ toolCall .Type = tools .ToolType (typ )
234+ }
235+
236+ if fn , ok := tc ["function" ].(map [string ]any ); ok {
237+ if name , ok := fn ["name" ].(string ); ok {
238+ toolCall .Function .Name = name
239+ }
240+ if args , ok := fn ["arguments" ].(string ); ok {
241+ toolCall .Function .Arguments = args
242+ }
243+ }
244+
245+ return toolCall
246+ }
247+
248+ // parseToolDefinition converts a map representation of a tool definition to tools.Tool
249+ func parseToolDefinition (td map [string ]any ) tools.Tool {
250+ toolDef := tools.Tool {}
251+
252+ if name , ok := td ["name" ].(string ); ok {
253+ toolDef .Name = name
254+ }
255+ if category , ok := td ["category" ].(string ); ok {
256+ toolDef .Category = category
257+ }
258+ if description , ok := td ["description" ].(string ); ok {
259+ toolDef .Description = description
260+ }
261+ if parameters , ok := td ["parameters" ]; ok {
262+ toolDef .Parameters = parameters
263+ }
264+
265+ return toolDef
266+ }
267+
268+ // parseMessageUsage converts a map representation of message usage to chat.Usage
269+ // Note: The embedded chat.Usage fields use snake_case JSON tags (input_tokens, etc.)
270+ // while Cost and Model don't have JSON tags and serialize with capitalized names.
271+ func parseMessageUsage (m map [string ]any ) * chat.Usage {
272+ usage := & chat.Usage {}
273+
274+ // Try snake_case first (from JSON serialization), then capitalized (fallback)
275+ if v , ok := m ["input_tokens" ].(float64 ); ok {
276+ usage .InputTokens = int64 (v )
277+ } else if v , ok := m ["InputTokens" ].(float64 ); ok {
278+ usage .InputTokens = int64 (v )
279+ }
280+ if v , ok := m ["output_tokens" ].(float64 ); ok {
281+ usage .OutputTokens = int64 (v )
282+ } else if v , ok := m ["OutputTokens" ].(float64 ); ok {
283+ usage .OutputTokens = int64 (v )
284+ }
285+ if v , ok := m ["cached_input_tokens" ].(float64 ); ok {
286+ usage .CachedInputTokens = int64 (v )
287+ } else if v , ok := m ["CachedInputTokens" ].(float64 ); ok {
288+ usage .CachedInputTokens = int64 (v )
289+ }
290+ if v , ok := m ["cached_write_tokens" ].(float64 ); ok {
291+ usage .CacheWriteTokens = int64 (v )
292+ } else if v , ok := m ["CacheWriteTokens" ].(float64 ); ok {
293+ usage .CacheWriteTokens = int64 (v )
294+ }
295+ if v , ok := m ["reasoning_tokens" ].(float64 ); ok {
296+ usage .ReasoningTokens = int64 (v )
297+ } else if v , ok := m ["ReasoningTokens" ].(float64 ); ok {
298+ usage .ReasoningTokens = int64 (v )
299+ }
300+
301+ return usage
302+ }
303+
304+ // SaveRunJSON saves the eval run results to a JSON file.
305+ // This is kept for backward compatibility and debugging purposes.
13306func SaveRunJSON (run * EvalRun , outputDir string ) (string , error ) {
14307 return saveJSON (run , filepath .Join (outputDir , run .Name + ".json" ))
15308}
0 commit comments