Skip to content

Commit 21cff5d

Browse files
authored
Merge pull request #1583 from dgageot/eval-sessions
Produce a session db for the evals
2 parents 0001516 + 6dbc1dd commit 21cff5d

6 files changed

Lines changed: 771 additions & 9 deletions

File tree

cmd/root/eval.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -118,14 +118,15 @@ func (f *evalFlags) runEvalCommand(cmd *cobra.Command, args []string) error {
118118
return evalErr
119119
}
120120

121-
// Save results JSON
122-
resultsPath, err := evaluation.SaveRunJSON(run, outputDir)
121+
// Save sessions to SQLite database
122+
dbPath, err := evaluation.SaveRunSessions(ctx, run, outputDir)
123123
if err != nil {
124-
slog.Error("Failed to save results", "error", err)
124+
slog.Error("Failed to save sessions database", "error", err)
125125
} else {
126-
fmt.Fprintf(teeOut, "\nResults: %s\n", resultsPath)
127-
fmt.Fprintf(teeOut, "Log: %s\n", logPath)
126+
fmt.Fprintf(teeOut, "\nSessions: %s\n", dbPath)
128127
}
129128

129+
fmt.Fprintf(teeOut, "Log: %s\n", logPath)
130+
130131
return evalErr
131132
}

pkg/evaluation/eval.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -312,9 +312,11 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *EvalSession) (Resu
312312
result.Response = response
313313
result.Cost = cost
314314
result.OutputTokens = outputTokens
315-
result.RawOutput = events
316315
result.Size = getResponseSize(result.Response)
317316

317+
// Build session from events for database storage
318+
result.Session = SessionFromEvents(events, evalSess.Title, result.Question)
319+
318320
if len(expectedToolCalls) > 0 || len(actualToolCalls) > 0 {
319321
result.ToolCallsScore = toolCallF1Score(expectedToolCalls, actualToolCalls)
320322
}

pkg/evaluation/save.go

Lines changed: 293 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,307 @@ package evaluation
22

33
import (
44
"cmp"
5+
"context"
56
"encoding/json"
67
"fmt"
78
"os"
89
"path/filepath"
10+
"strings"
11+
"time"
912

13+
"github.com/docker/cagent/pkg/chat"
1014
"github.com/docker/cagent/pkg/session"
15+
"github.com/docker/cagent/pkg/tools"
1116
)
1217

18+
// SaveRunSessions saves all eval sessions to a SQLite database file.
19+
// The database follows the same schema as the main session store,
20+
// allowing the sessions to be loaded and inspected using standard session tools.
21+
func SaveRunSessions(ctx context.Context, run *EvalRun, outputDir string) (string, error) {
22+
dbPath := filepath.Join(outputDir, run.Name+".db")
23+
24+
// Create output directory if needed
25+
if err := os.MkdirAll(outputDir, 0o755); err != nil {
26+
return "", fmt.Errorf("creating output directory: %w", err)
27+
}
28+
29+
// Create a new SQLite session store for this eval run
30+
store, err := session.NewSQLiteSessionStore(dbPath)
31+
if err != nil {
32+
return "", fmt.Errorf("creating session store: %w", err)
33+
}
34+
defer func() {
35+
if closer, ok := store.(interface{ Close() error }); ok {
36+
_ = closer.Close()
37+
}
38+
}()
39+
40+
// Save each result's session to the database
41+
for i := range run.Results {
42+
result := &run.Results[i]
43+
if result.Session == nil {
44+
continue
45+
}
46+
47+
if err := store.AddSession(ctx, result.Session); err != nil {
48+
return "", fmt.Errorf("saving session for %q: %w", result.Title, err)
49+
}
50+
}
51+
52+
return dbPath, nil
53+
}
54+
55+
// SessionFromEvents reconstructs a session from raw container output events.
56+
// This parses the JSON events emitted by cagent --json and builds a session
57+
// with the conversation history.
58+
func SessionFromEvents(events []map[string]any, title, question string) *session.Session {
59+
sess := session.New(
60+
session.WithTitle(title),
61+
session.WithToolsApproved(true),
62+
)
63+
64+
// Add the user question as the first message
65+
if question != "" {
66+
sess.AddMessage(session.UserMessage(question))
67+
}
68+
69+
// Track current assistant message being built
70+
var currentContent strings.Builder
71+
var currentReasoningContent strings.Builder
72+
var currentToolCalls []tools.ToolCall
73+
var currentToolDefinitions []tools.Tool
74+
var currentAgentName string
75+
var currentModel string
76+
var currentUsage *chat.Usage
77+
var currentCost float64
78+
79+
// Helper to flush current assistant message
80+
flushAssistantMessage := func() {
81+
if currentContent.Len() > 0 || currentReasoningContent.Len() > 0 || len(currentToolCalls) > 0 {
82+
msg := &session.Message{
83+
AgentName: currentAgentName,
84+
Message: chat.Message{
85+
Role: chat.MessageRoleAssistant,
86+
Content: currentContent.String(),
87+
ReasoningContent: currentReasoningContent.String(),
88+
ToolCalls: currentToolCalls,
89+
ToolDefinitions: currentToolDefinitions,
90+
CreatedAt: time.Now().Format(time.RFC3339),
91+
Model: currentModel,
92+
Usage: currentUsage,
93+
Cost: currentCost,
94+
},
95+
}
96+
sess.AddMessage(msg)
97+
currentContent.Reset()
98+
currentReasoningContent.Reset()
99+
currentToolCalls = nil
100+
currentToolDefinitions = nil
101+
currentModel = ""
102+
currentUsage = nil
103+
currentCost = 0
104+
}
105+
}
106+
107+
for _, event := range events {
108+
eventType, _ := event["type"].(string)
109+
110+
switch eventType {
111+
case "agent_choice":
112+
// Accumulate agent response content
113+
if content, ok := event["content"].(string); ok {
114+
currentContent.WriteString(content)
115+
}
116+
if agentName, ok := event["agent_name"].(string); ok && agentName != "" {
117+
currentAgentName = agentName
118+
}
119+
120+
case "agent_choice_reasoning":
121+
// Accumulate reasoning content (for models like DeepSeek, Claude with extended thinking)
122+
if content, ok := event["content"].(string); ok {
123+
currentReasoningContent.WriteString(content)
124+
}
125+
if agentName, ok := event["agent_name"].(string); ok && agentName != "" {
126+
currentAgentName = agentName
127+
}
128+
129+
case "tool_call":
130+
// Parse tool call and add to current message
131+
if tc, ok := event["tool_call"].(map[string]any); ok {
132+
toolCall := parseToolCall(tc)
133+
currentToolCalls = append(currentToolCalls, toolCall)
134+
}
135+
// Parse tool definition if present
136+
if td, ok := event["tool_definition"].(map[string]any); ok {
137+
toolDef := parseToolDefinition(td)
138+
currentToolDefinitions = append(currentToolDefinitions, toolDef)
139+
} else {
140+
// Add empty tool definition to maintain index alignment with tool calls
141+
currentToolDefinitions = append(currentToolDefinitions, tools.Tool{})
142+
}
143+
if agentName, ok := event["agent_name"].(string); ok && agentName != "" {
144+
currentAgentName = agentName
145+
}
146+
147+
case "tool_call_response":
148+
// Flush any pending assistant message before adding tool response
149+
flushAssistantMessage()
150+
151+
// Add tool response message
152+
if tc, ok := event["tool_call"].(map[string]any); ok {
153+
toolCallID, _ := tc["id"].(string)
154+
response, _ := event["response"].(string)
155+
156+
msg := &session.Message{
157+
Message: chat.Message{
158+
Role: chat.MessageRoleTool,
159+
Content: response,
160+
ToolCallID: toolCallID,
161+
CreatedAt: time.Now().Format(time.RFC3339),
162+
},
163+
}
164+
sess.AddMessage(msg)
165+
}
166+
167+
case "token_usage":
168+
// Update session token usage
169+
if usage, ok := event["usage"].(map[string]any); ok {
170+
if inputTokens, ok := usage["input_tokens"].(float64); ok {
171+
sess.InputTokens = int64(inputTokens)
172+
}
173+
if outputTokens, ok := usage["output_tokens"].(float64); ok {
174+
sess.OutputTokens = int64(outputTokens)
175+
}
176+
if cost, ok := usage["cost"].(float64); ok {
177+
sess.Cost = cost
178+
}
179+
// Extract per-message usage if available
180+
if lastMsg, ok := usage["last_message"].(map[string]any); ok {
181+
currentUsage = parseMessageUsage(lastMsg)
182+
if model, ok := lastMsg["Model"].(string); ok {
183+
currentModel = model
184+
}
185+
if msgCost, ok := lastMsg["Cost"].(float64); ok {
186+
currentCost = msgCost
187+
}
188+
}
189+
}
190+
191+
case "error":
192+
// Flush any pending assistant message before adding error
193+
flushAssistantMessage()
194+
195+
// Add error as a system message so it's visible in the session
196+
if errorMsg, ok := event["error"].(string); ok && errorMsg != "" {
197+
msg := &session.Message{
198+
Message: chat.Message{
199+
Role: chat.MessageRoleSystem,
200+
Content: "Error: " + errorMsg,
201+
CreatedAt: time.Now().Format(time.RFC3339),
202+
},
203+
}
204+
sess.AddMessage(msg)
205+
}
206+
207+
case "session_title":
208+
// Update session title if provided (may override the one from eval config)
209+
if eventTitle, ok := event["title"].(string); ok && eventTitle != "" {
210+
sess.Title = eventTitle
211+
}
212+
213+
case "stream_stopped":
214+
// Flush final assistant message
215+
flushAssistantMessage()
216+
}
217+
}
218+
219+
// Flush any remaining content
220+
flushAssistantMessage()
221+
222+
return sess
223+
}
224+
225+
// parseToolCall converts a map representation of a tool call to tools.ToolCall
226+
func parseToolCall(tc map[string]any) tools.ToolCall {
227+
toolCall := tools.ToolCall{}
228+
229+
if id, ok := tc["id"].(string); ok {
230+
toolCall.ID = id
231+
}
232+
if typ, ok := tc["type"].(string); ok {
233+
toolCall.Type = tools.ToolType(typ)
234+
}
235+
236+
if fn, ok := tc["function"].(map[string]any); ok {
237+
if name, ok := fn["name"].(string); ok {
238+
toolCall.Function.Name = name
239+
}
240+
if args, ok := fn["arguments"].(string); ok {
241+
toolCall.Function.Arguments = args
242+
}
243+
}
244+
245+
return toolCall
246+
}
247+
248+
// parseToolDefinition converts a map representation of a tool definition to tools.Tool
249+
func parseToolDefinition(td map[string]any) tools.Tool {
250+
toolDef := tools.Tool{}
251+
252+
if name, ok := td["name"].(string); ok {
253+
toolDef.Name = name
254+
}
255+
if category, ok := td["category"].(string); ok {
256+
toolDef.Category = category
257+
}
258+
if description, ok := td["description"].(string); ok {
259+
toolDef.Description = description
260+
}
261+
if parameters, ok := td["parameters"]; ok {
262+
toolDef.Parameters = parameters
263+
}
264+
265+
return toolDef
266+
}
267+
268+
// parseMessageUsage converts a map representation of message usage to chat.Usage
269+
// Note: The embedded chat.Usage fields use snake_case JSON tags (input_tokens, etc.)
270+
// while Cost and Model don't have JSON tags and serialize with capitalized names.
271+
func parseMessageUsage(m map[string]any) *chat.Usage {
272+
usage := &chat.Usage{}
273+
274+
// Try snake_case first (from JSON serialization), then capitalized (fallback)
275+
if v, ok := m["input_tokens"].(float64); ok {
276+
usage.InputTokens = int64(v)
277+
} else if v, ok := m["InputTokens"].(float64); ok {
278+
usage.InputTokens = int64(v)
279+
}
280+
if v, ok := m["output_tokens"].(float64); ok {
281+
usage.OutputTokens = int64(v)
282+
} else if v, ok := m["OutputTokens"].(float64); ok {
283+
usage.OutputTokens = int64(v)
284+
}
285+
if v, ok := m["cached_input_tokens"].(float64); ok {
286+
usage.CachedInputTokens = int64(v)
287+
} else if v, ok := m["CachedInputTokens"].(float64); ok {
288+
usage.CachedInputTokens = int64(v)
289+
}
290+
if v, ok := m["cached_write_tokens"].(float64); ok {
291+
usage.CacheWriteTokens = int64(v)
292+
} else if v, ok := m["CacheWriteTokens"].(float64); ok {
293+
usage.CacheWriteTokens = int64(v)
294+
}
295+
if v, ok := m["reasoning_tokens"].(float64); ok {
296+
usage.ReasoningTokens = int64(v)
297+
} else if v, ok := m["ReasoningTokens"].(float64); ok {
298+
usage.ReasoningTokens = int64(v)
299+
}
300+
301+
return usage
302+
}
303+
304+
// SaveRunJSON saves the eval run results to a JSON file.
305+
// This is kept for backward compatibility and debugging purposes.
13306
func SaveRunJSON(run *EvalRun, outputDir string) (string, error) {
14307
return saveJSON(run, filepath.Join(outputDir, run.Name+".json"))
15308
}

0 commit comments

Comments
 (0)