Skip to content

Commit 7568d81

Browse files
authored
Merge pull request #1597 from dgageot/cagent-evals-json
Put back the sessions json. It was useful after all
2 parents 3329c99 + bd2a487 commit 7568d81

6 files changed

Lines changed: 171 additions & 34 deletions

File tree

cmd/root/eval.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,15 @@ func (f *evalFlags) runEvalCommand(cmd *cobra.Command, args []string) error {
124124
if err != nil {
125125
slog.Error("Failed to save sessions database", "error", err)
126126
} else {
127-
fmt.Fprintf(teeOut, "\nSessions: %s\n", dbPath)
127+
fmt.Fprintf(teeOut, "\nSessions DB: %s\n", dbPath)
128+
}
129+
130+
// Save all sessions to a single JSON file (an array; each entry uses the same session format as /eval produces)
131+
sessionsPath, err := evaluation.SaveRunSessionsJSON(run, outputDir)
132+
if err != nil {
133+
slog.Error("Failed to save sessions JSON", "error", err)
134+
} else {
135+
fmt.Fprintf(teeOut, "Sessions JSON: %s\n", sessionsPath)
128136
}
129137

130138
fmt.Fprintf(teeOut, "Log: %s\n", logPath)

pkg/evaluation/eval.go

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424
"github.com/docker/cagent/pkg/environment"
2525
"github.com/docker/cagent/pkg/model/provider"
2626
"github.com/docker/cagent/pkg/model/provider/options"
27+
"github.com/docker/cagent/pkg/session"
2728
)
2829

2930
// Runner runs evaluations against an agent.
@@ -98,7 +99,7 @@ func Evaluate(ctx context.Context, ttyOut, out io.Writer, isTTY bool, runName st
9899
// workItem represents a single evaluation to be processed.
99100
type workItem struct {
100101
index int
101-
eval *EvalSession
102+
eval *InputSession
102103
}
103104

104105
// Run executes all evaluations concurrently and returns results.
@@ -163,13 +164,13 @@ func (r *Runner) Run(ctx context.Context, ttyOut, out io.Writer, isTTY bool) ([]
163164
return results, nil
164165
}
165166

166-
func (r *Runner) loadEvalSessions(ctx context.Context) ([]EvalSession, error) {
167+
func (r *Runner) loadEvalSessions(ctx context.Context) ([]InputSession, error) {
167168
entries, err := os.ReadDir(r.EvalsDir)
168169
if err != nil {
169170
return nil, err
170171
}
171172

172-
var evals []EvalSession
173+
var evals []InputSession
173174
for _, entry := range entries {
174175
if ctx.Err() != nil {
175176
return nil, ctx.Err()
@@ -190,22 +191,19 @@ func (r *Runner) loadEvalSessions(ctx context.Context) ([]EvalSession, error) {
190191
return nil, err
191192
}
192193

193-
var evalSess EvalSession
194+
var evalSess session.Session
194195
if err := json.Unmarshal(data, &evalSess); err != nil {
195196
return nil, err
196197
}
197198

198-
evalSess.SourcePath = filepath.Join(r.EvalsDir, fileName)
199-
200-
if evalSess.Title == "" {
201-
evalSess.Title = strings.TrimSuffix(fileName, ".json")
202-
}
203-
204-
evals = append(evals, evalSess)
199+
evals = append(evals, InputSession{
200+
Session: &evalSess,
201+
SourcePath: filepath.Join(r.EvalsDir, fileName),
202+
})
205203
}
206204

207205
// Sort by duration (longest first) to avoid long tail
208-
slices.SortFunc(evals, func(a, b EvalSession) int {
206+
slices.SortFunc(evals, func(a, b InputSession) int {
209207
return cmp.Compare(b.Duration(), a.Duration())
210208
})
211209

@@ -214,11 +212,13 @@ func (r *Runner) loadEvalSessions(ctx context.Context) ([]EvalSession, error) {
214212

215213
// preBuildImages pre-builds all unique Docker images needed for the evaluations.
216214
// This is done in parallel to avoid serialized builds during evaluation.
217-
func (r *Runner) preBuildImages(ctx context.Context, out io.Writer, evals []EvalSession) error {
215+
func (r *Runner) preBuildImages(ctx context.Context, out io.Writer, evals []InputSession) error {
218216
// Collect unique working directories
219217
workingDirs := make(map[string]struct{})
220218
for _, eval := range evals {
221-
workingDirs[eval.Evals.WorkingDir] = struct{}{}
219+
if eval.Evals != nil {
220+
workingDirs[eval.Evals.WorkingDir] = struct{}{}
221+
}
222222
}
223223

224224
if len(workingDirs) == 0 {
@@ -278,24 +278,31 @@ func (r *Runner) preBuildImages(ctx context.Context, out io.Writer, evals []Eval
278278
return nil
279279
}
280280

281-
func (r *Runner) runSingleEval(ctx context.Context, evalSess *EvalSession) (Result, error) {
281+
func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Result, error) {
282282
startTime := time.Now()
283283
slog.Debug("Starting evaluation", "title", evalSess.Title)
284284

285+
var evals *session.EvalCriteria
286+
if evalSess.Evals != nil {
287+
evals = evalSess.Evals
288+
} else {
289+
evals = &session.EvalCriteria{}
290+
}
291+
285292
result := Result{
286293
InputPath: evalSess.SourcePath,
287294
Title: evalSess.Title,
288-
Question: getFirstUserMessage(&evalSess.Session),
289-
SizeExpected: evalSess.Evals.Size,
290-
RelevanceExpected: float64(len(evalSess.Evals.Relevance)),
295+
Question: getFirstUserMessage(evalSess.Session),
296+
SizeExpected: evals.Size,
297+
RelevanceExpected: float64(len(evals.Relevance)),
291298
}
292299

293300
expectedToolCalls := extractToolCalls(evalSess.Messages)
294301
if len(expectedToolCalls) > 0 {
295302
result.ToolCallsExpected = 1.0
296303
}
297304

298-
workingDir := evalSess.Evals.WorkingDir
305+
workingDir := evals.WorkingDir
299306

300307
imageID, err := r.getOrBuildImage(ctx, workingDir)
301308
if err != nil {
@@ -316,15 +323,16 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *EvalSession) (Resu
316323

317324
// Build session from events for database storage
318325
result.Session = SessionFromEvents(events, evalSess.Title, result.Question)
326+
result.Session.Evals = evals
319327

320328
if len(expectedToolCalls) > 0 || len(actualToolCalls) > 0 {
321329
result.ToolCallsScore = toolCallF1Score(expectedToolCalls, actualToolCalls)
322330
}
323331

324332
result.HandoffsMatch = countHandoffs(expectedToolCalls) == countHandoffs(actualToolCalls)
325333

326-
if r.judge != nil && len(evalSess.Evals.Relevance) > 0 {
327-
passed, failed, errs := r.judge.CheckRelevance(ctx, result.Response, evalSess.Evals.Relevance)
334+
if r.judge != nil && len(evals.Relevance) > 0 {
335+
passed, failed, errs := r.judge.CheckRelevance(ctx, result.Response, evals.Relevance)
328336
result.RelevancePassed = float64(passed)
329337
result.FailedRelevance = failed
330338
for _, e := range errs {

pkg/evaluation/save.go

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,23 @@ func SaveRunJSON(run *EvalRun, outputDir string) (string, error) {
307307
return saveJSON(run, filepath.Join(outputDir, run.Name+".json"))
308308
}
309309

310+
// SaveRunSessionsJSON saves all eval sessions to a single JSON file.
311+
// Each session includes its eval criteria in the "evals" field.
312+
// This complements SaveRunSessions which saves to SQLite, providing a
313+
// human-readable format for inspection.
314+
func SaveRunSessionsJSON(run *EvalRun, outputDir string) (string, error) {
315+
// Collect all sessions from results
316+
var sessions []*session.Session
317+
for i := range run.Results {
318+
if run.Results[i].Session != nil {
319+
sessions = append(sessions, run.Results[i].Session)
320+
}
321+
}
322+
323+
outputPath := filepath.Join(outputDir, run.Name+".json")
324+
return saveJSON(sessions, outputPath)
325+
}
326+
310327
func Save(sess *session.Session, filename string) (string, error) {
311328
baseName := cmp.Or(filename, sess.ID)
312329

@@ -319,6 +336,11 @@ func Save(sess *session.Session, filename string) (string, error) {
319336
evalFile = filepath.Join("evals", fmt.Sprintf("%s_%d.json", baseName, number))
320337
}
321338

339+
// Ensure session has empty eval criteria for easier discovery
340+
if sess.Evals == nil {
341+
sess.Evals = &session.EvalCriteria{Relevance: []string{}}
342+
}
343+
322344
return saveJSON(sess, evalFile)
323345
}
324346

pkg/evaluation/save_test.go

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package evaluation
22

33
import (
4+
"encoding/json"
5+
"os"
46
"path/filepath"
57
"testing"
68
"time"
@@ -27,6 +29,15 @@ func TestSaveWithCustomFilename(t *testing.T) {
2729
require.Equal(t, filepath.Join("evals", "my-custom-eval.json"), evalFile)
2830
require.FileExists(t, evalFile)
2931

32+
// Verify the saved file contains the evals field
33+
data, err := os.ReadFile(evalFile)
34+
require.NoError(t, err)
35+
var savedSession session.Session
36+
err = json.Unmarshal(data, &savedSession)
37+
require.NoError(t, err)
38+
assert.NotNil(t, savedSession.Evals)
39+
assert.Empty(t, savedSession.Evals.Relevance)
40+
3041
// Test 2: Save without filename (should use session ID)
3142
evalFile2, err := Save(sess, "")
3243
require.NoError(t, err)
@@ -107,6 +118,92 @@ func TestSaveRunSessions(t *testing.T) {
107118
assert.True(t, titles["eval-test-2"], "should have eval-test-2")
108119
}
109120

121+
func TestSaveRunSessionsJSON(t *testing.T) {
122+
t.Parallel()
123+
124+
outputDir := t.TempDir()
125+
126+
// Create sessions with different content
127+
sess1 := session.New(
128+
session.WithTitle("eval-json-1"),
129+
session.WithUserMessage("What is the capital of France?"),
130+
)
131+
sess1.InputTokens = 100
132+
sess1.OutputTokens = 50
133+
sess1.Cost = 0.01
134+
135+
sess2 := session.New(
136+
session.WithTitle("eval-json-2"),
137+
session.WithUserMessage("What is 2+2?"),
138+
)
139+
sess2.InputTokens = 80
140+
sess2.OutputTokens = 30
141+
sess2.Cost = 0.005
142+
143+
// Create an eval run with sessions and eval criteria
144+
run := &EvalRun{
145+
Name: "test-json-001",
146+
Timestamp: time.Now(),
147+
Results: []Result{
148+
{
149+
Title: "eval-json-1",
150+
Question: "What is the capital of France?",
151+
Response: "Paris is the capital of France.",
152+
Session: sess1,
153+
},
154+
{
155+
Title: "eval-json-2",
156+
Question: "What is 2+2?",
157+
Response: "4",
158+
Session: sess2,
159+
},
160+
{
161+
// Result without a session (error case)
162+
Title: "eval-json-3",
163+
Error: "container failed",
164+
Session: nil,
165+
},
166+
},
167+
}
168+
169+
// Save sessions to JSON
170+
sessionsPath, err := SaveRunSessionsJSON(run, outputDir)
171+
require.NoError(t, err)
172+
assert.Equal(t, filepath.Join(outputDir, "test-json-001.json"), sessionsPath)
173+
assert.FileExists(t, sessionsPath)
174+
175+
// Read and parse the JSON file
176+
data, err := os.ReadFile(sessionsPath)
177+
require.NoError(t, err)
178+
179+
var loadedSessions []*session.Session
180+
err = json.Unmarshal(data, &loadedSessions)
181+
require.NoError(t, err)
182+
183+
// Should have 2 sessions (excluding the error case)
184+
assert.Len(t, loadedSessions, 2)
185+
186+
// Verify session content
187+
titles := make(map[string]*session.Session)
188+
for _, sess := range loadedSessions {
189+
titles[sess.Title] = sess
190+
}
191+
192+
assert.Contains(t, titles, "eval-json-1")
193+
assert.Contains(t, titles, "eval-json-2")
194+
195+
// Verify cost and token data is preserved
196+
sess1Loaded := titles["eval-json-1"]
197+
assert.Equal(t, int64(100), sess1Loaded.InputTokens)
198+
assert.Equal(t, int64(50), sess1Loaded.OutputTokens)
199+
assert.InDelta(t, 0.01, sess1Loaded.Cost, 0.0001)
200+
201+
sess2Loaded := titles["eval-json-2"]
202+
assert.Equal(t, int64(80), sess2Loaded.InputTokens)
203+
assert.Equal(t, int64(30), sess2Loaded.OutputTokens)
204+
assert.InDelta(t, 0.005, sess2Loaded.Cost, 0.0001)
205+
}
206+
110207
func TestSaveRunSessionsWithCost(t *testing.T) {
111208
t.Parallel()
112209

pkg/evaluation/types.go

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,10 @@ import (
77
"github.com/docker/cagent/pkg/session"
88
)
99

10-
// EvalCriteria contains the evaluation criteria for a test case.
11-
type EvalCriteria struct {
12-
Relevance []string `json:"relevance,omitempty"` // Statements that should be true about the response
13-
WorkingDir string `json:"working_dir,omitempty"` // Subdirectory under evals/working_dirs/
14-
Size string `json:"size,omitempty"` // Expected response size: S, M, L, XL
15-
}
16-
17-
// EvalSession extends session.Session with evaluation criteria.
18-
type EvalSession struct {
19-
session.Session
20-
Evals EvalCriteria `json:"evals"`
21-
SourcePath string `json:"-"` // Path to the source eval file (not serialized)
10+
// InputSession wraps a session with its source path for evaluation loading.
11+
type InputSession struct {
12+
*session.Session
13+
SourcePath string // Path to the source eval file (set by the loader; not read from the eval JSON)
2214
}
2315

2416
// Result contains the evaluation results for a single test case.

pkg/session/session.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ type Session struct {
5454
// Title is the title of the session, set by the runtime
5555
Title string `json:"title"`
5656

57+
// Evals contains evaluation criteria for this session (used by eval framework)
58+
Evals *EvalCriteria `json:"evals,omitempty"`
59+
5760
// Messages holds the conversation history (messages and sub-sessions)
5861
Messages []Item `json:"messages"`
5962

@@ -189,6 +192,13 @@ func NewSubSessionItem(subSession *Session) Item {
189192
return Item{SubSession: subSession}
190193
}
191194

195+
// EvalCriteria contains the evaluation criteria for a session.
196+
type EvalCriteria struct {
197+
Relevance []string `json:"relevance"` // Statements that should be true about the response
198+
WorkingDir string `json:"working_dir,omitempty"` // Subdirectory under evals/working_dirs/
199+
Size string `json:"size,omitempty"` // Expected response size: S, M, L, XL
200+
}
201+
192202
// Session helper methods
193203

194204
// AddMessage adds a message to the session

0 commit comments

Comments
 (0)