Merge pull request #1597 from dgageot/cagent-evals-json

dgageot · web-flow · commit 7568d819c35f · 2026-02-04T19:44:41.000+01:00
Put back the sessions json. It was useful after all
diff --git a/cmd/root/eval.go b/cmd/root/eval.go
@@ -124,7 +124,15 @@ func (f *evalFlags) runEvalCommand(cmd *cobra.Command, args []string) error {
 	if err != nil {
 		slog.Error("Failed to save sessions database", "error", err)
 	} else {
-		fmt.Fprintf(teeOut, "\nSessions: %s\n", dbPath)
+		fmt.Fprintf(teeOut, "\nSessions DB: %s\n", dbPath)
+	}
+
+	// Save sessions to JSON file (same format as /eval produces)
+	sessionsPath, err := evaluation.SaveRunSessionsJSON(run, outputDir)
+	if err != nil {
+		slog.Error("Failed to save sessions JSON", "error", err)
+	} else {
+		fmt.Fprintf(teeOut, "Sessions JSON: %s\n", sessionsPath)
 	}
 
 	fmt.Fprintf(teeOut, "Log: %s\n", logPath)
diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go
@@ -24,6 +24,7 @@ import (
 	"github.com/docker/cagent/pkg/environment"
 	"github.com/docker/cagent/pkg/model/provider"
 	"github.com/docker/cagent/pkg/model/provider/options"
+	"github.com/docker/cagent/pkg/session"
 )
 
 // Runner runs evaluations against an agent.
@@ -98,7 +99,7 @@ func Evaluate(ctx context.Context, ttyOut, out io.Writer, isTTY bool, runName st
 // workItem represents a single evaluation to be processed.
 type workItem struct {
 	index int
-	eval  *EvalSession
+	eval  *InputSession
 }
 
 // Run executes all evaluations concurrently and returns results.
@@ -163,13 +164,13 @@ func (r *Runner) Run(ctx context.Context, ttyOut, out io.Writer, isTTY bool) ([]
 	return results, nil
 }
 
-func (r *Runner) loadEvalSessions(ctx context.Context) ([]EvalSession, error) {
+func (r *Runner) loadEvalSessions(ctx context.Context) ([]InputSession, error) {
 	entries, err := os.ReadDir(r.EvalsDir)
 	if err != nil {
 		return nil, err
 	}
 
-	var evals []EvalSession
+	var evals []InputSession
 	for _, entry := range entries {
 		if ctx.Err() != nil {
 			return nil, ctx.Err()
@@ -190,22 +191,19 @@ func (r *Runner) loadEvalSessions(ctx context.Context) ([]EvalSession, error) {
 			return nil, err
 		}
 
-		var evalSess EvalSession
+		var evalSess session.Session
 		if err := json.Unmarshal(data, &evalSess); err != nil {
 			return nil, err
 		}
 
-		evalSess.SourcePath = filepath.Join(r.EvalsDir, fileName)
-
-		if evalSess.Title == "" {
-			evalSess.Title = strings.TrimSuffix(fileName, ".json")
-		}
-
-		evals = append(evals, evalSess)
+		evals = append(evals, InputSession{
+			Session:    &evalSess,
+			SourcePath: filepath.Join(r.EvalsDir, fileName),
+		})
 	}
 
 	// Sort by duration (longest first) to avoid long tail
-	slices.SortFunc(evals, func(a, b EvalSession) int {
+	slices.SortFunc(evals, func(a, b InputSession) int {
 		return cmp.Compare(b.Duration(), a.Duration())
 	})
 
@@ -214,11 +212,13 @@ func (r *Runner) loadEvalSessions(ctx context.Context) ([]EvalSession, error) {
 
 // preBuildImages pre-builds all unique Docker images needed for the evaluations.
 // This is done in parallel to avoid serialized builds during evaluation.
-func (r *Runner) preBuildImages(ctx context.Context, out io.Writer, evals []EvalSession) error {
+func (r *Runner) preBuildImages(ctx context.Context, out io.Writer, evals []InputSession) error {
 	// Collect unique working directories
 	workingDirs := make(map[string]struct{})
 	for _, eval := range evals {
-		workingDirs[eval.Evals.WorkingDir] = struct{}{}
+		if eval.Evals != nil {
+			workingDirs[eval.Evals.WorkingDir] = struct{}{}
+		}
 	}
 
 	if len(workingDirs) == 0 {
@@ -278,24 +278,31 @@ func (r *Runner) preBuildImages(ctx context.Context, out io.Writer, evals []Eval
 	return nil
 }
 
-func (r *Runner) runSingleEval(ctx context.Context, evalSess *EvalSession) (Result, error) {
+func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Result, error) {
 	startTime := time.Now()
 	slog.Debug("Starting evaluation", "title", evalSess.Title)
 
+	var evals *session.EvalCriteria
+	if evalSess.Evals != nil {
+		evals = evalSess.Evals
+	} else {
+		evals = &session.EvalCriteria{}
+	}
+
 	result := Result{
 		InputPath:         evalSess.SourcePath,
 		Title:             evalSess.Title,
-		Question:          getFirstUserMessage(&evalSess.Session),
-		SizeExpected:      evalSess.Evals.Size,
-		RelevanceExpected: float64(len(evalSess.Evals.Relevance)),
+		Question:          getFirstUserMessage(evalSess.Session),
+		SizeExpected:      evals.Size,
+		RelevanceExpected: float64(len(evals.Relevance)),
 	}
 
 	expectedToolCalls := extractToolCalls(evalSess.Messages)
 	if len(expectedToolCalls) > 0 {
 		result.ToolCallsExpected = 1.0
 	}
 
-	workingDir := evalSess.Evals.WorkingDir
+	workingDir := evals.WorkingDir
 
 	imageID, err := r.getOrBuildImage(ctx, workingDir)
 	if err != nil {
@@ -316,15 +323,16 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *EvalSession) (Resu
 
 	// Build session from events for database storage
 	result.Session = SessionFromEvents(events, evalSess.Title, result.Question)
+	result.Session.Evals = evals
 
 	if len(expectedToolCalls) > 0 || len(actualToolCalls) > 0 {
 		result.ToolCallsScore = toolCallF1Score(expectedToolCalls, actualToolCalls)
 	}
 
 	result.HandoffsMatch = countHandoffs(expectedToolCalls) == countHandoffs(actualToolCalls)
 
-	if r.judge != nil && len(evalSess.Evals.Relevance) > 0 {
-		passed, failed, errs := r.judge.CheckRelevance(ctx, result.Response, evalSess.Evals.Relevance)
+	if r.judge != nil && len(evals.Relevance) > 0 {
+		passed, failed, errs := r.judge.CheckRelevance(ctx, result.Response, evals.Relevance)
 		result.RelevancePassed = float64(passed)
 		result.FailedRelevance = failed
 		for _, e := range errs {
diff --git a/pkg/evaluation/save.go b/pkg/evaluation/save.go
@@ -307,6 +307,23 @@ func SaveRunJSON(run *EvalRun, outputDir string) (string, error) {
 	return saveJSON(run, filepath.Join(outputDir, run.Name+".json"))
 }
 
+// SaveRunSessionsJSON writes the non-nil sessions of run.Results, in order,
+// to <outputDir>/<run.Name>.json; "evals" is emitted for sessions that set it.
+// It complements the SQLite output of SaveRunSessions for manual inspection.
+// NOTE(review): SaveRunJSON writes to the same path; confirm both are never used together.
+func SaveRunSessionsJSON(run *EvalRun, outputDir string) (string, error) {
+	// Non-nil and pre-sized: assuming saveJSON uses encoding/json, an empty run is written as [] instead of null.
+	sessions := make([]*session.Session, 0, len(run.Results))
+	for i := range run.Results {
+		if run.Results[i].Session != nil {
+			sessions = append(sessions, run.Results[i].Session)
+		}
+	}
+
+	outputPath := filepath.Join(outputDir, run.Name+".json")
+	return saveJSON(sessions, outputPath)
+}
+
 func Save(sess *session.Session, filename string) (string, error) {
 	baseName := cmp.Or(filename, sess.ID)
 
@@ -319,6 +336,11 @@ func Save(sess *session.Session, filename string) (string, error) {
 		evalFile = filepath.Join("evals", fmt.Sprintf("%s_%d.json", baseName, number))
 	}
 
+	// Ensure session has empty eval criteria for easier discovery
+	if sess.Evals == nil {
+		sess.Evals = &session.EvalCriteria{Relevance: []string{}}
+	}
+
 	return saveJSON(sess, evalFile)
 }
 
diff --git a/pkg/evaluation/save_test.go b/pkg/evaluation/save_test.go
@@ -1,6 +1,8 @@
 package evaluation
 
 import (
+	"encoding/json"
+	"os"
 	"path/filepath"
 	"testing"
 	"time"
@@ -27,6 +29,15 @@ func TestSaveWithCustomFilename(t *testing.T) {
 	require.Equal(t, filepath.Join("evals", "my-custom-eval.json"), evalFile)
 	require.FileExists(t, evalFile)
 
+	// Verify the saved file contains the evals field
+	data, err := os.ReadFile(evalFile)
+	require.NoError(t, err)
+	var savedSession session.Session
+	err = json.Unmarshal(data, &savedSession)
+	require.NoError(t, err)
+	assert.NotNil(t, savedSession.Evals)
+	assert.Empty(t, savedSession.Evals.Relevance)
+
 	// Test 2: Save without filename (should use session ID)
 	evalFile2, err := Save(sess, "")
 	require.NoError(t, err)
@@ -107,6 +118,92 @@ func TestSaveRunSessions(t *testing.T) {
 	assert.True(t, titles["eval-test-2"], "should have eval-test-2")
 }
 
+func TestSaveRunSessionsJSON(t *testing.T) {
+	t.Parallel()
+
+	outputDir := t.TempDir()
+
+	// Round-trip check for SaveRunSessionsJSON: build two sessions with distinct token and cost data
+	sess1 := session.New(
+		session.WithTitle("eval-json-1"),
+		session.WithUserMessage("What is the capital of France?"),
+	)
+	sess1.InputTokens = 100
+	sess1.OutputTokens = 50
+	sess1.Cost = 0.01
+
+	sess2 := session.New(
+		session.WithTitle("eval-json-2"),
+		session.WithUserMessage("What is 2+2?"),
+	)
+	sess2.InputTokens = 80
+	sess2.OutputTokens = 30
+	sess2.Cost = 0.005
+
+	// Create an eval run: two results carry a session, the third has none
+	run := &EvalRun{
+		Name:      "test-json-001",
+		Timestamp: time.Now(),
+		Results: []Result{
+			{
+				Title:    "eval-json-1",
+				Question: "What is the capital of France?",
+				Response: "Paris is the capital of France.",
+				Session:  sess1,
+			},
+			{
+				Title:    "eval-json-2",
+				Question: "What is 2+2?",
+				Response: "4",
+				Session:  sess2,
+			},
+			{
+				// Result without a session (error case)
+				Title:   "eval-json-3",
+				Error:   "container failed",
+				Session: nil,
+			},
+		},
+	}
+
+	// Save sessions to JSON; the file is expected at <outputDir>/<run.Name>.json
+	sessionsPath, err := SaveRunSessionsJSON(run, outputDir)
+	require.NoError(t, err)
+	assert.Equal(t, filepath.Join(outputDir, "test-json-001.json"), sessionsPath)
+	assert.FileExists(t, sessionsPath)
+
+	// Read the file back and decode it as a list of sessions
+	data, err := os.ReadFile(sessionsPath)
+	require.NoError(t, err)
+
+	var loadedSessions []*session.Session
+	err = json.Unmarshal(data, &loadedSessions)
+	require.NoError(t, err)
+
+	// Should have 2 sessions (the result with a nil Session is skipped)
+	assert.Len(t, loadedSessions, 2)
+
+	// Index the loaded sessions by title so the checks do not depend on order
+	titles := make(map[string]*session.Session)
+	for _, sess := range loadedSessions {
+		titles[sess.Title] = sess
+	}
+
+	assert.Contains(t, titles, "eval-json-1")
+	assert.Contains(t, titles, "eval-json-2")
+
+	// Verify cost and token data is preserved
+	sess1Loaded := titles["eval-json-1"]
+	assert.Equal(t, int64(100), sess1Loaded.InputTokens)
+	assert.Equal(t, int64(50), sess1Loaded.OutputTokens)
+	assert.InDelta(t, 0.01, sess1Loaded.Cost, 0.0001)
+
+	sess2Loaded := titles["eval-json-2"]
+	assert.Equal(t, int64(80), sess2Loaded.InputTokens)
+	assert.Equal(t, int64(30), sess2Loaded.OutputTokens)
+	assert.InDelta(t, 0.005, sess2Loaded.Cost, 0.0001)
+}
+
 func TestSaveRunSessionsWithCost(t *testing.T) {
 	t.Parallel()
 
diff --git a/pkg/evaluation/types.go b/pkg/evaluation/types.go
@@ -7,18 +7,10 @@ import (
 	"github.com/docker/cagent/pkg/session"
 )
 
-// EvalCriteria contains the evaluation criteria for a test case.
-type EvalCriteria struct {
-	Relevance  []string `json:"relevance,omitempty"`   // Statements that should be true about the response
-	WorkingDir string   `json:"working_dir,omitempty"` // Subdirectory under evals/working_dirs/
-	Size       string   `json:"size,omitempty"`        // Expected response size: S, M, L, XL
-}
-
-// EvalSession extends session.Session with evaluation criteria.
-type EvalSession struct {
-	session.Session
-	Evals      EvalCriteria `json:"evals"`
-	SourcePath string       `json:"-"` // Path to the source eval file (not serialized)
+// InputSession pairs a loaded session with the path of the eval file it came from.
+type InputSession struct {
+	*session.Session
+	SourcePath string `json:"-"` // Path to the source eval file (not serialized)
 }
 
 // Result contains the evaluation results for a single test case.
diff --git a/pkg/session/session.go b/pkg/session/session.go
@@ -54,6 +54,9 @@ type Session struct {
 	// Title is the title of the session, set by the runtime
 	Title string `json:"title"`
 
+	// Evals contains evaluation criteria for this session (used by eval framework)
+	Evals *EvalCriteria `json:"evals,omitempty"`
+
 	// Messages holds the conversation history (messages and sub-sessions)
 	Messages []Item `json:"messages"`
 
@@ -189,6 +192,13 @@ func NewSubSessionItem(subSession *Session) Item {
 	return Item{SubSession: subSession}
 }
 
+// EvalCriteria holds a session's evaluation criteria; Relevance has no omitempty, so it is always written to JSON.
+type EvalCriteria struct {
+	Relevance  []string `json:"relevance"`             // Statements that should be true about the response
+	WorkingDir string   `json:"working_dir,omitempty"` // Subdirectory under evals/working_dirs/
+	Size       string   `json:"size,omitempty"`        // Expected response size: S, M, L, XL
+}
+
 // Session helper methods
 
 // AddMessage adds a message to the session