Add the criteria to the session

dgageot · dgageot · commit bd2a4871cfc2 · 2026-02-04T19:38:13.000+01:00
Signed-off-by: David Gageot <david.gageot@docker.com>
diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go
@@ -24,6 +24,7 @@ import (
 	"github.com/docker/cagent/pkg/environment"
 	"github.com/docker/cagent/pkg/model/provider"
 	"github.com/docker/cagent/pkg/model/provider/options"
+	"github.com/docker/cagent/pkg/session"
 )
 
 // Runner runs evaluations against an agent.
@@ -98,7 +99,7 @@ func Evaluate(ctx context.Context, ttyOut, out io.Writer, isTTY bool, runName st
 // workItem represents a single evaluation to be processed.
 type workItem struct {
 	index int
-	eval  *EvalSession
+	eval  *InputSession
 }
 
 // Run executes all evaluations concurrently and returns results.
@@ -163,13 +164,13 @@ func (r *Runner) Run(ctx context.Context, ttyOut, out io.Writer, isTTY bool) ([]
 	return results, nil
 }
 
-func (r *Runner) loadEvalSessions(ctx context.Context) ([]EvalSession, error) {
+func (r *Runner) loadEvalSessions(ctx context.Context) ([]InputSession, error) {
 	entries, err := os.ReadDir(r.EvalsDir)
 	if err != nil {
 		return nil, err
 	}
 
-	var evals []EvalSession
+	var evals []InputSession
 	for _, entry := range entries {
 		if ctx.Err() != nil {
 			return nil, ctx.Err()
@@ -190,22 +191,19 @@ func (r *Runner) loadEvalSessions(ctx context.Context) ([]EvalSession, error) {
 			return nil, err
 		}
 
-		var evalSess EvalSession
+		var evalSess session.Session
 		if err := json.Unmarshal(data, &evalSess); err != nil {
 			return nil, err
 		}
 
-		evalSess.SourcePath = filepath.Join(r.EvalsDir, fileName)
-
-		if evalSess.Title == "" {
-			evalSess.Title = strings.TrimSuffix(fileName, ".json")
-		}
-
-		evals = append(evals, evalSess)
+		evals = append(evals, InputSession{
+			Session:    &evalSess,
+			SourcePath: filepath.Join(r.EvalsDir, fileName),
+		})
 	}
 
 	// Sort by duration (longest first) to avoid long tail
-	slices.SortFunc(evals, func(a, b EvalSession) int {
+	slices.SortFunc(evals, func(a, b InputSession) int {
 		return cmp.Compare(b.Duration(), a.Duration())
 	})
 
@@ -214,11 +212,13 @@ func (r *Runner) loadEvalSessions(ctx context.Context) ([]EvalSession, error) {
 
 // preBuildImages pre-builds all unique Docker images needed for the evaluations.
 // This is done in parallel to avoid serialized builds during evaluation.
-func (r *Runner) preBuildImages(ctx context.Context, out io.Writer, evals []EvalSession) error {
+func (r *Runner) preBuildImages(ctx context.Context, out io.Writer, evals []InputSession) error {
 	// Collect unique working directories
 	workingDirs := make(map[string]struct{})
 	for _, eval := range evals {
-		workingDirs[eval.Evals.WorkingDir] = struct{}{}
+		if eval.Evals != nil {
+			workingDirs[eval.Evals.WorkingDir] = struct{}{}
+		}
 	}
 
 	if len(workingDirs) == 0 {
@@ -278,24 +278,31 @@ func (r *Runner) preBuildImages(ctx context.Context, out io.Writer, evals []Eval
 	return nil
 }
 
-func (r *Runner) runSingleEval(ctx context.Context, evalSess *EvalSession) (Result, error) {
+func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Result, error) {
 	startTime := time.Now()
 	slog.Debug("Starting evaluation", "title", evalSess.Title)
 
+	var evals *session.EvalCriteria
+	if evalSess.Evals != nil {
+		evals = evalSess.Evals
+	} else {
+		evals = &session.EvalCriteria{}
+	}
+
 	result := Result{
 		InputPath:         evalSess.SourcePath,
 		Title:             evalSess.Title,
-		Question:          getFirstUserMessage(&evalSess.Session),
-		SizeExpected:      evalSess.Evals.Size,
-		RelevanceExpected: float64(len(evalSess.Evals.Relevance)),
+		Question:          getFirstUserMessage(evalSess.Session),
+		SizeExpected:      evals.Size,
+		RelevanceExpected: float64(len(evals.Relevance)),
 	}
 
 	expectedToolCalls := extractToolCalls(evalSess.Messages)
 	if len(expectedToolCalls) > 0 {
 		result.ToolCallsExpected = 1.0
 	}
 
-	workingDir := evalSess.Evals.WorkingDir
+	workingDir := evals.WorkingDir
 
 	imageID, err := r.getOrBuildImage(ctx, workingDir)
 	if err != nil {
@@ -316,15 +323,16 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *EvalSession) (Resu
 
 	// Build session from events for database storage
 	result.Session = SessionFromEvents(events, evalSess.Title, result.Question)
+	result.Session.Evals = evals
 
 	if len(expectedToolCalls) > 0 || len(actualToolCalls) > 0 {
 		result.ToolCallsScore = toolCallF1Score(expectedToolCalls, actualToolCalls)
 	}
 
 	result.HandoffsMatch = countHandoffs(expectedToolCalls) == countHandoffs(actualToolCalls)
 
-	if r.judge != nil && len(evalSess.Evals.Relevance) > 0 {
-		passed, failed, errs := r.judge.CheckRelevance(ctx, result.Response, evalSess.Evals.Relevance)
+	if r.judge != nil && len(evals.Relevance) > 0 {
+		passed, failed, errs := r.judge.CheckRelevance(ctx, result.Response, evals.Relevance)
 		result.RelevancePassed = float64(passed)
 		result.FailedRelevance = failed
 		for _, e := range errs {
diff --git a/pkg/evaluation/save.go b/pkg/evaluation/save.go
@@ -308,7 +308,7 @@ func SaveRunJSON(run *EvalRun, outputDir string) (string, error) {
 }
 
 // SaveRunSessionsJSON saves all eval sessions to a single JSON file.
-// Each session is saved in the same format as /eval produces (session.Session).
+// Each session includes its eval criteria in the "evals" field.
 // This complements SaveRunSessions which saves to SQLite, providing a
 // human-readable format for inspection.
 func SaveRunSessionsJSON(run *EvalRun, outputDir string) (string, error) {
@@ -336,6 +336,11 @@ func Save(sess *session.Session, filename string) (string, error) {
 		evalFile = filepath.Join("evals", fmt.Sprintf("%s_%d.json", baseName, number))
 	}
 
+	// Ensure session has empty eval criteria for easier discovery
+	if sess.Evals == nil {
+		sess.Evals = &session.EvalCriteria{Relevance: []string{}}
+	}
+
 	return saveJSON(sess, evalFile)
 }
 
diff --git a/pkg/evaluation/save_test.go b/pkg/evaluation/save_test.go
@@ -29,6 +29,15 @@ func TestSaveWithCustomFilename(t *testing.T) {
 	require.Equal(t, filepath.Join("evals", "my-custom-eval.json"), evalFile)
 	require.FileExists(t, evalFile)
 
+	// Verify the saved file contains the evals field
+	data, err := os.ReadFile(evalFile)
+	require.NoError(t, err)
+	var savedSession session.Session
+	err = json.Unmarshal(data, &savedSession)
+	require.NoError(t, err)
+	assert.NotNil(t, savedSession.Evals)
+	assert.Empty(t, savedSession.Evals.Relevance)
+
 	// Test 2: Save without filename (should use session ID)
 	evalFile2, err := Save(sess, "")
 	require.NoError(t, err)
@@ -131,7 +140,7 @@ func TestSaveRunSessionsJSON(t *testing.T) {
 	sess2.OutputTokens = 30
 	sess2.Cost = 0.005
 
-	// Create an eval run with sessions
+	// Create an eval run with sessions and eval criteria
 	run := &EvalRun{
 		Name:      "test-json-001",
 		Timestamp: time.Now(),
diff --git a/pkg/evaluation/types.go b/pkg/evaluation/types.go
@@ -7,18 +7,10 @@ import (
 	"github.com/docker/cagent/pkg/session"
 )
 
-// EvalCriteria contains the evaluation criteria for a test case.
-type EvalCriteria struct {
-	Relevance  []string `json:"relevance,omitempty"`   // Statements that should be true about the response
-	WorkingDir string   `json:"working_dir,omitempty"` // Subdirectory under evals/working_dirs/
-	Size       string   `json:"size,omitempty"`        // Expected response size: S, M, L, XL
-}
-
-// EvalSession extends session.Session with evaluation criteria.
-type EvalSession struct {
-	session.Session
-	Evals      EvalCriteria `json:"evals"`
-	SourcePath string       `json:"-"` // Path to the source eval file (not serialized)
+// InputSession wraps a session with its source path for evaluation loading.
+type InputSession struct {
+	*session.Session
+	SourcePath string // Path to the source eval file (not serialized)
 }
 
 // Result contains the evaluation results for a single test case.
diff --git a/pkg/session/session.go b/pkg/session/session.go
@@ -54,6 +54,9 @@ type Session struct {
 	// Title is the title of the session, set by the runtime
 	Title string `json:"title"`
 
+	// Evals contains evaluation criteria for this session (used by eval framework)
+	Evals *EvalCriteria `json:"evals,omitempty"`
+
 	// Messages holds the conversation history (messages and sub-sessions)
 	Messages []Item `json:"messages"`
 
@@ -189,6 +192,13 @@ func NewSubSessionItem(subSession *Session) Item {
 	return Item{SubSession: subSession}
 }
 
+// EvalCriteria contains the evaluation criteria for a session.
+type EvalCriteria struct {
+	Relevance  []string `json:"relevance"`             // Statements that should be true about the response
+	WorkingDir string   `json:"working_dir,omitempty"` // Subdirectory under evals/working_dirs/
+	Size       string   `json:"size,omitempty"`        // Expected response size: S, M, L, XL
+}
+
 // Session helper methods
 
 // AddMessage adds a message to the session

Original file line number	Diff line number	Diff line change
`@@ -308,7 +308,7 @@ func SaveRunJSON(run *EvalRun, outputDir string) (string, error) {`
`308`	`308`	`}`
`309`	`309`
`310`	`310`	`// SaveRunSessionsJSON saves all eval sessions to a single JSON file.`
`311`		`-// Each session is saved in the same format as /eval produces (session.Session).`
	`311`	`+// Each session includes its eval criteria in the "evals" field.`
`312`	`312`	`// This complements SaveRunSessions which saves to SQLite, providing a`
`313`	`313`	`// human-readable format for inspection.`
`314`	`314`	`func SaveRunSessionsJSON(run *EvalRun, outputDir string) (string, error) {`
`@@ -336,6 +336,11 @@ func Save(sess *session.Session, filename string) (string, error) {`
`336`	`336`	`evalFile = filepath.Join("evals", fmt.Sprintf("%s_%d.json", baseName, number))`
`337`	`337`	`}`
`338`	`338`
	`339`	`+ // Ensure session has empty eval criteria for easier discovery`
	`340`	`+ if sess.Evals == nil {`
	`341`	`+ sess.Evals = &session.EvalCriteria{Relevance: []string{}}`
	`342`	`+ }`
	`343`	`+`
`339`	`344`	`return saveJSON(sess, evalFile)`
`340`	`345`	`}`
`341`	`346`