Remove handoffs scoring from the evals

dgageot · dgageot · commit 388ae8b9fe90 · 2026-03-13T20:54:58.000+01:00
Signed-off-by: David Gageot <david.gageot@docker.com>
diff --git a/docs/features/evaluation/index.md b/docs/features/evaluation/index.md
@@ -130,7 +130,6 @@ docker-agent evaluates agents across four dimensions:
 | **Tool Calls (F1)** | F1 score between the expected tool call sequence (from the recorded session) and the actual tool calls made by the agent. |
 | **Relevance**       | An LLM judge (configurable via `--judge-model`) evaluates whether each relevance statement is satisfied by the response.  |
 | **Size**            | Whether the response length matches the expected size category (S/M/L/XL).                                                |
-| **Handoffs**        | For multi-agent configs, whether task delegation matched the expected agent handoff pattern.                              |
 
 ## Creating Eval Sessions
 
@@ -192,7 +191,6 @@ $ docker agent eval demo.yaml ./evals
 Summary: 2/2 passed
   Sizes:      0/0
   Tool Calls: avg F1 1.00 (2 evals)
-  Handoffs:   2/2
   Relevance:  3/3
 
 Sessions DB: ./evals/results/happy-panda-1234.db
diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go
@@ -350,8 +350,6 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
 		result.ToolCallsScore = toolCallF1Score(expectedToolCalls, actualToolCalls)
 	}
 
-	result.HandoffsMatch = countHandoffs(expectedToolCalls) == countHandoffs(actualToolCalls)
-
 	if r.judge != nil && len(evals.Relevance) > 0 {
 		// Use transcript for relevance checking to preserve temporal ordering
 		transcript := buildTranscript(events)
diff --git a/pkg/evaluation/eval_test.go b/pkg/evaluation/eval_test.go
@@ -126,30 +126,6 @@ func TestGetResponseSize(t *testing.T) {
 	}
 }
 
-func TestCountHandoffs(t *testing.T) {
-	t.Parallel()
-
-	tests := []struct {
-		name      string
-		toolCalls []string
-		want      int
-	}{
-		{"no tool calls", []string{}, 0},
-		{"no handoffs", []string{"search", "read_file"}, 0},
-		{"one handoff", []string{"handoff", "read_file"}, 1},
-		{"one transfer_task", []string{"transfer_task", "read_file"}, 0},
-		{"multiple handoffs", []string{"handoff", "transfer_task", "handoff"}, 2},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			t.Parallel()
-			got := countHandoffs(tt.toolCalls)
-			assert.Equal(t, tt.want, got)
-		})
-	}
-}
-
 func TestParseJudgeResponse(t *testing.T) {
 	t.Parallel()
 
@@ -202,32 +178,26 @@ func TestResultCheckResults(t *testing.T) {
 		},
 		{
 			name:         "all checks pass",
-			result:       Result{SizeExpected: "M", Size: "M", ToolCallsExpected: 1, ToolCallsScore: 1.0, HandoffsMatch: true, RelevanceExpected: 2, RelevancePassed: 2},
-			wantSuccess:  []string{"size M", "tool calls", "handoffs", "relevance 2/2"},
+			result:       Result{SizeExpected: "M", Size: "M", ToolCallsExpected: 1, ToolCallsScore: 1.0, RelevanceExpected: 2, RelevancePassed: 2},
+			wantSuccess:  []string{"size M", "tool calls", "relevance 2/2"},
 			wantFailures: nil,
 		},
 		{
 			name:         "size mismatch",
-			result:       Result{SizeExpected: "M", Size: "S", HandoffsMatch: true},
-			wantSuccess:  []string{"handoffs"},
+			result:       Result{SizeExpected: "M", Size: "S"},
+			wantSuccess:  nil,
 			wantFailures: []string{"size expected M, got S"},
 		},
 		{
 			name:         "tool calls failed",
-			result:       Result{ToolCallsExpected: 1, ToolCallsScore: 0.5, HandoffsMatch: true},
-			wantSuccess:  []string{"handoffs"},
-			wantFailures: []string{"tool calls score 0.50"},
-		},
-		{
-			name:         "handoffs mismatch",
-			result:       Result{HandoffsMatch: false},
+			result:       Result{ToolCallsExpected: 1, ToolCallsScore: 0.5},
 			wantSuccess:  nil,
-			wantFailures: []string{"handoffs mismatch"},
+			wantFailures: []string{"tool calls score 0.50"},
 		},
 		{
 			name:         "relevance failures listed",
-			result:       Result{HandoffsMatch: true, RelevanceExpected: 2, RelevancePassed: 0, FailedRelevance: []RelevanceResult{{Criterion: "check A", Reason: "reason A"}, {Criterion: "check B", Reason: "reason B"}}},
-			wantSuccess:  []string{"handoffs"},
+			result:       Result{RelevanceExpected: 2, RelevancePassed: 0, FailedRelevance: []RelevanceResult{{Criterion: "check A", Reason: "reason A"}, {Criterion: "check B", Reason: "reason B"}}},
+			wantSuccess:  nil,
 			wantFailures: []string{"relevance: check A (reason: reason A)", "relevance: check B (reason: reason B)"},
 		},
 	}
@@ -252,82 +222,68 @@ func TestComputeSummary(t *testing.T) {
 		wantTotalEvals     int
 		wantSizesPassed    int
 		wantSizesTotal     int
-		wantHandoffs       int
-		wantHandoffsTotal  int
 		wantRelevance      float64
 		wantRelevanceTotal float64
 	}{
 		{
-			name:              "no results",
-			results:           []Result{},
-			wantTotalCost:     0,
-			wantTotalEvals:    0,
-			wantSizesPassed:   0,
-			wantSizesTotal:    0,
-			wantHandoffs:      0,
-			wantHandoffsTotal: 0,
+			name:            "no results",
+			results:         []Result{},
+			wantTotalCost:   0,
+			wantTotalEvals:  0,
+			wantSizesPassed: 0,
+			wantSizesTotal:  0,
 		},
 		{
 			name: "all passed",
 			results: []Result{
 				{
-					Title:         "session1",
-					Cost:          0.01,
-					SizeExpected:  "M",
-					Size:          "M",
-					HandoffsMatch: true,
+					Title:        "session1",
+					Cost:         0.01,
+					SizeExpected: "M",
+					Size:         "M",
 				},
 			},
-			wantTotalCost:     0.01,
-			wantTotalEvals:    1,
-			wantSizesPassed:   1,
-			wantSizesTotal:    1,
-			wantHandoffs:      1,
-			wantHandoffsTotal: 1,
+			wantTotalCost:   0.01,
+			wantTotalEvals:  1,
+			wantSizesPassed: 1,
+			wantSizesTotal:  1,
 		},
 		{
 			name: "size mismatch",
 			results: []Result{
 				{
-					Title:         "session1",
-					SizeExpected:  "M",
-					Size:          "S",
-					HandoffsMatch: true,
+					Title:        "session1",
+					SizeExpected: "M",
+					Size:         "S",
 				},
 			},
-			wantTotalEvals:    1,
-			wantSizesPassed:   0,
-			wantSizesTotal:    1,
-			wantHandoffs:      1,
-			wantHandoffsTotal: 1,
+			wantTotalEvals:  1,
+			wantSizesPassed: 0,
+			wantSizesTotal:  1,
 		},
 		{
 			name: "multiple sessions",
 			results: []Result{
-				{Title: "session1", Cost: 0.01, SizeExpected: "M", Size: "M", HandoffsMatch: true},
-				{Title: "session2", Cost: 0.02, SizeExpected: "L", Size: "S", HandoffsMatch: false},
-				{Title: "session3", Cost: 0.03, HandoffsMatch: true},
+				{Title: "session1", Cost: 0.01, SizeExpected: "M", Size: "M"},
+				{Title: "session2", Cost: 0.02, SizeExpected: "L", Size: "S"},
+				{Title: "session3", Cost: 0.03},
 			},
-			wantTotalCost:     0.06,
-			wantTotalEvals:    3,
-			wantSizesPassed:   1,
-			wantSizesTotal:    2,
-			wantHandoffs:      2,
-			wantHandoffsTotal: 3,
+			wantTotalCost:   0.06,
+			wantTotalEvals:  3,
+			wantSizesPassed: 1,
+			wantSizesTotal:  2,
 		},
 		{
 			name: "errored results excluded from totals",
 			results: []Result{
-				{Title: "session1", Cost: 0.01, SizeExpected: "M", Size: "M", HandoffsMatch: true, RelevanceExpected: 2, RelevancePassed: 2},
+				{Title: "session1", Cost: 0.01, SizeExpected: "M", Size: "M", RelevanceExpected: 2, RelevancePassed: 2},
 				{Title: "session2", Cost: 0.02, Error: "docker build failed", SizeExpected: "L", RelevanceExpected: 2},
 				{Title: "session3", Cost: 0.00, Error: "timeout", RelevanceExpected: 3},
 			},
 			wantTotalCost:      0.03, // cost is still counted
 			wantTotalEvals:     3,
 			wantSizesPassed:    1,
 			wantSizesTotal:     1, // only non-errored results count
-			wantHandoffs:       1,
-			wantHandoffsTotal:  1, // only non-errored results count
 			wantRelevance:      2,
 			wantRelevanceTotal: 2, // only non-errored results count
 		},
@@ -341,8 +297,6 @@ func TestComputeSummary(t *testing.T) {
 			assert.InDelta(t, tt.wantTotalCost, summary.TotalCost, 0.0001)
 			assert.Equal(t, tt.wantSizesPassed, summary.SizesPassed)
 			assert.Equal(t, tt.wantSizesTotal, summary.SizesTotal)
-			assert.Equal(t, tt.wantHandoffs, summary.HandoffsPassed)
-			assert.Equal(t, tt.wantHandoffsTotal, summary.HandoffsTotal)
 			assert.InDelta(t, tt.wantRelevance, summary.RelevancePassed, 0.0001)
 			assert.InDelta(t, tt.wantRelevanceTotal, summary.RelevanceTotal, 0.0001)
 		})
@@ -377,14 +331,12 @@ func TestSaveRunJSON(t *testing.T) {
 		Timestamp: time.Date(2024, 1, 15, 10, 30, 0, 0, time.UTC),
 		Duration:  5 * time.Minute,
 		Results: []Result{
-			{Title: "test1", Cost: 0.01, HandoffsMatch: true},
+			{Title: "test1", Cost: 0.01},
 			{Title: "test2", Cost: 0.02, Error: "failed"},
 		},
 		Summary: Summary{
-			TotalEvals:     2,
-			TotalCost:      0.03,
-			HandoffsPassed: 1,
-			HandoffsTotal:  1,
+			TotalEvals: 2,
+			TotalCost:  0.03,
 		},
 	}
 
@@ -581,15 +533,12 @@ func TestPrintSummary(t *testing.T) {
 				TotalEvals:      10,
 				FailedEvals:     5,
 				TotalCost:       0.05,
-				HandoffsPassed:  3,
-				HandoffsTotal:   5,
 				RelevancePassed: 8,
 				RelevanceTotal:  10,
 			},
 			duration: 2 * time.Minute,
 			wantContains: []string{
 				"Errors: 5/10 evaluations failed",
-				"Handoffs: 3/5 passed",
 				"Relevance: 8/10 passed",
 				"Total Cost: $0.050000",
 				"Total Time: 2m0s",
@@ -602,15 +551,12 @@ func TestPrintSummary(t *testing.T) {
 				TotalCost:       0.1,
 				SizesPassed:     4,
 				SizesTotal:      5,
-				HandoffsPassed:  5,
-				HandoffsTotal:   5,
 				RelevancePassed: 10,
 				RelevanceTotal:  10,
 			},
 			duration: 1 * time.Minute,
 			wantContains: []string{
 				"Sizes: 4/5 passed",
-				"Handoffs: 5/5 passed",
 				"Relevance: 10/10 passed",
 				"Total Cost: $0.100000",
 			},
@@ -683,14 +629,12 @@ func TestProgressBarPrintResult(t *testing.T) {
 		{
 			name: "successful result",
 			result: Result{
-				Title:         "test-session",
-				Cost:          0.005,
-				HandoffsMatch: true,
+				Title: "test-session",
+				Cost:  0.005,
 			},
 			wantContains: []string{
 				"✓ test-session",
 				"$0.005000",
-				"✓ handoffs",
 			},
 		},
 		{
@@ -712,14 +656,12 @@ func TestProgressBarPrintResult(t *testing.T) {
 				Cost:              0.01,
 				SizeExpected:      "M",
 				Size:              "S",
-				HandoffsMatch:     true,
 				RelevanceExpected: 2,
 				RelevancePassed:   1,
 				FailedRelevance:   []RelevanceResult{{Criterion: "check failed", Reason: "did not meet criteria"}},
 			},
 			wantContains: []string{
 				"✗ mixed-session", // overall failed
-				"✓ handoffs",
 				"✗ size expected M, got S",
 				"✗ relevance: check failed (reason: did not meet criteria)",
 			},
diff --git a/pkg/evaluation/scoring.go b/pkg/evaluation/scoring.go
@@ -62,16 +62,6 @@ func countStrings(strs []string) map[string]int {
 	return counts
 }
 
-func countHandoffs(toolCalls []string) int {
-	count := 0
-	for _, name := range toolCalls {
-		if name == "handoff" {
-			count++
-		}
-	}
-	return count
-}
-
 func computeSummary(results []Result) Summary {
 	summary := Summary{
 		TotalEvals: len(results),
@@ -96,11 +86,6 @@ func computeSummary(results []Result) Summary {
 			summary.ToolsCount++
 		}
 
-		summary.HandoffsTotal++
-		if r.HandoffsMatch {
-			summary.HandoffsPassed++
-		}
-
 		summary.RelevanceTotal += r.RelevanceExpected
 		summary.RelevancePassed += r.RelevancePassed
 	}
@@ -118,7 +103,6 @@ func printSummary(out io.Writer, summary Summary, duration time.Duration) {
 
 	printMetric(out, "Sizes", summary.SizesPassed, summary.SizesTotal)
 	printF1Score(out, "Tool Calls", summary.ToolsF1Sum, summary.ToolsCount)
-	printMetric(out, "Handoffs", summary.HandoffsPassed, summary.HandoffsTotal)
 	printMetric(out, "Relevance", int(summary.RelevancePassed), int(summary.RelevanceTotal))
 
 	fmt.Fprintf(out, "\nTotal Cost: $%.6f\n", summary.TotalCost)
diff --git a/pkg/evaluation/types.go b/pkg/evaluation/types.go
@@ -25,7 +25,6 @@ type Result struct {
 	SizeExpected      string            `json:"size_expected"`
 	ToolCallsScore    float64           `json:"tool_calls_score"`
 	ToolCallsExpected float64           `json:"tool_calls_score_expected"`
-	HandoffsMatch     bool              `json:"handoffs"`
 	RelevancePassed   float64           `json:"relevance"`
 	RelevanceExpected float64           `json:"relevance_expected"`
 	FailedRelevance   []RelevanceResult `json:"failed_relevance,omitempty"`
@@ -58,13 +57,6 @@ func (r *Result) checkResults() (successes, failures []string) {
 		}
 	}
 
-	// Check handoffs
-	if r.HandoffsMatch {
-		successes = append(successes, "handoffs")
-	} else {
-		failures = append(failures, "handoffs mismatch")
-	}
-
 	// Check relevance
 	if r.RelevanceExpected > 0 {
 		if r.RelevancePassed >= r.RelevanceExpected {
@@ -92,8 +84,6 @@ type Summary struct {
 	SizesTotal      int     `json:"sizes_total"`
 	ToolsF1Sum      float64 `json:"tools_f1_sum"`
 	ToolsCount      int     `json:"tools_count"`
-	HandoffsPassed  int     `json:"handoffs_passed"`
-	HandoffsTotal   int     `json:"handoffs_total"`
 	RelevancePassed float64 `json:"relevance_passed"`
 	RelevanceTotal  float64 `json:"relevance_total"`
 }

Original file line number	Diff line number	Diff line change
`@@ -350,8 +350,6 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res`
`350`	`350`	`result.ToolCallsScore = toolCallF1Score(expectedToolCalls, actualToolCalls)`
`351`	`351`	`}`
`352`	`352`
`353`		`- result.HandoffsMatch = countHandoffs(expectedToolCalls) == countHandoffs(actualToolCalls)`
`354`		`-`
`355`	`353`	`if r.judge != nil && len(evals.Relevance) > 0 {`
`356`	`354`	`// Use transcript for relevance checking to preserve temporal ordering`
`357`	`355`	`transcript := buildTranscript(events)`