Skip to content

Commit 388ae8b

Browse files
committed
Remove handoffs scoring from the evals
Signed-off-by: David Gageot <david.gageot@docker.com>
1 parent f11de8a commit 388ae8b

5 files changed

Lines changed: 41 additions & 129 deletions

File tree

docs/features/evaluation/index.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ docker-agent evaluates agents across four dimensions:
130130
| **Tool Calls (F1)** | F1 score between the expected tool call sequence (from the recorded session) and the actual tool calls made by the agent. |
131131
| **Relevance** | An LLM judge (configurable via `--judge-model`) evaluates whether each relevance statement is satisfied by the response. |
132132
| **Size** | Whether the response length matches the expected size category (S/M/L/XL). |
133-
| **Handoffs** | For multi-agent configs, whether task delegation matched the expected agent handoff pattern. |
134133

135134
## Creating Eval Sessions
136135

@@ -192,7 +191,6 @@ $ docker agent eval demo.yaml ./evals
192191
Summary: 2/2 passed
193192
Sizes: 0/0
194193
Tool Calls: avg F1 1.00 (2 evals)
195-
Handoffs: 2/2
196194
Relevance: 3/3
197195

198196
Sessions DB: ./evals/results/happy-panda-1234.db

pkg/evaluation/eval.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -350,8 +350,6 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
350350
result.ToolCallsScore = toolCallF1Score(expectedToolCalls, actualToolCalls)
351351
}
352352

353-
result.HandoffsMatch = countHandoffs(expectedToolCalls) == countHandoffs(actualToolCalls)
354-
355353
if r.judge != nil && len(evals.Relevance) > 0 {
356354
// Use transcript for relevance checking to preserve temporal ordering
357355
transcript := buildTranscript(events)

pkg/evaluation/eval_test.go

Lines changed: 41 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -126,30 +126,6 @@ func TestGetResponseSize(t *testing.T) {
126126
}
127127
}
128128

129-
func TestCountHandoffs(t *testing.T) {
130-
t.Parallel()
131-
132-
tests := []struct {
133-
name string
134-
toolCalls []string
135-
want int
136-
}{
137-
{"no tool calls", []string{}, 0},
138-
{"no handoffs", []string{"search", "read_file"}, 0},
139-
{"one handoff", []string{"handoff", "read_file"}, 1},
140-
{"one transfer_task", []string{"transfer_task", "read_file"}, 0},
141-
{"multiple handoffs", []string{"handoff", "transfer_task", "handoff"}, 2},
142-
}
143-
144-
for _, tt := range tests {
145-
t.Run(tt.name, func(t *testing.T) {
146-
t.Parallel()
147-
got := countHandoffs(tt.toolCalls)
148-
assert.Equal(t, tt.want, got)
149-
})
150-
}
151-
}
152-
153129
func TestParseJudgeResponse(t *testing.T) {
154130
t.Parallel()
155131

@@ -202,32 +178,26 @@ func TestResultCheckResults(t *testing.T) {
202178
},
203179
{
204180
name: "all checks pass",
205-
result: Result{SizeExpected: "M", Size: "M", ToolCallsExpected: 1, ToolCallsScore: 1.0, HandoffsMatch: true, RelevanceExpected: 2, RelevancePassed: 2},
206-
wantSuccess: []string{"size M", "tool calls", "handoffs", "relevance 2/2"},
181+
result: Result{SizeExpected: "M", Size: "M", ToolCallsExpected: 1, ToolCallsScore: 1.0, RelevanceExpected: 2, RelevancePassed: 2},
182+
wantSuccess: []string{"size M", "tool calls", "relevance 2/2"},
207183
wantFailures: nil,
208184
},
209185
{
210186
name: "size mismatch",
211-
result: Result{SizeExpected: "M", Size: "S", HandoffsMatch: true},
212-
wantSuccess: []string{"handoffs"},
187+
result: Result{SizeExpected: "M", Size: "S"},
188+
wantSuccess: nil,
213189
wantFailures: []string{"size expected M, got S"},
214190
},
215191
{
216192
name: "tool calls failed",
217-
result: Result{ToolCallsExpected: 1, ToolCallsScore: 0.5, HandoffsMatch: true},
218-
wantSuccess: []string{"handoffs"},
219-
wantFailures: []string{"tool calls score 0.50"},
220-
},
221-
{
222-
name: "handoffs mismatch",
223-
result: Result{HandoffsMatch: false},
193+
result: Result{ToolCallsExpected: 1, ToolCallsScore: 0.5},
224194
wantSuccess: nil,
225-
wantFailures: []string{"handoffs mismatch"},
195+
wantFailures: []string{"tool calls score 0.50"},
226196
},
227197
{
228198
name: "relevance failures listed",
229-
result: Result{HandoffsMatch: true, RelevanceExpected: 2, RelevancePassed: 0, FailedRelevance: []RelevanceResult{{Criterion: "check A", Reason: "reason A"}, {Criterion: "check B", Reason: "reason B"}}},
230-
wantSuccess: []string{"handoffs"},
199+
result: Result{RelevanceExpected: 2, RelevancePassed: 0, FailedRelevance: []RelevanceResult{{Criterion: "check A", Reason: "reason A"}, {Criterion: "check B", Reason: "reason B"}}},
200+
wantSuccess: nil,
231201
wantFailures: []string{"relevance: check A (reason: reason A)", "relevance: check B (reason: reason B)"},
232202
},
233203
}
@@ -252,82 +222,68 @@ func TestComputeSummary(t *testing.T) {
252222
wantTotalEvals int
253223
wantSizesPassed int
254224
wantSizesTotal int
255-
wantHandoffs int
256-
wantHandoffsTotal int
257225
wantRelevance float64
258226
wantRelevanceTotal float64
259227
}{
260228
{
261-
name: "no results",
262-
results: []Result{},
263-
wantTotalCost: 0,
264-
wantTotalEvals: 0,
265-
wantSizesPassed: 0,
266-
wantSizesTotal: 0,
267-
wantHandoffs: 0,
268-
wantHandoffsTotal: 0,
229+
name: "no results",
230+
results: []Result{},
231+
wantTotalCost: 0,
232+
wantTotalEvals: 0,
233+
wantSizesPassed: 0,
234+
wantSizesTotal: 0,
269235
},
270236
{
271237
name: "all passed",
272238
results: []Result{
273239
{
274-
Title: "session1",
275-
Cost: 0.01,
276-
SizeExpected: "M",
277-
Size: "M",
278-
HandoffsMatch: true,
240+
Title: "session1",
241+
Cost: 0.01,
242+
SizeExpected: "M",
243+
Size: "M",
279244
},
280245
},
281-
wantTotalCost: 0.01,
282-
wantTotalEvals: 1,
283-
wantSizesPassed: 1,
284-
wantSizesTotal: 1,
285-
wantHandoffs: 1,
286-
wantHandoffsTotal: 1,
246+
wantTotalCost: 0.01,
247+
wantTotalEvals: 1,
248+
wantSizesPassed: 1,
249+
wantSizesTotal: 1,
287250
},
288251
{
289252
name: "size mismatch",
290253
results: []Result{
291254
{
292-
Title: "session1",
293-
SizeExpected: "M",
294-
Size: "S",
295-
HandoffsMatch: true,
255+
Title: "session1",
256+
SizeExpected: "M",
257+
Size: "S",
296258
},
297259
},
298-
wantTotalEvals: 1,
299-
wantSizesPassed: 0,
300-
wantSizesTotal: 1,
301-
wantHandoffs: 1,
302-
wantHandoffsTotal: 1,
260+
wantTotalEvals: 1,
261+
wantSizesPassed: 0,
262+
wantSizesTotal: 1,
303263
},
304264
{
305265
name: "multiple sessions",
306266
results: []Result{
307-
{Title: "session1", Cost: 0.01, SizeExpected: "M", Size: "M", HandoffsMatch: true},
308-
{Title: "session2", Cost: 0.02, SizeExpected: "L", Size: "S", HandoffsMatch: false},
309-
{Title: "session3", Cost: 0.03, HandoffsMatch: true},
267+
{Title: "session1", Cost: 0.01, SizeExpected: "M", Size: "M"},
268+
{Title: "session2", Cost: 0.02, SizeExpected: "L", Size: "S"},
269+
{Title: "session3", Cost: 0.03},
310270
},
311-
wantTotalCost: 0.06,
312-
wantTotalEvals: 3,
313-
wantSizesPassed: 1,
314-
wantSizesTotal: 2,
315-
wantHandoffs: 2,
316-
wantHandoffsTotal: 3,
271+
wantTotalCost: 0.06,
272+
wantTotalEvals: 3,
273+
wantSizesPassed: 1,
274+
wantSizesTotal: 2,
317275
},
318276
{
319277
name: "errored results excluded from totals",
320278
results: []Result{
321-
{Title: "session1", Cost: 0.01, SizeExpected: "M", Size: "M", HandoffsMatch: true, RelevanceExpected: 2, RelevancePassed: 2},
279+
{Title: "session1", Cost: 0.01, SizeExpected: "M", Size: "M", RelevanceExpected: 2, RelevancePassed: 2},
322280
{Title: "session2", Cost: 0.02, Error: "docker build failed", SizeExpected: "L", RelevanceExpected: 2},
323281
{Title: "session3", Cost: 0.00, Error: "timeout", RelevanceExpected: 3},
324282
},
325283
wantTotalCost: 0.03, // cost is still counted
326284
wantTotalEvals: 3,
327285
wantSizesPassed: 1,
328286
wantSizesTotal: 1, // only non-errored results count
329-
wantHandoffs: 1,
330-
wantHandoffsTotal: 1, // only non-errored results count
331287
wantRelevance: 2,
332288
wantRelevanceTotal: 2, // only non-errored results count
333289
},
@@ -341,8 +297,6 @@ func TestComputeSummary(t *testing.T) {
341297
assert.InDelta(t, tt.wantTotalCost, summary.TotalCost, 0.0001)
342298
assert.Equal(t, tt.wantSizesPassed, summary.SizesPassed)
343299
assert.Equal(t, tt.wantSizesTotal, summary.SizesTotal)
344-
assert.Equal(t, tt.wantHandoffs, summary.HandoffsPassed)
345-
assert.Equal(t, tt.wantHandoffsTotal, summary.HandoffsTotal)
346300
assert.InDelta(t, tt.wantRelevance, summary.RelevancePassed, 0.0001)
347301
assert.InDelta(t, tt.wantRelevanceTotal, summary.RelevanceTotal, 0.0001)
348302
})
@@ -377,14 +331,12 @@ func TestSaveRunJSON(t *testing.T) {
377331
Timestamp: time.Date(2024, 1, 15, 10, 30, 0, 0, time.UTC),
378332
Duration: 5 * time.Minute,
379333
Results: []Result{
380-
{Title: "test1", Cost: 0.01, HandoffsMatch: true},
334+
{Title: "test1", Cost: 0.01},
381335
{Title: "test2", Cost: 0.02, Error: "failed"},
382336
},
383337
Summary: Summary{
384-
TotalEvals: 2,
385-
TotalCost: 0.03,
386-
HandoffsPassed: 1,
387-
HandoffsTotal: 1,
338+
TotalEvals: 2,
339+
TotalCost: 0.03,
388340
},
389341
}
390342

@@ -581,15 +533,12 @@ func TestPrintSummary(t *testing.T) {
581533
TotalEvals: 10,
582534
FailedEvals: 5,
583535
TotalCost: 0.05,
584-
HandoffsPassed: 3,
585-
HandoffsTotal: 5,
586536
RelevancePassed: 8,
587537
RelevanceTotal: 10,
588538
},
589539
duration: 2 * time.Minute,
590540
wantContains: []string{
591541
"Errors: 5/10 evaluations failed",
592-
"Handoffs: 3/5 passed",
593542
"Relevance: 8/10 passed",
594543
"Total Cost: $0.050000",
595544
"Total Time: 2m0s",
@@ -602,15 +551,12 @@ func TestPrintSummary(t *testing.T) {
602551
TotalCost: 0.1,
603552
SizesPassed: 4,
604553
SizesTotal: 5,
605-
HandoffsPassed: 5,
606-
HandoffsTotal: 5,
607554
RelevancePassed: 10,
608555
RelevanceTotal: 10,
609556
},
610557
duration: 1 * time.Minute,
611558
wantContains: []string{
612559
"Sizes: 4/5 passed",
613-
"Handoffs: 5/5 passed",
614560
"Relevance: 10/10 passed",
615561
"Total Cost: $0.100000",
616562
},
@@ -683,14 +629,12 @@ func TestProgressBarPrintResult(t *testing.T) {
683629
{
684630
name: "successful result",
685631
result: Result{
686-
Title: "test-session",
687-
Cost: 0.005,
688-
HandoffsMatch: true,
632+
Title: "test-session",
633+
Cost: 0.005,
689634
},
690635
wantContains: []string{
691636
"✓ test-session",
692637
"$0.005000",
693-
"✓ handoffs",
694638
},
695639
},
696640
{
@@ -712,14 +656,12 @@ func TestProgressBarPrintResult(t *testing.T) {
712656
Cost: 0.01,
713657
SizeExpected: "M",
714658
Size: "S",
715-
HandoffsMatch: true,
716659
RelevanceExpected: 2,
717660
RelevancePassed: 1,
718661
FailedRelevance: []RelevanceResult{{Criterion: "check failed", Reason: "did not meet criteria"}},
719662
},
720663
wantContains: []string{
721664
"✗ mixed-session", // overall failed
722-
"✓ handoffs",
723665
"✗ size expected M, got S",
724666
"✗ relevance: check failed (reason: did not meet criteria)",
725667
},

pkg/evaluation/scoring.go

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -62,16 +62,6 @@ func countStrings(strs []string) map[string]int {
6262
return counts
6363
}
6464

65-
func countHandoffs(toolCalls []string) int {
66-
count := 0
67-
for _, name := range toolCalls {
68-
if name == "handoff" {
69-
count++
70-
}
71-
}
72-
return count
73-
}
74-
7565
func computeSummary(results []Result) Summary {
7666
summary := Summary{
7767
TotalEvals: len(results),
@@ -96,11 +86,6 @@ func computeSummary(results []Result) Summary {
9686
summary.ToolsCount++
9787
}
9888

99-
summary.HandoffsTotal++
100-
if r.HandoffsMatch {
101-
summary.HandoffsPassed++
102-
}
103-
10489
summary.RelevanceTotal += r.RelevanceExpected
10590
summary.RelevancePassed += r.RelevancePassed
10691
}
@@ -118,7 +103,6 @@ func printSummary(out io.Writer, summary Summary, duration time.Duration) {
118103

119104
printMetric(out, "Sizes", summary.SizesPassed, summary.SizesTotal)
120105
printF1Score(out, "Tool Calls", summary.ToolsF1Sum, summary.ToolsCount)
121-
printMetric(out, "Handoffs", summary.HandoffsPassed, summary.HandoffsTotal)
122106
printMetric(out, "Relevance", int(summary.RelevancePassed), int(summary.RelevanceTotal))
123107

124108
fmt.Fprintf(out, "\nTotal Cost: $%.6f\n", summary.TotalCost)

pkg/evaluation/types.go

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ type Result struct {
2525
SizeExpected string `json:"size_expected"`
2626
ToolCallsScore float64 `json:"tool_calls_score"`
2727
ToolCallsExpected float64 `json:"tool_calls_score_expected"`
28-
HandoffsMatch bool `json:"handoffs"`
2928
RelevancePassed float64 `json:"relevance"`
3029
RelevanceExpected float64 `json:"relevance_expected"`
3130
FailedRelevance []RelevanceResult `json:"failed_relevance,omitempty"`
@@ -58,13 +57,6 @@ func (r *Result) checkResults() (successes, failures []string) {
5857
}
5958
}
6059

61-
// Check handoffs
62-
if r.HandoffsMatch {
63-
successes = append(successes, "handoffs")
64-
} else {
65-
failures = append(failures, "handoffs mismatch")
66-
}
67-
6860
// Check relevance
6961
if r.RelevanceExpected > 0 {
7062
if r.RelevancePassed >= r.RelevanceExpected {
@@ -92,8 +84,6 @@ type Summary struct {
9284
SizesTotal int `json:"sizes_total"`
9385
ToolsF1Sum float64 `json:"tools_f1_sum"`
9486
ToolsCount int `json:"tools_count"`
95-
HandoffsPassed int `json:"handoffs_passed"`
96-
HandoffsTotal int `json:"handoffs_total"`
9787
RelevancePassed float64 `json:"relevance_passed"`
9888
RelevanceTotal float64 `json:"relevance_total"`
9989
}

0 commit comments

Comments
 (0)