@@ -126,30 +126,6 @@ func TestGetResponseSize(t *testing.T) {
126126 }
127127}
128128
// TestCountHandoffs is a table-driven test for countHandoffs.
// NOTE(review): every line of this function carries a "-" marker in this diff
// rendering, i.e. the change shown here removes the test; countHandoffs itself
// is not visible in this view.
129- func TestCountHandoffs (t * testing.T ) {
130- t .Parallel ()
131-
// Each case pairs a list of tool-call names with the expected handoff count.
132- tests := []struct {
133- name string
134- toolCalls []string
135- want int
136- }{
137- {"no tool calls" , []string {}, 0 },
138- {"no handoffs" , []string {"search" , "read_file" }, 0 },
139- {"one handoff" , []string {"handoff" , "read_file" }, 1 },
// Per the expectations below, "transfer_task" entries are not counted;
// only "handoff" entries contribute to the expected total.
140- {"one transfer_task" , []string {"transfer_task" , "read_file" }, 0 },
141- {"multiple handoffs" , []string {"handoff" , "transfer_task" , "handoff" }, 2 },
142- }
143-
// Run every case as its own parallel subtest and compare the returned count
// with the expected value.
144- for _ , tt := range tests {
145- t .Run (tt .name , func (t * testing.T ) {
146- t .Parallel ()
147- got := countHandoffs (tt .toolCalls )
148- assert .Equal (t , tt .want , got )
149- })
150- }
151- }
152-
153129func TestParseJudgeResponse (t * testing.T ) {
154130 t .Parallel ()
155131
@@ -202,32 +178,26 @@ func TestResultCheckResults(t *testing.T) {
202178 },
203179 {
204180 name : "all checks pass" ,
205- result : Result {SizeExpected : "M" , Size : "M" , ToolCallsExpected : 1 , ToolCallsScore : 1.0 , HandoffsMatch : true , RelevanceExpected : 2 , RelevancePassed : 2 },
206- wantSuccess : []string {"size M" , "tool calls" , "handoffs" , " relevance 2/2" },
181+ result : Result {SizeExpected : "M" , Size : "M" , ToolCallsExpected : 1 , ToolCallsScore : 1.0 , RelevanceExpected : 2 , RelevancePassed : 2 },
182+ wantSuccess : []string {"size M" , "tool calls" , "relevance 2/2" },
207183 wantFailures : nil ,
208184 },
209185 {
210186 name : "size mismatch" ,
211- result : Result {SizeExpected : "M" , Size : "S" , HandoffsMatch : true },
212- wantSuccess : [] string { "handoffs" } ,
187+ result : Result {SizeExpected : "M" , Size : "S" },
188+ wantSuccess : nil ,
213189 wantFailures : []string {"size expected M, got S" },
214190 },
215191 {
216192 name : "tool calls failed" ,
217- result : Result {ToolCallsExpected : 1 , ToolCallsScore : 0.5 , HandoffsMatch : true },
218- wantSuccess : []string {"handoffs" },
219- wantFailures : []string {"tool calls score 0.50" },
220- },
221- {
222- name : "handoffs mismatch" ,
223- result : Result {HandoffsMatch : false },
193+ result : Result {ToolCallsExpected : 1 , ToolCallsScore : 0.5 },
224194 wantSuccess : nil ,
225- wantFailures : []string {"handoffs mismatch " },
195+ wantFailures : []string {"tool calls score 0.50 " },
226196 },
227197 {
228198 name : "relevance failures listed" ,
229- result : Result {HandoffsMatch : true , RelevanceExpected : 2 , RelevancePassed : 0 , FailedRelevance : []RelevanceResult {{Criterion : "check A" , Reason : "reason A" }, {Criterion : "check B" , Reason : "reason B" }}},
230- wantSuccess : [] string { "handoffs" } ,
199+ result : Result {RelevanceExpected : 2 , RelevancePassed : 0 , FailedRelevance : []RelevanceResult {{Criterion : "check A" , Reason : "reason A" }, {Criterion : "check B" , Reason : "reason B" }}},
200+ wantSuccess : nil ,
231201 wantFailures : []string {"relevance: check A (reason: reason A)" , "relevance: check B (reason: reason B)" },
232202 },
233203 }
@@ -252,82 +222,68 @@ func TestComputeSummary(t *testing.T) {
252222 wantTotalEvals int
253223 wantSizesPassed int
254224 wantSizesTotal int
255- wantHandoffs int
256- wantHandoffsTotal int
257225 wantRelevance float64
258226 wantRelevanceTotal float64
259227 }{
260228 {
261- name : "no results" ,
262- results : []Result {},
263- wantTotalCost : 0 ,
264- wantTotalEvals : 0 ,
265- wantSizesPassed : 0 ,
266- wantSizesTotal : 0 ,
267- wantHandoffs : 0 ,
268- wantHandoffsTotal : 0 ,
229+ name : "no results" ,
230+ results : []Result {},
231+ wantTotalCost : 0 ,
232+ wantTotalEvals : 0 ,
233+ wantSizesPassed : 0 ,
234+ wantSizesTotal : 0 ,
269235 },
270236 {
271237 name : "all passed" ,
272238 results : []Result {
273239 {
274- Title : "session1" ,
275- Cost : 0.01 ,
276- SizeExpected : "M" ,
277- Size : "M" ,
278- HandoffsMatch : true ,
240+ Title : "session1" ,
241+ Cost : 0.01 ,
242+ SizeExpected : "M" ,
243+ Size : "M" ,
279244 },
280245 },
281- wantTotalCost : 0.01 ,
282- wantTotalEvals : 1 ,
283- wantSizesPassed : 1 ,
284- wantSizesTotal : 1 ,
285- wantHandoffs : 1 ,
286- wantHandoffsTotal : 1 ,
246+ wantTotalCost : 0.01 ,
247+ wantTotalEvals : 1 ,
248+ wantSizesPassed : 1 ,
249+ wantSizesTotal : 1 ,
287250 },
288251 {
289252 name : "size mismatch" ,
290253 results : []Result {
291254 {
292- Title : "session1" ,
293- SizeExpected : "M" ,
294- Size : "S" ,
295- HandoffsMatch : true ,
255+ Title : "session1" ,
256+ SizeExpected : "M" ,
257+ Size : "S" ,
296258 },
297259 },
298- wantTotalEvals : 1 ,
299- wantSizesPassed : 0 ,
300- wantSizesTotal : 1 ,
301- wantHandoffs : 1 ,
302- wantHandoffsTotal : 1 ,
260+ wantTotalEvals : 1 ,
261+ wantSizesPassed : 0 ,
262+ wantSizesTotal : 1 ,
303263 },
304264 {
305265 name : "multiple sessions" ,
306266 results : []Result {
307- {Title : "session1" , Cost : 0.01 , SizeExpected : "M" , Size : "M" , HandoffsMatch : true },
308- {Title : "session2" , Cost : 0.02 , SizeExpected : "L" , Size : "S" , HandoffsMatch : false },
309- {Title : "session3" , Cost : 0.03 , HandoffsMatch : true },
267+ {Title : "session1" , Cost : 0.01 , SizeExpected : "M" , Size : "M" },
268+ {Title : "session2" , Cost : 0.02 , SizeExpected : "L" , Size : "S" },
269+ {Title : "session3" , Cost : 0.03 },
310270 },
311- wantTotalCost : 0.06 ,
312- wantTotalEvals : 3 ,
313- wantSizesPassed : 1 ,
314- wantSizesTotal : 2 ,
315- wantHandoffs : 2 ,
316- wantHandoffsTotal : 3 ,
271+ wantTotalCost : 0.06 ,
272+ wantTotalEvals : 3 ,
273+ wantSizesPassed : 1 ,
274+ wantSizesTotal : 2 ,
317275 },
318276 {
319277 name : "errored results excluded from totals" ,
320278 results : []Result {
321- {Title : "session1" , Cost : 0.01 , SizeExpected : "M" , Size : "M" , HandoffsMatch : true , RelevanceExpected : 2 , RelevancePassed : 2 },
279+ {Title : "session1" , Cost : 0.01 , SizeExpected : "M" , Size : "M" , RelevanceExpected : 2 , RelevancePassed : 2 },
322280 {Title : "session2" , Cost : 0.02 , Error : "docker build failed" , SizeExpected : "L" , RelevanceExpected : 2 },
323281 {Title : "session3" , Cost : 0.00 , Error : "timeout" , RelevanceExpected : 3 },
324282 },
325283 wantTotalCost : 0.03 , // cost is still counted
326284 wantTotalEvals : 3 ,
327285 wantSizesPassed : 1 ,
328286 wantSizesTotal : 1 , // only non-errored results count
329- wantHandoffs : 1 ,
330- wantHandoffsTotal : 1 , // only non-errored results count
331287 wantRelevance : 2 ,
332288 wantRelevanceTotal : 2 , // only non-errored results count
333289 },
@@ -341,8 +297,6 @@ func TestComputeSummary(t *testing.T) {
341297 assert .InDelta (t , tt .wantTotalCost , summary .TotalCost , 0.0001 )
342298 assert .Equal (t , tt .wantSizesPassed , summary .SizesPassed )
343299 assert .Equal (t , tt .wantSizesTotal , summary .SizesTotal )
344- assert .Equal (t , tt .wantHandoffs , summary .HandoffsPassed )
345- assert .Equal (t , tt .wantHandoffsTotal , summary .HandoffsTotal )
346300 assert .InDelta (t , tt .wantRelevance , summary .RelevancePassed , 0.0001 )
347301 assert .InDelta (t , tt .wantRelevanceTotal , summary .RelevanceTotal , 0.0001 )
348302 })
@@ -377,14 +331,12 @@ func TestSaveRunJSON(t *testing.T) {
377331 Timestamp : time .Date (2024 , 1 , 15 , 10 , 30 , 0 , 0 , time .UTC ),
378332 Duration : 5 * time .Minute ,
379333 Results : []Result {
380- {Title : "test1" , Cost : 0.01 , HandoffsMatch : true },
334+ {Title : "test1" , Cost : 0.01 },
381335 {Title : "test2" , Cost : 0.02 , Error : "failed" },
382336 },
383337 Summary : Summary {
384- TotalEvals : 2 ,
385- TotalCost : 0.03 ,
386- HandoffsPassed : 1 ,
387- HandoffsTotal : 1 ,
338+ TotalEvals : 2 ,
339+ TotalCost : 0.03 ,
388340 },
389341 }
390342
@@ -581,15 +533,12 @@ func TestPrintSummary(t *testing.T) {
581533 TotalEvals : 10 ,
582534 FailedEvals : 5 ,
583535 TotalCost : 0.05 ,
584- HandoffsPassed : 3 ,
585- HandoffsTotal : 5 ,
586536 RelevancePassed : 8 ,
587537 RelevanceTotal : 10 ,
588538 },
589539 duration : 2 * time .Minute ,
590540 wantContains : []string {
591541 "Errors: 5/10 evaluations failed" ,
592- "Handoffs: 3/5 passed" ,
593542 "Relevance: 8/10 passed" ,
594543 "Total Cost: $0.050000" ,
595544 "Total Time: 2m0s" ,
@@ -602,15 +551,12 @@ func TestPrintSummary(t *testing.T) {
602551 TotalCost : 0.1 ,
603552 SizesPassed : 4 ,
604553 SizesTotal : 5 ,
605- HandoffsPassed : 5 ,
606- HandoffsTotal : 5 ,
607554 RelevancePassed : 10 ,
608555 RelevanceTotal : 10 ,
609556 },
610557 duration : 1 * time .Minute ,
611558 wantContains : []string {
612559 "Sizes: 4/5 passed" ,
613- "Handoffs: 5/5 passed" ,
614560 "Relevance: 10/10 passed" ,
615561 "Total Cost: $0.100000" ,
616562 },
@@ -683,14 +629,12 @@ func TestProgressBarPrintResult(t *testing.T) {
683629 {
684630 name : "successful result" ,
685631 result : Result {
686- Title : "test-session" ,
687- Cost : 0.005 ,
688- HandoffsMatch : true ,
632+ Title : "test-session" ,
633+ Cost : 0.005 ,
689634 },
690635 wantContains : []string {
691636 "✓ test-session" ,
692637 "$0.005000" ,
693- "✓ handoffs" ,
694638 },
695639 },
696640 {
@@ -712,14 +656,12 @@ func TestProgressBarPrintResult(t *testing.T) {
712656 Cost : 0.01 ,
713657 SizeExpected : "M" ,
714658 Size : "S" ,
715- HandoffsMatch : true ,
716659 RelevanceExpected : 2 ,
717660 RelevancePassed : 1 ,
718661 FailedRelevance : []RelevanceResult {{Criterion : "check failed" , Reason : "did not meet criteria" }},
719662 },
720663 wantContains : []string {
721664 "✗ mixed-session" , // overall failed
722- "✓ handoffs" ,
723665 "✗ size expected M, got S" ,
724666 "✗ relevance: check failed (reason: did not meet criteria)" ,
725667 },
0 commit comments