@@ -36,12 +36,11 @@ type Runner struct {
3636 judge * Judge
3737 runConfig * config.RuntimeConfig
3838
39- // imageCache caches built Docker images by working directory.
40- // Key is the working directory (empty string for no working dir).
41- imageCache map [string ]string
39+ // imageCache caches built Docker images by (workingDir, image) pair.
40+ imageCache map [imageKey ]string
4241 imageCacheMu sync.Mutex
4342
44- // imageBuildGroup deduplicates concurrent image builds for the same working directory .
43+ // imageBuildGroup deduplicates concurrent image builds for the same (workingDir, image) pair .
4544 imageBuildGroup singleflight.Group
4645}
4746
@@ -56,7 +55,7 @@ func newRunner(agentSource config.Source, runConfig *config.RuntimeConfig, judge
5655 agentSource : agentSource ,
5756 judge : judge ,
5857 runConfig : runConfig ,
59- imageCache : make (map [string ]string ),
58+ imageCache : make (map [imageKey ]string ),
6059 }
6160}
6261
@@ -230,63 +229,68 @@ func (r *Runner) loadEvalSessions(ctx context.Context) ([]InputSession, error) {
230229}
231230
232231// preBuildImages pre-builds all unique Docker images needed for the evaluations.
233- // This is done in parallel to avoid serialized builds during evaluation.
232+ // Concurrent calls for the same (workingDir, image) pair are deduplicated by
233+ // getOrBuildImage's singleflight, so we simply iterate over all evals.
234234func (r * Runner ) preBuildImages (ctx context.Context , out io.Writer , evals []InputSession ) error {
235- // Collect unique working directories
236- workingDirs := make (map [string ]struct {})
235+ if len (evals ) == 0 {
236+ return nil
237+ }
238+
239+ // Count unique images to report an accurate number.
240+ unique := make (map [imageKey ]struct {})
237241 for _ , eval := range evals {
242+ var key imageKey
238243 if eval .Evals != nil {
239- workingDirs [ eval .Evals .WorkingDir ] = struct {}{ }
244+ key = imageKey { workingDir : eval .Evals .WorkingDir , image : eval . Evals . Image }
240245 }
246+ unique [key ] = struct {}{}
241247 }
242248
243- if len (workingDirs ) == 0 {
244- return nil
245- }
246-
247- fmt .Fprintf (out , "Pre-building %d Docker image(s)...\n " , len (workingDirs ))
249+ fmt .Fprintf (out , "Pre-building %d Docker image(s)...\n " , len (unique ))
248250
249- // Build images in parallel with limited concurrency
250251 type buildResult struct {
251- workingDir string
252- err error
252+ title string
253+ err error
253254 }
254255
255- work := make (chan string , len (workingDirs ))
256- for wd := range workingDirs {
257- work <- wd
256+ work := make (chan InputSession , len (evals ))
257+ for _ , eval := range evals {
258+ work <- eval
258259 }
259260 close (work )
260261
261- results := make (chan buildResult , len (workingDirs ))
262+ results := make (chan buildResult , len (evals ))
262263
263- // Use same concurrency as evaluation runs for image builds
264- buildWorkers := min (r .Concurrency , len (workingDirs ))
264+ buildWorkers := min (r .Concurrency , len (evals ))
265265 var wg sync.WaitGroup
266266 for range buildWorkers {
267267 wg .Go (func () {
268- for wd := range work {
268+ for eval := range work {
269269 if ctx .Err () != nil {
270- results <- buildResult {workingDir : wd , err : ctx .Err ()}
270+ results <- buildResult {title : eval . Title , err : ctx .Err ()}
271271 continue
272272 }
273- _ , err := r .getOrBuildImage (ctx , wd )
274- results <- buildResult {workingDir : wd , err : err }
273+
274+ criteria := eval .Evals
275+ if criteria == nil {
276+ criteria = & session.EvalCriteria {}
277+ }
278+
279+ _ , err := r .getOrBuildImage (ctx , criteria )
280+ results <- buildResult {title : eval .Title , err : err }
275281 }
276282 })
277283 }
278284
279- // Wait for all builds to complete
280285 go func () {
281286 wg .Wait ()
282287 close (results )
283288 }()
284289
285- // Collect errors
286290 var errs []error
287291 for result := range results {
288292 if result .err != nil {
289- errs = append (errs , fmt .Errorf ("building image for %q: %w" , result .workingDir , result .err ))
293+ errs = append (errs , fmt .Errorf ("building image for %q: %w" , result .title , result .err ))
290294 }
291295 }
292296
@@ -323,9 +327,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
323327 result .ToolCallsExpected = 1.0
324328 }
325329
326- workingDir := evals .WorkingDir
327-
328- imageID , err := r .getOrBuildImage (ctx , workingDir )
330+ imageID , err := r .getOrBuildImage (ctx , evals )
329331 if err != nil {
330332 return result , fmt .Errorf ("building eval image: %w" , err )
331333 }
0 commit comments