Skip to content

Commit cf81bd3

Browse files
committed
Add per-eval Docker image override via evals.image property
Allow each eval JSON to specify a custom Docker image through the "image" field in the "evals" object, overriding the global --base-image flag. The image build cache key now includes both workingDir and image, so that different images requested for the same working directory are built and cached separately.

Assisted-By: docker-agent
1 parent 0f2fc76 commit cf81bd3

4 files changed

Lines changed: 77 additions & 71 deletions

File tree

pkg/evaluation/Dockerfile.template

Lines changed: 5 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,17 @@
11
# syntax=docker/dockerfile:1
22

3-
FROM docker:dind AS dind
4-
RUN rm -f /usr/local/bin/docker-compose /usr/local/libexec/docker/cli-plugins/docker-compose /usr/local/libexec/docker/cli-plugins/docker-buildx 2>/dev/null || true
3+
FROM alpine:latest
4+
LABEL "io.docker.agent.evals.image"="default"
5+
COPY --from=docker/docker-agent:edge /docker-agent /
56
RUN cat <<-'EOF' >/run.sh
67
#!/usr/bin/env sh
7-
set -euxo pipefail
8-
(
9-
echo "Starting dockerd..."
10-
export TINI_SUBREAPER=1
11-
export DOCKER_DRIVER=vfs
12-
dockerd-entrypoint.sh dockerd &
13-
14-
until docker info > /dev/null 2>&1
15-
do
16-
echo "Waiting for dockerd..."
17-
sleep 1
18-
done
19-
echo "dockerd is ready!"
20-
) >/dev/null 2>&1
21-
8+
set -euo pipefail
229
exec "$@"
2310
EOF
2411
RUN chmod +x /run.sh
25-
26-
FROM scratch
27-
COPY --from=dind / /
28-
COPY --from=docker/docker-agent:edge /docker-agent /
2912
WORKDIR /working_dir
3013
ENV TELEMETRY_ENABLED=false
3114
ENV DOCKER_AGENT_HIDE_TELEMETRY_BANNER=1
3215
ENTRYPOINT ["/run.sh", "/docker-agent", "run", "--exec", "--yolo", "--json"]
3316
{{if .CopyWorkingDir}}COPY . ./
34-
{{end}}
17+
{{end}}

pkg/evaluation/build.go

Lines changed: 36 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ import (
1111
"path/filepath"
1212
"strings"
1313
"text/template"
14+
15+
"github.com/docker/docker-agent/pkg/session"
1416
)
1517

1618
var (
@@ -24,30 +26,38 @@ var (
2426
dockerfileCustomTemplate = template.Must(template.New("DockerfileCustom").Parse(dockerfileCustomTmpl))
2527
)
2628

29+
// imageKey uniquely identifies a Docker image build configuration.
30+
type imageKey struct {
31+
workingDir string
32+
image string
33+
}
34+
35+
// String returns a stable string representation for use as a singleflight key.
36+
func (k imageKey) String() string {
37+
return k.workingDir + "\x00" + k.image
38+
}
39+
2740
// getOrBuildImage returns a cached image ID or builds a new one.
28-
// Images are cached by working directory to avoid redundant builds.
29-
// Concurrent calls for the same working directory are deduplicated
41+
// Concurrent calls for the same (workingDir, image) pair are deduplicated
3042
// using singleflight so that only one build runs at a time per key.
31-
func (r *Runner) getOrBuildImage(ctx context.Context, workingDir string) (string, error) {
43+
func (r *Runner) getOrBuildImage(ctx context.Context, evals *session.EvalCriteria) (string, error) {
44+
key := imageKey{workingDir: evals.WorkingDir, image: evals.Image}
45+
3246
r.imageCacheMu.Lock()
33-
if imageID, ok := r.imageCache[workingDir]; ok {
47+
if imageID, ok := r.imageCache[key]; ok {
3448
r.imageCacheMu.Unlock()
3549
return imageID, nil
3650
}
3751
r.imageCacheMu.Unlock()
3852

39-
// singleflight ensures only one build per working directory runs at a time.
40-
// The cache write inside the callback guarantees the result is available
41-
// before singleflight releases the key, so subsequent callers always
42-
// hit the cache above.
43-
v, err, _ := r.imageBuildGroup.Do(workingDir, func() (any, error) {
44-
imageID, err := r.buildEvalImage(ctx, workingDir)
53+
v, err, _ := r.imageBuildGroup.Do(key.String(), func() (any, error) {
54+
imageID, err := r.buildEvalImage(ctx, evals)
4555
if err != nil {
4656
return "", err
4757
}
4858

4959
r.imageCacheMu.Lock()
50-
r.imageCache[workingDir] = imageID
60+
r.imageCache[key] = imageID
5161
r.imageCacheMu.Unlock()
5262

5363
return imageID, nil
@@ -59,18 +69,28 @@ func (r *Runner) getOrBuildImage(ctx context.Context, workingDir string) (string
5969
return v.(string), nil
6070
}
6171

62-
func (r *Runner) buildEvalImage(ctx context.Context, workingDir string) (string, error) {
72+
// resolveBaseImage returns the effective base image for an eval.
73+
// The per-eval image takes priority over the global --base-image flag.
74+
func (r *Runner) resolveBaseImage(evals *session.EvalCriteria) string {
75+
if evals.Image != "" {
76+
return evals.Image
77+
}
78+
return r.BaseImage
79+
}
80+
81+
// buildEvalImage builds a Docker image for an evaluation.
82+
func (r *Runner) buildEvalImage(ctx context.Context, evals *session.EvalCriteria) (string, error) {
6383
var buildContext string
6484
var data struct {
6585
CopyWorkingDir bool
6686
BaseImage string
6787
}
6888

69-
if workingDir == "" {
89+
if evals.WorkingDir == "" {
7090
buildContext = r.EvalsDir
7191
data.CopyWorkingDir = false
7292
} else {
73-
buildContext = filepath.Join(r.EvalsDir, "working_dirs", workingDir)
93+
buildContext = filepath.Join(r.EvalsDir, "working_dirs", evals.WorkingDir)
7494
if _, err := os.Stat(buildContext); os.IsNotExist(err) {
7595
return "", fmt.Errorf("working directory not found: %s", buildContext)
7696
}
@@ -79,9 +99,9 @@ func (r *Runner) buildEvalImage(ctx context.Context, workingDir string) (string,
7999

80100
// Choose template based on whether a custom base image is provided
81101
tmpl := dockerfileTemplate
82-
if r.BaseImage != "" {
102+
if baseImage := r.resolveBaseImage(evals); baseImage != "" {
83103
tmpl = dockerfileCustomTemplate
84-
data.BaseImage = r.BaseImage
104+
data.BaseImage = baseImage
85105
}
86106

87107
var dockerfile bytes.Buffer

pkg/evaluation/eval.go

Lines changed: 35 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,11 @@ type Runner struct {
3636
judge *Judge
3737
runConfig *config.RuntimeConfig
3838

39-
// imageCache caches built Docker images by working directory.
40-
// Key is the working directory (empty string for no working dir).
41-
imageCache map[string]string
39+
// imageCache caches built Docker images by (workingDir, image) pair.
40+
imageCache map[imageKey]string
4241
imageCacheMu sync.Mutex
4342

44-
// imageBuildGroup deduplicates concurrent image builds for the same working directory.
43+
// imageBuildGroup deduplicates concurrent image builds for the same (workingDir, image) pair.
4544
imageBuildGroup singleflight.Group
4645
}
4746

@@ -56,7 +55,7 @@ func newRunner(agentSource config.Source, runConfig *config.RuntimeConfig, judge
5655
agentSource: agentSource,
5756
judge: judge,
5857
runConfig: runConfig,
59-
imageCache: make(map[string]string),
58+
imageCache: make(map[imageKey]string),
6059
}
6160
}
6261

@@ -230,63 +229,68 @@ func (r *Runner) loadEvalSessions(ctx context.Context) ([]InputSession, error) {
230229
}
231230

232231
// preBuildImages pre-builds all unique Docker images needed for the evaluations.
233-
// This is done in parallel to avoid serialized builds during evaluation.
232+
// Concurrent calls for the same (workingDir, image) pair are deduplicated by
233+
// getOrBuildImage's singleflight, so we simply iterate over all evals.
234234
func (r *Runner) preBuildImages(ctx context.Context, out io.Writer, evals []InputSession) error {
235-
// Collect unique working directories
236-
workingDirs := make(map[string]struct{})
235+
if len(evals) == 0 {
236+
return nil
237+
}
238+
239+
// Count unique images to report an accurate number.
240+
unique := make(map[imageKey]struct{})
237241
for _, eval := range evals {
242+
var key imageKey
238243
if eval.Evals != nil {
239-
workingDirs[eval.Evals.WorkingDir] = struct{}{}
244+
key = imageKey{workingDir: eval.Evals.WorkingDir, image: eval.Evals.Image}
240245
}
246+
unique[key] = struct{}{}
241247
}
242248

243-
if len(workingDirs) == 0 {
244-
return nil
245-
}
246-
247-
fmt.Fprintf(out, "Pre-building %d Docker image(s)...\n", len(workingDirs))
249+
fmt.Fprintf(out, "Pre-building %d Docker image(s)...\n", len(unique))
248250

249-
// Build images in parallel with limited concurrency
250251
type buildResult struct {
251-
workingDir string
252-
err error
252+
title string
253+
err error
253254
}
254255

255-
work := make(chan string, len(workingDirs))
256-
for wd := range workingDirs {
257-
work <- wd
256+
work := make(chan InputSession, len(evals))
257+
for _, eval := range evals {
258+
work <- eval
258259
}
259260
close(work)
260261

261-
results := make(chan buildResult, len(workingDirs))
262+
results := make(chan buildResult, len(evals))
262263

263-
// Use same concurrency as evaluation runs for image builds
264-
buildWorkers := min(r.Concurrency, len(workingDirs))
264+
buildWorkers := min(r.Concurrency, len(evals))
265265
var wg sync.WaitGroup
266266
for range buildWorkers {
267267
wg.Go(func() {
268-
for wd := range work {
268+
for eval := range work {
269269
if ctx.Err() != nil {
270-
results <- buildResult{workingDir: wd, err: ctx.Err()}
270+
results <- buildResult{title: eval.Title, err: ctx.Err()}
271271
continue
272272
}
273-
_, err := r.getOrBuildImage(ctx, wd)
274-
results <- buildResult{workingDir: wd, err: err}
273+
274+
criteria := eval.Evals
275+
if criteria == nil {
276+
criteria = &session.EvalCriteria{}
277+
}
278+
279+
_, err := r.getOrBuildImage(ctx, criteria)
280+
results <- buildResult{title: eval.Title, err: err}
275281
}
276282
})
277283
}
278284

279-
// Wait for all builds to complete
280285
go func() {
281286
wg.Wait()
282287
close(results)
283288
}()
284289

285-
// Collect errors
286290
var errs []error
287291
for result := range results {
288292
if result.err != nil {
289-
errs = append(errs, fmt.Errorf("building image for %q: %w", result.workingDir, result.err))
293+
errs = append(errs, fmt.Errorf("building image for %q: %w", result.title, result.err))
290294
}
291295
}
292296

@@ -323,9 +327,7 @@ func (r *Runner) runSingleEval(ctx context.Context, evalSess *InputSession) (Res
323327
result.ToolCallsExpected = 1.0
324328
}
325329

326-
workingDir := evals.WorkingDir
327-
328-
imageID, err := r.getOrBuildImage(ctx, workingDir)
330+
imageID, err := r.getOrBuildImage(ctx, evals)
329331
if err != nil {
330332
return result, fmt.Errorf("building eval image: %w", err)
331333
}

pkg/session/session.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,7 @@ type EvalCriteria struct {
222222
WorkingDir string `json:"working_dir,omitempty"` // Subdirectory under evals/working_dirs/
223223
Size string `json:"size,omitempty"` // Expected response size: S, M, L, XL
224224
Setup string `json:"setup,omitempty"` // Optional sh script to run in the container before docker agent run --exec
225+
Image string `json:"image,omitempty"` // Custom Docker image for this eval (overrides --base-image)
225226
}
226227

227228
// UnmarshalJSON implements custom JSON unmarshaling for EvalCriteria that

0 commit comments

Comments
 (0)