Skip to content

Commit f11de8a

Browse files
committed
Remove duplication around LLM-as-a-judge
Signed-off-by: David Gageot <david.gageot@docker.com>
1 parent 37ec4d1 commit f11de8a

3 files changed

Lines changed: 7 additions & 52 deletions

File tree

pkg/evaluation/eval.go

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ type Runner struct {
4949
func newRunner(agentSource config.Source, runConfig *config.RuntimeConfig, judgeModel provider.Provider, cfg Config) *Runner {
5050
var judge *Judge
5151
if judgeModel != nil {
52-
judge = NewJudge(judgeModel, runConfig, cfg.Concurrency)
52+
judge = NewJudge(judgeModel, cfg.Concurrency)
5353
}
5454
return &Runner{
5555
Config: cfg,
@@ -626,6 +626,7 @@ func createJudgeModel(ctx context.Context, judgeModel string, runConfig *config.
626626

627627
opts := []options.Opt{
628628
options.WithThinking(false),
629+
options.WithStructuredOutput(judgeResponseSchema),
629630
}
630631
if runConfig.ModelsGateway != "" {
631632
opts = append(opts, options.WithGateway(runConfig.ModelsGateway))

pkg/evaluation/judge.go

Lines changed: 2 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -11,10 +11,8 @@ import (
1111
"sync"
1212

1313
"github.com/docker/docker-agent/pkg/chat"
14-
"github.com/docker/docker-agent/pkg/config"
1514
"github.com/docker/docker-agent/pkg/config/latest"
1615
"github.com/docker/docker-agent/pkg/model/provider"
17-
"github.com/docker/docker-agent/pkg/model/provider/options"
1816
)
1917

2018
// relevancePrompt is the prompt template for the judge model to evaluate responses.
@@ -58,26 +56,17 @@ var judgeResponseSchema = &latest.StructuredOutput{
5856
// Judge runs LLM-as-a-judge relevance checks concurrently.
5957
type Judge struct {
6058
model provider.Provider
61-
runConfig *config.RuntimeConfig
6259
concurrency int
63-
64-
// judgeWithSchema is a provider pre-configured with structured output.
65-
// Created lazily on first use and reused across all relevance checks.
66-
// Protected by judgeWithSchemaMu; only cached on success so that
67-
// transient errors (e.g. context cancellation) can be retried.
68-
judgeWithSchema provider.Provider
69-
judgeWithSchemaMu sync.Mutex
7060
}
7161

7262
// NewJudge creates a new Judge that runs relevance checks with the given concurrency.
7363
// Concurrency defaults to 1 if n < 1.
74-
func NewJudge(model provider.Provider, runConfig *config.RuntimeConfig, concurrency int) *Judge {
64+
func NewJudge(model provider.Provider, concurrency int) *Judge {
7565
if concurrency < 1 {
7666
concurrency = 1
7767
}
7868
return &Judge{
7969
model: model,
80-
runConfig: runConfig,
8170
concurrency: concurrency,
8271
}
8372
}
@@ -180,48 +169,13 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []
180169
return passed, failed, nil
181170
}
182171

183-
// getOrCreateJudgeWithSchema returns a provider pre-configured with structured output.
184-
// The provider is created once and reused across all relevance checks.
185-
// Unlike sync.Once, transient failures (e.g. context cancellation) are not
186-
// cached, allowing subsequent calls to retry.
187-
func (j *Judge) getOrCreateJudgeWithSchema(ctx context.Context) (provider.Provider, error) {
188-
j.judgeWithSchemaMu.Lock()
189-
defer j.judgeWithSchemaMu.Unlock()
190-
191-
if j.judgeWithSchema != nil {
192-
return j.judgeWithSchema, nil
193-
}
194-
195-
opts := []options.Opt{
196-
options.WithStructuredOutput(judgeResponseSchema),
197-
options.WithThinking(false),
198-
}
199-
if j.runConfig.ModelsGateway != "" {
200-
opts = append(opts, options.WithGateway(j.runConfig.ModelsGateway))
201-
}
202-
203-
modelCfg := j.model.BaseConfig().ModelConfig
204-
p, err := provider.New(ctx, &modelCfg, j.runConfig.EnvProvider(), opts...)
205-
if err != nil {
206-
return nil, err
207-
}
208-
209-
j.judgeWithSchema = p
210-
return j.judgeWithSchema, nil
211-
}
212-
213172
// checkSingle checks a single relevance criterion against the response.
214173
// It returns whether the check passed, the reason provided by the judge, and any error.
215174
func (j *Judge) checkSingle(ctx context.Context, response, criterion string) (passed bool, reason string, err error) {
216-
judgeWithSchema, err := j.getOrCreateJudgeWithSchema(ctx)
217-
if err != nil {
218-
return false, "", fmt.Errorf("creating judge provider with structured output: %w", err)
219-
}
220-
221175
prompt := fmt.Sprintf(relevancePrompt, response, criterion)
222176
messages := []chat.Message{{Role: chat.MessageRoleUser, Content: prompt}}
223177

224-
stream, err := judgeWithSchema.CreateChatCompletionStream(ctx, messages, nil)
178+
stream, err := j.model.CreateChatCompletionStream(ctx, messages, nil)
225179
if err != nil {
226180
return false, "", fmt.Errorf("creating chat completion: %w", err)
227181
}

pkg/evaluation/judge_test.go

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@ func TestNewJudge(t *testing.T) {
3737
t.Run(tt.name, func(t *testing.T) {
3838
t.Parallel()
3939

40-
judge := NewJudge(nil, nil, tt.concurrency)
40+
judge := NewJudge(nil, tt.concurrency)
4141
assert.Equal(t, tt.expectedConcurrency, judge.concurrency)
4242
})
4343
}
@@ -46,7 +46,7 @@ func TestNewJudge(t *testing.T) {
4646
func TestJudge_CheckRelevance_EmptyCriteria(t *testing.T) {
4747
t.Parallel()
4848

49-
judge := NewJudge(nil, nil, 1)
49+
judge := NewJudge(nil, 1)
5050
passed, failed, err := judge.CheckRelevance(t.Context(), "some response", nil)
5151

5252
assert.Equal(t, 0, passed)
@@ -57,7 +57,7 @@ func TestJudge_CheckRelevance_EmptyCriteria(t *testing.T) {
5757
func TestJudge_CheckRelevance_ContextCanceled(t *testing.T) {
5858
t.Parallel()
5959

60-
judge := NewJudge(nil, nil, 2)
60+
judge := NewJudge(nil, 2)
6161

6262
ctx, cancel := context.WithCancel(t.Context())
6363
cancel() // Cancel immediately

0 commit comments

Comments
 (0)