Remove duplication around llm as a judge

dgageot · dgageot · commit f11de8a32211 · 2026-03-13T20:49:23.000+01:00
Signed-off-by: David Gageot &lt;david.gageot@docker.com&gt;
diff --git a/pkg/evaluation/eval.go b/pkg/evaluation/eval.go
@@ -49,7 +49,7 @@ type Runner struct {
 func newRunner(agentSource config.Source, runConfig *config.RuntimeConfig, judgeModel provider.Provider, cfg Config) *Runner {
 	var judge *Judge
 	if judgeModel != nil {
-		judge = NewJudge(judgeModel, runConfig, cfg.Concurrency)
+		judge = NewJudge(judgeModel, cfg.Concurrency)
 	}
 	return &Runner{
 		Config:      cfg,
@@ -626,6 +626,7 @@ func createJudgeModel(ctx context.Context, judgeModel string, runConfig *config.
 
 	opts := []options.Opt{
 		options.WithThinking(false),
+		options.WithStructuredOutput(judgeResponseSchema),
 	}
 	if runConfig.ModelsGateway != "" {
 		opts = append(opts, options.WithGateway(runConfig.ModelsGateway))
diff --git a/pkg/evaluation/judge.go b/pkg/evaluation/judge.go
@@ -11,10 +11,8 @@ import (
 	"sync"
 
 	"github.com/docker/docker-agent/pkg/chat"
-	"github.com/docker/docker-agent/pkg/config"
 	"github.com/docker/docker-agent/pkg/config/latest"
 	"github.com/docker/docker-agent/pkg/model/provider"
-	"github.com/docker/docker-agent/pkg/model/provider/options"
 )
 
 // relevancePrompt is the prompt template for the judge model to evaluate responses.
@@ -58,26 +56,17 @@ var judgeResponseSchema = &latest.StructuredOutput{
 // Judge runs LLM-as-a-judge relevance checks concurrently.
 type Judge struct {
 	model       provider.Provider
-	runConfig   *config.RuntimeConfig
 	concurrency int
-
-	// judgeWithSchema is a provider pre-configured with structured output.
-	// Created lazily on first use and reused across all relevance checks.
-	// Protected by judgeWithSchemaMu; only cached on success so that
-	// transient errors (e.g. context cancellation) can be retried.
-	judgeWithSchema   provider.Provider
-	judgeWithSchemaMu sync.Mutex
 }
 
 // NewJudge creates a new Judge that runs relevance checks with the given concurrency.
 // Concurrency defaults to 1 if n < 1.
-func NewJudge(model provider.Provider, runConfig *config.RuntimeConfig, concurrency int) *Judge {
+func NewJudge(model provider.Provider, concurrency int) *Judge {
 	if concurrency < 1 {
 		concurrency = 1
 	}
 	return &Judge{
 		model:       model,
-		runConfig:   runConfig,
 		concurrency: concurrency,
 	}
 }
@@ -180,48 +169,13 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []
 	return passed, failed, nil
 }
 
-// getOrCreateJudgeWithSchema returns a provider pre-configured with structured output.
-// The provider is created once and reused across all relevance checks.
-// Unlike sync.Once, transient failures (e.g. context cancellation) are not
-// cached, allowing subsequent calls to retry.
-func (j *Judge) getOrCreateJudgeWithSchema(ctx context.Context) (provider.Provider, error) {
-	j.judgeWithSchemaMu.Lock()
-	defer j.judgeWithSchemaMu.Unlock()
-
-	if j.judgeWithSchema != nil {
-		return j.judgeWithSchema, nil
-	}
-
-	opts := []options.Opt{
-		options.WithStructuredOutput(judgeResponseSchema),
-		options.WithThinking(false),
-	}
-	if j.runConfig.ModelsGateway != "" {
-		opts = append(opts, options.WithGateway(j.runConfig.ModelsGateway))
-	}
-
-	modelCfg := j.model.BaseConfig().ModelConfig
-	p, err := provider.New(ctx, &modelCfg, j.runConfig.EnvProvider(), opts...)
-	if err != nil {
-		return nil, err
-	}
-
-	j.judgeWithSchema = p
-	return j.judgeWithSchema, nil
-}
-
 // checkSingle checks a single relevance criterion against the response.
 // It returns whether the check passed, the reason provided by the judge, and any error.
 func (j *Judge) checkSingle(ctx context.Context, response, criterion string) (passed bool, reason string, err error) {
-	judgeWithSchema, err := j.getOrCreateJudgeWithSchema(ctx)
-	if err != nil {
-		return false, "", fmt.Errorf("creating judge provider with structured output: %w", err)
-	}
-
 	prompt := fmt.Sprintf(relevancePrompt, response, criterion)
 	messages := []chat.Message{{Role: chat.MessageRoleUser, Content: prompt}}
 
-	stream, err := judgeWithSchema.CreateChatCompletionStream(ctx, messages, nil)
+	stream, err := j.model.CreateChatCompletionStream(ctx, messages, nil)
 	if err != nil {
 		return false, "", fmt.Errorf("creating chat completion: %w", err)
 	}
diff --git a/pkg/evaluation/judge_test.go b/pkg/evaluation/judge_test.go
@@ -37,7 +37,7 @@ func TestNewJudge(t *testing.T) {
 		t.Run(tt.name, func(t *testing.T) {
 			t.Parallel()
 
-			judge := NewJudge(nil, nil, tt.concurrency)
+			judge := NewJudge(nil, tt.concurrency)
 			assert.Equal(t, tt.expectedConcurrency, judge.concurrency)
 		})
 	}
@@ -46,7 +46,7 @@ func TestNewJudge(t *testing.T) {
 func TestJudge_CheckRelevance_EmptyCriteria(t *testing.T) {
 	t.Parallel()
 
-	judge := NewJudge(nil, nil, 1)
+	judge := NewJudge(nil, 1)
 	passed, failed, err := judge.CheckRelevance(t.Context(), "some response", nil)
 
 	assert.Equal(t, 0, passed)
@@ -57,7 +57,7 @@ func TestJudge_CheckRelevance_EmptyCriteria(t *testing.T) {
 func TestJudge_CheckRelevance_ContextCanceled(t *testing.T) {
 	t.Parallel()
 
-	judge := NewJudge(nil, nil, 2)
+	judge := NewJudge(nil, 2)
 
 	ctx, cancel := context.WithCancel(t.Context())
 	cancel() // Cancel immediately