@@ -82,15 +82,41 @@ func NewJudge(model provider.Provider, runConfig *config.RuntimeConfig, concurre
8282 }
8383}
8484
85+ // Validate performs an end-to-end check of the judge model by sending a
86+ // trivial relevance prompt and verifying the response is valid structured
87+ // JSON. This catches configuration errors (bad API key, unsupported model,
88+ // missing structured-output support, etc.) before running any evaluations,
89+ // allowing the framework to fail fast.
90+ func (j * Judge ) Validate (ctx context.Context ) error {
91+ const (
92+ testResponse = "The sky is blue."
93+ testCriterion = "The response mentions a color."
94+ )
95+
96+ passed , _ , err := j .checkSingle (ctx , testResponse , testCriterion )
97+ if err != nil {
98+ return fmt .Errorf ("judge model validation failed: %w" , err )
99+ }
100+
101+ if ! passed {
102+ return errors .New ("judge model validation failed: expected the test criterion to pass but the judge returned 'fail'" )
103+ }
104+
105+ return nil
106+ }
107+
// RelevanceResult describes the outcome of one relevance check. It is
// serialized to JSON with lower-case keys.
type RelevanceResult struct {
	// Criterion is the criterion this result refers to (JSON key "criterion").
	Criterion string `json:"criterion"`
	// Reason is the explanation attached to the result (JSON key "reason");
	// presumably the judge's justification — confirm against checkSingle.
	Reason string `json:"reason"`
}
90113
91114// CheckRelevance runs all relevance checks concurrently with the configured concurrency.
92- // It returns the number of passed checks, a slice of failed results with reasons, and any errors encountered.
93- func (j * Judge ) CheckRelevance (ctx context.Context , response string , criteria []string ) (passed int , failed []RelevanceResult , errs []string ) {
115+ // It returns the number of passed checks, a slice of failed results with reasons, and an error
116+ // if any check encountered an error (e.g. judge model misconfiguration). Errors cause a hard
117+ // failure so that configuration issues are surfaced immediately rather than silently producing
118+ // zero-relevance results.
119+ func (j * Judge ) CheckRelevance (ctx context.Context , response string , criteria []string ) (passed int , failed []RelevanceResult , err error ) {
94120 if len (criteria ) == 0 {
95121 return 0 , nil , nil
96122 }
@@ -122,17 +148,19 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []
122148 results [item .index ] = result {err : fmt .Errorf ("context cancelled: %w" , ctx .Err ())}
123149 continue
124150 }
125- pass , reason , err := j .checkSingle (ctx , response , item .criterion )
126- results [item .index ] = result {passed : pass , reason : reason , err : err }
151+ pass , reason , checkErr := j .checkSingle (ctx , response , item .criterion )
152+ results [item .index ] = result {passed : pass , reason : reason , err : checkErr }
127153 }
128154 })
129155 }
130156 wg .Wait ()
131157
132- // Aggregate results
158+ // Aggregate results. Any error is fatal: errors from all checks are collected
159+ // and returned together so the caller can fail fast on judge misconfiguration.
160+ var errs []error
133161 for i , r := range results {
134162 if r .err != nil {
135- errs = append (errs , fmt .Sprintf ( "error checking %q: %v " , criteria [i ], r .err ))
163+ errs = append (errs , fmt .Errorf ( " checking %q: %w " , criteria [i ], r .err ))
136164 continue
137165 }
138166 if r .passed {
@@ -145,7 +173,11 @@ func (j *Judge) CheckRelevance(ctx context.Context, response string, criteria []
145173 }
146174 }
147175
148- return passed , failed , errs
176+ if len (errs ) > 0 {
177+ return passed , failed , errors .Join (errs ... )
178+ }
179+
180+ return passed , failed , nil
149181}
150182
151183// getOrCreateJudgeWithSchema returns a provider pre-configured with structured output.
0 commit comments