Skip to content

Commit ba7530a

Browse files
authored
Merge pull request #1706 from dgageot/llm-judge
Improve error handling and logging in evaluation judge
2 parents d800706 + 257a597 commit ba7530a

2 files changed

Lines changed: 57 additions & 25 deletions

File tree

pkg/evaluation/eval_test.go

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -156,23 +156,29 @@ func TestParseJudgeResponse(t *testing.T) {
156156
text string
157157
wantPassed bool
158158
wantReason string
159+
wantErr bool
159160
}{
160-
{"simple pass", `{"result": "pass", "reason": "good"}`, true, "good"},
161-
{"simple fail", `{"result": "fail", "reason": "bad"}`, false, "bad"},
162-
{"pass uppercase", `{"result": "PASS", "reason": "good"}`, true, "good"},
163-
{"fail uppercase", `{"result": "FAIL", "reason": "bad"}`, false, "bad"},
164-
{"pass mixed case", `{"result": "Pass", "reason": "good"}`, true, "good"},
165-
{"invalid json returns false", `not json at all`, false, "failed to parse judge response"},
166-
{"empty result returns false", `{"result": "", "reason": "empty"}`, false, "empty"},
167-
{"missing result field", `{"reason": "no result field"}`, false, "no result field"},
161+
{"simple pass", `{"result": "pass", "reason": "good"}`, true, "good", false},
162+
{"simple fail", `{"result": "fail", "reason": "bad"}`, false, "bad", false},
163+
{"pass uppercase", `{"result": "PASS", "reason": "good"}`, true, "good", false},
164+
{"fail uppercase", `{"result": "FAIL", "reason": "bad"}`, false, "bad", false},
165+
{"pass mixed case", `{"result": "Pass", "reason": "good"}`, true, "good", false},
166+
{"invalid json returns error", `not json at all`, false, "", true},
167+
{"empty result returns false", `{"result": "", "reason": "empty"}`, false, "empty", false},
168+
{"missing result field", `{"reason": "no result field"}`, false, "no result field", false},
168169
}
169170

170171
for _, tt := range tests {
171172
t.Run(tt.name, func(t *testing.T) {
172173
t.Parallel()
173-
got := parseJudgeResponse(tt.text)
174-
assert.Equal(t, tt.wantPassed, got.passed)
175-
assert.Equal(t, tt.wantReason, got.reason)
174+
passed, reason, err := parseJudgeResponse(tt.text)
175+
if tt.wantErr {
176+
require.Error(t, err)
177+
} else {
178+
require.NoError(t, err)
179+
assert.Equal(t, tt.wantPassed, passed)
180+
assert.Equal(t, tt.wantReason, reason)
181+
}
176182
})
177183
}
178184
}

pkg/evaluation/judge.go

Lines changed: 40 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,10 @@ package evaluation
33
import (
44
"context"
55
"encoding/json"
6+
"errors"
67
"fmt"
8+
"io"
9+
"log/slog"
710
"strings"
811
"sync"
912

@@ -164,18 +167,42 @@ func (j *Judge) checkSingle(ctx context.Context, response, criterion string) (pa
164167
defer stream.Close()
165168

166169
var fullResponse strings.Builder
170+
var streamErr error
167171
for {
168172
resp, err := stream.Recv()
169173
if err != nil {
174+
if !errors.Is(err, io.EOF) {
175+
streamErr = err
176+
}
170177
break
171178
}
172179
for _, choice := range resp.Choices {
173180
fullResponse.WriteString(choice.Delta.Content)
174181
}
175182
}
176183

177-
result := parseJudgeResponse(fullResponse.String())
178-
return result.passed, result.reason, nil
184+
if streamErr != nil {
185+
return false, "", fmt.Errorf("streaming judge response: %w", streamErr)
186+
}
187+
188+
raw := fullResponse.String()
189+
passed, reason, err = parseJudgeResponse(raw)
190+
if err != nil {
191+
slog.Warn("Failed to parse judge response",
192+
"criterion", criterion,
193+
"raw_response", raw,
194+
"error", err,
195+
)
196+
return false, "", fmt.Errorf("parsing judge response (length=%d): %w", len(raw), err)
197+
}
198+
199+
slog.Debug("Judge response parsed successfully",
200+
"criterion", criterion,
201+
"passed", passed,
202+
"reason", reason,
203+
)
204+
205+
return passed, reason, nil
179206
}
180207

181208
// judgeResponse represents the structured response from the judge model.
@@ -184,23 +211,22 @@ type judgeResponse struct {
184211
Reason string `json:"reason"`
185212
}
186213

187-
// parsedJudgeResult contains the parsed result from the judge.
188-
type parsedJudgeResult struct {
189-
passed bool
190-
reason string
191-
}
192-
193-
func parseJudgeResponse(text string) parsedJudgeResult {
214+
// parseJudgeResponse parses a JSON judge response and returns whether the check
215+
// passed, the reason, and any parse error.
216+
func parseJudgeResponse(text string) (passed bool, reason string, err error) {
194217
text = strings.TrimSpace(text)
195218

196219
var resp judgeResponse
197220
if err := json.Unmarshal([]byte(text), &resp); err != nil {
198-
// With structured output this should not happen, but handle gracefully
199-
return parsedJudgeResult{passed: false, reason: "failed to parse judge response"}
221+
return false, "", fmt.Errorf("invalid JSON: %w", err)
200222
}
201223

202-
return parsedJudgeResult{
203-
passed: strings.EqualFold(resp.Result, "pass"),
204-
reason: resp.Reason,
224+
if resp.Result == "" {
225+
slog.Warn("Judge response has empty result field",
226+
"raw_response", text,
227+
"reason_field", resp.Reason,
228+
)
205229
}
230+
231+
return strings.EqualFold(resp.Result, "pass"), resp.Reason, nil
206232
}

0 commit comments

Comments
 (0)