Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 151 additions & 1 deletion js/ragas.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import { expect, test } from "vitest";
import { http, HttpResponse } from "msw";
import { setupServer } from "msw/node";
import { OpenAI } from "openai";
import { afterAll, afterEach, beforeAll, describe, expect, test } from "vitest";
import {
AnswerCorrectness,
AnswerRelevancy,
Expand All @@ -9,6 +12,7 @@ import {
ContextRelevancy,
Faithfulness,
} from "./ragas";
import { init } from "./oai";

const data = {
input: "Can starred docs from different workspaces be accessed in one place?",
Expand Down Expand Up @@ -84,3 +88,149 @@ test("Ragas end-to-end test", async () => {
}
}
}, 600000);

// Tests for ContextRelevancy score clamping (#80)
describe("ContextRelevancy score clamping", () => {
const server = setupServer();

beforeAll(() => {
server.listen({
onUnhandledRequest: (req) => {
throw new Error(`Unhandled request ${req.method}, ${req.url}`);
},
});
});

afterEach(() => {
server.resetHandlers();
init();
});

afterAll(() => {
server.close();
});

test("clamps score to 1.0 when LLM returns sentences longer than context", async () => {
// Mock response where extracted sentences are LONGER than the context
// This would produce a raw score > 1.0 without clamping
server.use(
http.post("https://api.openai.com/v1/chat/completions", () => {
return HttpResponse.json({
id: "chatcmpl-test",
object: "chat.completion",
created: Date.now(),
model: "gpt-4o",
choices: [
{
index: 0,
message: {
role: "assistant",
content: null,
tool_calls: [
{
id: "call_test",
type: "function",
function: {
name: "extract_sentences",
arguments: JSON.stringify({
sentences: [
{
sentence:
"Hello world, this is a much longer sentence than the original context that was provided",
reasons: ["This is a test reason"],
},
],
}),
},
},
],
},
finish_reason: "tool_calls",
},
],
usage: { prompt_tokens: 10, completion_tokens: 20, total_tokens: 30 },
});
}),
);

init({
client: new OpenAI({
apiKey: "test-api-key",
baseURL: "https://api.openai.com/v1",
}),
});

// Short context that would cause score > 1.0 without clamping
const result = await ContextRelevancy({
input: "What is hello?",
output: "Hello world",
context: "Hello world", // 11 chars, but mock returns 88 chars
});

// Score should be clamped to 1.0, not exceed it
expect(result.score).toBe(1);
expect(result.score).toBeLessThanOrEqual(1);
expect(result.score).toBeGreaterThanOrEqual(0);
});

test("returns expected score for normal case", async () => {
const context =
"Hello world, this is a test context with some content that is reasonably long.";

// Mock response where extracted sentences are shorter than the context
server.use(
http.post("https://api.openai.com/v1/chat/completions", () => {
return HttpResponse.json({
id: "chatcmpl-test",
object: "chat.completion",
created: Date.now(),
model: "gpt-4o",
choices: [
{
index: 0,
message: {
role: "assistant",
content: null,
tool_calls: [
{
id: "call_test",
type: "function",
function: {
name: "extract_sentences",
arguments: JSON.stringify({
sentences: [
{ sentence: "Hello world", reasons: ["Test reason"] },
],
}),
},
},
],
},
finish_reason: "tool_calls",
},
],
usage: { prompt_tokens: 10, completion_tokens: 20, total_tokens: 30 },
});
}),
);

init({
client: new OpenAI({
apiKey: "test-api-key",
baseURL: "https://api.openai.com/v1",
}),
});

const result = await ContextRelevancy({
input: "What is hello?",
output: "Hello world",
context,
});

// Score should be len("Hello world") / len(context) = 11 / 79 ≈ 0.139
const expectedScore = "Hello world".length / context.length;
expect(result.score).toBeCloseTo(expectedScore, 2);
expect(result.score).toBeLessThanOrEqual(1);
expect(result.score).toBeGreaterThanOrEqual(0);
});
});
9 changes: 6 additions & 3 deletions js/ragas.ts
Original file line number Diff line number Diff line change
Expand Up @@ -183,11 +183,14 @@ export const ContextRelevancy: ScorerWithPartial<string, RagasArgs> =
});

const sentences = relevantSentencesSchema.parse(mustParseArgs(response));
// Clamp score to [0, 1] - the LLM may return sentences longer than the
// original context due to paraphrasing or hallucination (#80)
const rawScore =
sentences.sentences.map((s) => s.sentence).join("").length /
context.length;
return {
name: "ContextRelevancy",
score:
sentences.sentences.map((s) => s.sentence).join("").length /
context.length,
score: Math.min(Math.max(rawScore, 0), 1),
metadata: {
relevantSentences: sentences.sentences,
},
Expand Down
6 changes: 4 additions & 2 deletions py/autoevals/ragas.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,10 +320,12 @@ def __init__(self, pairwise_scorer=None, model=DEFAULT_RAGAS_MODEL, client: Clie
def _postprocess(self, context, response):
sentences = json.loads(response["choices"][0]["message"]["tool_calls"][0]["function"]["arguments"])

# Clamp score to [0, 1] - the LLM may return sentences longer than the
# original context due to paraphrasing or hallucination (#80)
raw_score = len("".join([s["sentence"] for s in sentences["sentences"]])) / len(context)
return Score(
name=self._name(),
# Simplify this by just using the string length, rather than the number of sentences.
score=len("".join([s["sentence"] for s in sentences["sentences"]])) / len(context),
score=min(max(raw_score, 0), 1),
metadata={
"relevant_sentences": sentences["sentences"],
},
Expand Down
75 changes: 75 additions & 0 deletions py/autoevals/test_ragas.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import asyncio
import json

import pytest

Expand Down Expand Up @@ -44,3 +45,77 @@ def test_ragas_retrieval(metric: OpenAILLMScorer, expected_score: float, is_asyn
pytest.xfail(f"Expected score {expected_score} but got {score}")
else:
raise e


def test_context_relevancy_score_clamping():
"""Test that ContextRelevancy clamps scores to [0, 1] range (#80).

When the LLM returns sentences longer than the original context
(due to paraphrasing or hallucination), the raw score would exceed 1.0.
This test verifies the score is properly clamped.
"""
scorer = ContextRelevancy()

# Short context
context = "Hello world"

# Mock response where extracted sentences are LONGER than the context
# This would produce a raw score > 1.0 without clamping
mock_response = {
"choices": [
{
"message": {
"tool_calls": [
{
"function": {
"arguments": json.dumps(
{
"sentences": [
{
"sentence": "Hello world, this is a much longer sentence than the original context"
}
]
}
)
}
}
]
}
}
]
}

result = scorer._postprocess(context, mock_response)

# Score should be clamped to 1.0, not exceed it
assert result.score == 1.0
assert result.score <= 1.0
assert result.score >= 0.0


def test_context_relevancy_score_normal_case():
"""Test that ContextRelevancy returns expected score for normal case."""
scorer = ContextRelevancy()

context = "Hello world, this is a test context with some content."

# Mock response where extracted sentences are shorter than the context
mock_response = {
"choices": [
{
"message": {
"tool_calls": [
{"function": {"arguments": json.dumps({"sentences": [{"sentence": "Hello world"}]})}}
]
}
}
]
}

result = scorer._postprocess(context, mock_response)

# Score should be len("Hello world") / len(context) = 11 / 54 ≈ 0.204
expected_score = len("Hello world") / len(context)
assert result.score == pytest.approx(expected_score, rel=1e-3)
assert result.score <= 1.0
assert result.score >= 0.0