Skip to content

Commit 7623ff1

Browse files
authored
feat(eval): add evaluate_full_response option to rubric-based evaluation (#5316)
1 parent 4c0c6db commit 7623ff1

4 files changed

Lines changed: 102 additions & 2 deletions

File tree

src/google/adk/evaluation/eval_metrics.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -115,6 +115,19 @@ class BaseCriterion(BaseModel):
115115
description="The threshold to be used by the metric.",
116116
)
117117

118+
include_intermediate_responses_in_final: bool = Field(
119+
default=False,
120+
description=(
121+
"Whether to evaluate the full agent response including intermediate"
122+
" natural language text (e.g. text emitted before tool calls) in"
123+
" addition to the final response. By default, only the final"
124+
" response text is sent to the judge. When True, text from all"
125+
" intermediate invocation events is concatenated with the final"
126+
" response before evaluation. This is useful for agents that emit"
127+
" text both before and after tool calls within a single invocation."
128+
),
129+
)
130+
118131

119132
class LlmAsAJudgeCriterion(BaseCriterion):
120133
"""Criterion when using LLM-As-A-Judge metric."""

src/google/adk/evaluation/llm_as_judge_utils.py

Lines changed: 33 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,8 @@
2626
from .common import EvalBaseModel
2727
from .eval_case import get_all_tool_calls_with_responses
2828
from .eval_case import IntermediateDataType
29+
from .eval_case import Invocation
30+
from .eval_case import InvocationEvents
2931
from .eval_metrics import RubricScore
3032
from .evaluator import EvalStatus
3133

@@ -44,8 +46,38 @@ class Label(enum.Enum):
4446

4547

4648
def get_text_from_content(
47-
content: Optional[genai_types.Content],
49+
content: Optional[Union[genai_types.Content, Invocation]],
50+
*,
51+
include_intermediate_responses_in_final: bool = False,
4852
) -> Optional[str]:
53+
"""Extracts text from a `Content` or an `Invocation`.
54+
55+
When `content` is a `Content`, returns the concatenated text of its parts.
56+
57+
When `content` is an `Invocation`, returns the text of the invocation's final
58+
response. If `include_intermediate_responses_in_final` is True, text from
59+
intermediate invocation events (e.g. natural language emitted before tool
60+
calls) is concatenated with the final response text.
61+
"""
62+
if isinstance(content, Invocation):
63+
if not include_intermediate_responses_in_final:
64+
# Flag off: revert to basic plain-Content behavior.
65+
return get_text_from_content(content.final_response)
66+
67+
parts: list[str] = []
68+
if isinstance(content.intermediate_data, InvocationEvents):
69+
# Walk intermediate events in order; collect text parts.
70+
for event in content.intermediate_data.invocation_events:
71+
text = get_text_from_content(event.content)
72+
if text:
73+
parts.append(text)
74+
# Then fetch the final response text and append it to the end.
75+
final_text = get_text_from_content(content.final_response)
76+
if final_text:
77+
parts.append(final_text)
78+
79+
return "\n".join(parts) if parts else None
80+
4981
if content and content.parts:
5082
return "\n".join([p.text for p in content.parts if p.text])
5183

src/google/adk/evaluation/rubric_based_final_response_quality_v1.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -274,7 +274,18 @@ def format_auto_rater_prompt(
274274
"""Returns the autorater prompt."""
275275
self.create_effective_rubrics_list(actual_invocation.rubrics)
276276
user_input = get_text_from_content(actual_invocation.user_content)
277-
final_response = get_text_from_content(actual_invocation.final_response)
277+
278+
criterion = self._eval_metric.criterion
279+
include_intermediate = getattr(
280+
criterion, "include_intermediate_responses_in_final", False
281+
)
282+
final_response = (
283+
get_text_from_content(
284+
actual_invocation,
285+
include_intermediate_responses_in_final=include_intermediate,
286+
)
287+
or ""
288+
)
278289

279290
rubrics_text = "\n".join([
280291
f"* {r.rubric_content.text_property}"

tests/unittests/evaluation/test_llm_as_judge_utils.py

Lines changed: 44 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
from google.adk.evaluation.app_details import AgentDetails
2020
from google.adk.evaluation.app_details import AppDetails
2121
from google.adk.evaluation.eval_case import IntermediateData
22+
from google.adk.evaluation.eval_case import Invocation
2223
from google.adk.evaluation.eval_case import InvocationEvent
2324
from google.adk.evaluation.eval_case import InvocationEvents
2425
from google.adk.evaluation.eval_rubrics import RubricScore
@@ -88,6 +89,49 @@ def test_get_text_from_content_with_mixed_parts():
8889
assert get_text_from_content(content) == "Hello\nWorld"
8990

9091

92+
def test_get_text_from_content_with_invocation_include_intermediate_responses_in_final():
93+
"""Tests get_text_from_content on an Invocation with and without the flag."""
94+
intermediate_text = "Let me check."
95+
final_response_text = "Done."
96+
invocation = Invocation(
97+
user_content=genai_types.Content(parts=[genai_types.Part(text="user")]),
98+
intermediate_data=InvocationEvents(
99+
invocation_events=[
100+
InvocationEvent(
101+
author="agent",
102+
content=genai_types.Content(
103+
parts=[genai_types.Part(text=intermediate_text)]
104+
),
105+
),
106+
InvocationEvent(
107+
author="tool",
108+
content=genai_types.Content(
109+
parts=[
110+
genai_types.Part(
111+
function_call=genai_types.FunctionCall(name="t")
112+
)
113+
]
114+
),
115+
),
116+
]
117+
),
118+
final_response=genai_types.Content(
119+
parts=[genai_types.Part(text=final_response_text)]
120+
),
121+
)
122+
123+
# Flag off (default): only the final response text is returned.
124+
assert get_text_from_content(invocation) == final_response_text
125+
126+
# Flag on: intermediate text is concatenated before the final response.
127+
assert (
128+
get_text_from_content(
129+
invocation, include_intermediate_responses_in_final=True
130+
)
131+
== f"{intermediate_text}\n{final_response_text}"
132+
)
133+
134+
91135
def test_get_eval_status_with_none_score():
92136
"""Tests get_eval_status returns NOT_EVALUATED for a None score."""
93137
assert get_eval_status(score=None, threshold=0.5) == EvalStatus.NOT_EVALUATED

0 commit comments

Comments (0)