Skip to content

Commit 7623ff1

Browse files
authored
feat(eval): add evaluate_full_response option to rubric-based evaluation (#5316)
1 parent 4c0c6db commit 7623ff1

4 files changed

Lines changed: 102 additions & 2 deletions

File tree

src/google/adk/evaluation/eval_metrics.py

Lines changed: 13 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -115,6 +115,19 @@ class BaseCriterion(BaseModel):
115115
description="The threshold to be used by the metric.",
116116
)
117117

118+
include_intermediate_responses_in_final: bool = Field(
119+
default=False,
120+
description=(
121+
"Whether to evaluate the full agent response including intermediate"
122+
" natural language text (e.g. text emitted before tool calls) in"
123+
" addition to the final response. By default, only the final"
124+
" response text is sent to the judge. When True, text from all"
125+
" intermediate invocation events is concatenated with the final"
126+
" response before evaluation. This is useful for agents that emit"
127+
" text both before and after tool calls within a single invocation."
128+
),
129+
)
130+
118131

119132
class LlmAsAJudgeCriterion(BaseCriterion):
120133
"""Criterion when using LLM-As-A-Judge metric."""

src/google/adk/evaluation/llm_as_judge_utils.py

Lines changed: 33 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,8 @@
2626
from .common import EvalBaseModel
2727
from .eval_case import get_all_tool_calls_with_responses
2828
from .eval_case import IntermediateDataType
29+
from .eval_case import Invocation
30+
from .eval_case import InvocationEvents
2931
from .eval_metrics import RubricScore
3032
from .evaluator import EvalStatus
3133

@@ -44,8 +46,38 @@ class Label(enum.Enum):
4446

4547

4648
def get_text_from_content(
47-
content: Optional[genai_types.Content],
49+
content: Optional[Union[genai_types.Content, Invocation]],
50+
*,
51+
include_intermediate_responses_in_final: bool = False,
4852
) -> Optional[str]:
53+
"""Extracts text from a `Content` or an `Invocation`.
54+
55+
When `content` is a `Content`, returns the concatenated text of its parts.
56+
57+
When `content` is an `Invocation`, returns the text of the invocation's final
58+
response. If `include_intermediate_responses_in_final` is True, text from
59+
intermediate invocation events (e.g. natural language emitted before tool
60+
calls) is concatenated with the final response text.
61+
"""
62+
if isinstance(content, Invocation):
63+
if not include_intermediate_responses_in_final:
64+
# Flag off: revert to basic plain-Content behavior.
65+
return get_text_from_content(content.final_response)
66+
67+
parts: list[str] = []
68+
if isinstance(content.intermediate_data, InvocationEvents):
69+
# Walk intermediate events in order; collect text parts.
70+
for event in content.intermediate_data.invocation_events:
71+
text = get_text_from_content(event.content)
72+
if text:
73+
parts.append(text)
74+
# Then fetch the final response text and append it to the end.
75+
final_text = get_text_from_content(content.final_response)
76+
if final_text:
77+
parts.append(final_text)
78+
79+
return "\n".join(parts) if parts else None
80+
4981
if content and content.parts:
5082
return "\n".join([p.text for p in content.parts if p.text])
5183

src/google/adk/evaluation/rubric_based_final_response_quality_v1.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -274,7 +274,18 @@ def format_auto_rater_prompt(
274274
"""Returns the autorater prompt."""
275275
self.create_effective_rubrics_list(actual_invocation.rubrics)
276276
user_input = get_text_from_content(actual_invocation.user_content)
277-
final_response = get_text_from_content(actual_invocation.final_response)
277+
278+
criterion = self._eval_metric.criterion
279+
include_intermediate = getattr(
280+
criterion, "include_intermediate_responses_in_final", False
281+
)
282+
final_response = (
283+
get_text_from_content(
284+
actual_invocation,
285+
include_intermediate_responses_in_final=include_intermediate,
286+
)
287+
or ""
288+
)
278289

279290
rubrics_text = "\n".join([
280291
f"* {r.rubric_content.text_property}"

tests/unittests/evaluation/test_llm_as_judge_utils.py

Lines changed: 44 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
from google.adk.evaluation.app_details import AgentDetails
2020
from google.adk.evaluation.app_details import AppDetails
2121
from google.adk.evaluation.eval_case import IntermediateData
22+
from google.adk.evaluation.eval_case import Invocation
2223
from google.adk.evaluation.eval_case import InvocationEvent
2324
from google.adk.evaluation.eval_case import InvocationEvents
2425
from google.adk.evaluation.eval_rubrics import RubricScore
@@ -88,6 +89,49 @@ def test_get_text_from_content_with_mixed_parts():
8889
assert get_text_from_content(content) == "Hello\nWorld"
8990

9091

92+
def test_get_text_from_content_with_invocation_include_intermediate_responses_in_final():
93+
"""Tests get_text_from_content on an Invocation with and without the flag."""
94+
intermediate_text = "Let me check."
95+
final_response_text = "Done."
96+
invocation = Invocation(
97+
user_content=genai_types.Content(parts=[genai_types.Part(text="user")]),
98+
intermediate_data=InvocationEvents(
99+
invocation_events=[
100+
InvocationEvent(
101+
author="agent",
102+
content=genai_types.Content(
103+
parts=[genai_types.Part(text=intermediate_text)]
104+
),
105+
),
106+
InvocationEvent(
107+
author="tool",
108+
content=genai_types.Content(
109+
parts=[
110+
genai_types.Part(
111+
function_call=genai_types.FunctionCall(name="t")
112+
)
113+
]
114+
),
115+
),
116+
]
117+
),
118+
final_response=genai_types.Content(
119+
parts=[genai_types.Part(text=final_response_text)]
120+
),
121+
)
122+
123+
# Flag off (default): only the final response text is returned.
124+
assert get_text_from_content(invocation) == final_response_text
125+
126+
# Flag on: intermediate text is concatenated before the final response.
127+
assert (
128+
get_text_from_content(
129+
invocation, include_intermediate_responses_in_final=True
130+
)
131+
== f"{intermediate_text}\n{final_response_text}"
132+
)
133+
134+
91135
def test_get_eval_status_with_none_score():
92136
"""Tests get_eval_status returns NOT_EVALUATED for a None score."""
93137
assert get_eval_status(score=None, threshold=0.5) == EvalStatus.NOT_EVALUATED

0 commit comments

Comments (0)