Commit 10f9073

gurjot-05 and claude committed
fix: handle empty response retry for streaming+thinking and no-event cases
The original fix only retried when is_final_response() was True with empty content. This missed two scenarios observed in production: 1. Streaming + thinking: model streams thought chunks (partial=True) then stops with no text — the LiteLLM adapter dropped the response entirely, and the loop broke on last_event.partial without retry. 2. No events at all: model returned content=None which was filtered by _postprocess_async, leaving last_event=None — loop broke immediately. Changes: - lite_llm.py: Add fallback after streaming loop to yield an explicit empty non-partial LlmResponse when nothing was finalized, so downstream retry logic can detect and handle it. - base_llm_flow.py: Restructure run_async() to check for empty responses (None, partial+empty, final+empty) before normal termination, enabling retry across all three scenarios. - Update existing test for new retry-on-None behavior. - Add 12 comprehensive scenario tests covering all cases. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3acd2ac commit 10f9073

4 files changed

Lines changed: 554 additions & 37 deletions


src/google/adk/flows/llm_flows/base_llm_flow.py

Lines changed: 49 additions & 30 deletions
@@ -788,39 +788,58 @@ async def run_async(
       async for event in agen:
         last_event = event
         yield event
-      if not last_event or last_event.partial:
+
+      # Determine if the model returned an empty / useless response that
+      # should be retried. Three cases:
+      # 1. No event at all (model/adapter yielded nothing)
+      # 2. Last event is partial with no meaningful content (streaming +
+      #    thinking: only thought chunks arrived, no final response)
+      # 3. Last event is a final response with no meaningful content
+      #    (non-streaming empty response, or streaming empty aggregated)
+      is_empty_response = False
+      if not last_event:
+        is_empty_response = True
+      elif last_event.partial and not _has_meaningful_content(last_event):
+        is_empty_response = True
+      elif (
+          last_event.is_final_response()
+          and not _has_meaningful_content(last_event)
+          and last_event.author == invocation_context.agent.name
+      ):
+        is_empty_response = True
+
+      if (
+          is_empty_response
+          and empty_response_count < _MAX_EMPTY_RESPONSE_RETRIES
+      ):
+        empty_response_count += 1
+        logger.warning(
+            'Model returned an empty response (attempt %d/%d),'
+            ' injecting resume message and re-prompting.',
+            empty_response_count,
+            _MAX_EMPTY_RESPONSE_RETRIES,
+        )
+        # Inject a resume nudge into the session so the next LLM call
+        # sees it in its context and is more likely to continue.
+        resume_event = Event(
+            invocation_id=invocation_context.invocation_id,
+            author='user',
+            branch=invocation_context.branch,
+            content=types.Content(
+                role='user',
+                parts=[
+                    types.Part.from_text(text=_EMPTY_RESPONSE_RESUME_MESSAGE)
+                ],
+            ),
+        )
+        yield resume_event
+        continue
+
+      # Normal termination conditions.
+      if not last_event or last_event.is_final_response() or last_event.partial:
         if last_event and last_event.partial:
           logger.warning('The last event is partial, which is not expected.')
         break
-      if last_event.is_final_response():
-        if (
-            not _has_meaningful_content(last_event)
-            and last_event.author == invocation_context.agent.name
-            and empty_response_count < _MAX_EMPTY_RESPONSE_RETRIES
-        ):
-          empty_response_count += 1
-          logger.warning(
-              'Model returned an empty response (attempt %d/%d),'
-              ' injecting resume message and re-prompting.',
-              empty_response_count,
-              _MAX_EMPTY_RESPONSE_RETRIES,
-          )
-          # Inject a resume nudge into the session so the next LLM call
-          # sees it in its context and is more likely to continue.
-          resume_event = Event(
-              invocation_id=invocation_context.invocation_id,
-              author='user',
-              branch=invocation_context.branch,
-              content=types.Content(
-                  role='user',
-                  parts=[
-                      types.Part.from_text(text=_EMPTY_RESPONSE_RESUME_MESSAGE)
-                  ],
-              ),
-          )
-          yield resume_event
-          continue
-        break

   async def _run_one_step_async(
       self,
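The retry check relies on _has_meaningful_content, which this hunk does not show. A minimal sketch of what such a helper plausibly looks like, assuming thought-only and empty parts do not count as meaningful (the actual implementation in base_llm_flow.py may differ):

from google.adk.events.event import Event


def _has_meaningful_content(event: Event) -> bool:
  # An event counts as meaningful if any part carries visible text,
  # a function call, or a function response. Thought-only parts
  # (reasoning traces) and empty parts do not count.
  if not event.content or not event.content.parts:
    return False
  for part in event.content.parts:
    if part.text and not part.thought:
      return True
    if part.function_call or part.function_response:
      return True
  return False

Under this reading, a thought-only partial event and an empty final event both fail the check, which is exactly what routes them into the retry path above.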

src/google/adk/models/lite_llm.py

Lines changed: 18 additions & 0 deletions
@@ -2235,6 +2235,7 @@ async def generate_content_async(
      aggregated_llm_response_with_tool_call = None
      usage_metadata = None
      fallback_index = 0
+      last_model_version = None

      def _finalize_tool_call_response(
          *, model_version: str, finish_reason: str
@@ -2319,6 +2320,7 @@ def _reset_stream_buffers() -> None:
        function_calls.clear()

      async for part in await self.llm_client.acompletion(**completion_args):
+        last_model_version = part.model
        for chunk, finish_reason in _model_response_to_chunk(part):
          if isinstance(chunk, FunctionChunk):
            index = chunk.index or fallback_index
@@ -2413,6 +2415,22 @@ def _reset_stream_buffers() -> None:
            )
            _reset_stream_buffers()

+      # Fallback: if the model produced no meaningful output at all (no text,
+      # no reasoning, no tool calls), yield an explicit empty non-partial
+      # response so that downstream retry logic in run_async() can detect it
+      # and re-prompt instead of silently halting.
+      if (
+          not aggregated_llm_response
+          and not aggregated_llm_response_with_tool_call
+          and last_model_version is not None
+      ):
+        aggregated_llm_response = LlmResponse(
+            content=types.Content(role="model", parts=[]),
+            partial=False,
+            finish_reason=_map_finish_reason("stop"),
+            model_version=last_model_version,
+        )
+
      # waiting until streaming ends to yield the llm_response as litellm tends
      # to send chunk that contains usage_metadata after the chunk with
      # finish_reason set to tool_calls or stop.
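The fallback is an instance of a general streaming pattern: a generator that can consume an entire stream without finalizing anything should emit an explicit empty sentinel rather than ending silently, so callers can distinguish "empty result" from "no result yet". A self-contained sketch of the pattern, with illustrative names not taken from lite_llm.py:

import asyncio
from typing import AsyncIterator


async def aggregate_stream(chunks: AsyncIterator[str]) -> AsyncIterator[str]:
  # Collect meaningful chunks; remember whether the stream ran at all.
  buffer = []
  saw_any_chunk = False
  async for chunk in chunks:
    saw_any_chunk = True
    if chunk.strip():
      buffer.append(chunk)
  if buffer:
    yield ''.join(buffer)
  elif saw_any_chunk:
    # Mirrors the lite_llm.py fallback: the stream ran but nothing was
    # finalized, so yield an explicit empty value instead of ending
    # silently. Callers can then detect the empty result and retry.
    yield ''


async def main() -> None:
  async def thought_only() -> AsyncIterator[str]:
    yield '   '  # a whitespace-only chunk, like a thought-only stream

  async for result in aggregate_stream(thought_only()):
    print(repr(result))  # prints '' rather than nothing


asyncio.run(main())

Guarding on last_model_version in the real code serves the same role as saw_any_chunk here: the empty sentinel is only emitted when the provider actually responded.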

tests/unittests/flows/llm_flows/test_base_llm_flow_partial_handling.py

Lines changed: 16 additions & 7 deletions
@@ -91,12 +91,18 @@ async def test_run_async_breaks_on_final_response():


 @pytest.mark.asyncio
-async def test_run_async_breaks_on_no_last_event():
-  """Test that run_async breaks when there is no last event."""
-  # Create a mock model that returns an empty response (no content)
-  empty_response = LlmResponse(content=None, partial=False)
+async def test_run_async_retries_then_breaks_on_no_last_event():
+  """Test that run_async retries when there is no last event, then breaks."""
+  # Create a mock model that returns empty responses (no content).
+  # Need enough responses to cover initial call + max retries.
+  from google.adk.flows.llm_flows.base_llm_flow import _MAX_EMPTY_RESPONSE_RETRIES

-  mock_model = testing_utils.MockModel.create(responses=[empty_response])
+  empty_responses = [
+      LlmResponse(content=None, partial=False)
+      for _ in range(_MAX_EMPTY_RESPONSE_RETRIES + 1)
+  ]
+
+  mock_model = testing_utils.MockModel.create(responses=empty_responses)

   agent = Agent(name='test_agent', model=mock_model)
   invocation_context = await testing_utils.create_invocation_context(
@@ -110,8 +116,11 @@ async def test_run_async_breaks_on_no_last_event():
   async for event in flow.run_async(invocation_context):
     events.append(event)

-  # Should have no events because empty responses are filtered out
-  assert len(events) == 0
+  # Should have resume events from retry attempts (one per retry).
+  # The empty LlmResponse has content=None, so _postprocess_async filters
+  # it out — no model events are yielded, only resume nudge events.
+  resume_events = [e for e in events if e.author == 'user']
+  assert len(resume_events) == _MAX_EMPTY_RESPONSE_RETRIES


 @pytest.mark.asyncio
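The 12 new scenario tests live in a file not shown in this view. As a hedged sketch of what the streaming + thinking case could look like, following the conventions of the test above; it assumes this file's existing imports (LlmResponse, types, Agent, testing_utils, and a SingleFlow-style flow), and the test name and mock responses are illustrative:

@pytest.mark.asyncio
async def test_run_async_retries_on_thought_only_partial_stream():
  """Sketch: a thought-only partial stream should hit the retry path
  instead of breaking on last_event.partial."""
  from google.adk.flows.llm_flows.base_llm_flow import _MAX_EMPTY_RESPONSE_RETRIES

  # A chunk with only reasoning content: partial=True, no visible text.
  thought_only = LlmResponse(
      content=types.Content(
          role='model',
          parts=[types.Part(text='thinking...', thought=True)],
      ),
      partial=True,
  )
  responses = [thought_only for _ in range(_MAX_EMPTY_RESPONSE_RETRIES + 1)]
  mock_model = testing_utils.MockModel.create(responses=responses)

  agent = Agent(name='test_agent', model=mock_model)
  invocation_context = await testing_utils.create_invocation_context(
      agent=agent
  )

  flow = SingleFlow()
  events = []
  async for event in flow.run_async(invocation_context):
    events.append(event)

  # One resume nudge (author='user') per retry attempt.
  resume_events = [e for e in events if e.author == 'user']
  assert len(resume_events) == _MAX_EMPTY_RESPONSE_RETRIES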
