From 43c1d79f355bf210a4e6bceff134d5e958f99a7d Mon Sep 17 00:00:00 2001
From: ElegantLin <elegant.lin21@gmail.com>
Date: Sat, 13 Jun 2026 17:32:06 +0000
Subject: [PATCH 1/2] test: pin run-level Bedrock thinking-effort end-to-end
 (Docker/Daytona parity) (#599)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#599 reported that the old host BedrockProxyServer stored per-run env but the
Bedrock translators resolved thinking effort from process-global os.environ, so
Docker silently fell back to `high` while Daytona honored a run-level
BENCHFLOW_BEDROCK_THINKING_EFFORT. PR #613 deleted that proxy (bedrock_proxy.py
/ bedrock_runtime.py no longer exist); the LiteLLM runtime now sources effort
from the run-level agent env via two run-aware paths — route config
(reasoning_effort baked into config.yaml from agent_env) and the proxy-process
env (launched as os.environ + agent_env). The original bug is therefore no
longer reachable, but no test pinned the run-level behavior end to end.

Adds wire-level regression coverage (the issue's actual ask — assert
output_config.effort, not just route params) by driving the REAL litellm
Bedrock Converse transform with the benchflow patch applied:

- route-config effort lands in the wire payload with host os.environ empty
  (sourced from the run, not the host process);
- a run-level value in the proxy-process env overrides a stale-default route
  effort in the wire payload — the exact divergence the old Docker translator
  got wrong (verified to FAIL when the patch override is neutered);
- Docker and Daytona resolve identical effort from the same agent_env, so
  neither lane can diverge.

No production change — the architecture already behaves correctly; this guards
against the regression returning. Uses `medium` (litellm-accepted, distinct
from the `high` default) so a silent fallback fails the assertion.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/test_bedrock_thinking.py | 101 +++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)

diff --git a/tests/test_bedrock_thinking.py b/tests/test_bedrock_thinking.py
index 6b860e91e..19950e7c1 100644
--- a/tests/test_bedrock_thinking.py
+++ b/tests/test_bedrock_thinking.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import os
+
 import pytest
 
 from benchflow.providers.litellm_bedrock_patch import (
@@ -75,3 +77,102 @@ def test_bedrock_thinking_effort_defaults_to_high_and_rejects_garbage():
         {**base_env, BEDROCK_THINKING_EFFORT_ENV: "turbo"},
     )
     assert garbage_route.litellm_params["reasoning_effort"] == "high"
+
+
+# --------------------------------------------------------------------------- #
+# End-to-end run-level effort -> Bedrock wire payload, Docker/Daytona parity   #
+# (issue #599: the old host BedrockProxyServer stored per-run env but the      #
+# translators read process-global os.environ, so Docker silently fell back to #
+# `high`. PR #613 deleted that proxy; this pins the run-level behavior in the  #
+# replacement LiteLLM runtime so the regression cannot return.)               #
+# --------------------------------------------------------------------------- #
+
+# Versioned Bedrock inference-profile ID — the real wire form. Stock litellm
+# 1.88.0rc1 does not classify it as adaptive-thinking; the bedrock patch's
+# anthropic gate does, which is why output_config.effort is emitted at all.
+_BEDROCK_OPUS_48_WIRE_MODEL = "bedrock/us.anthropic.claude-opus-4-8-20251101-v1:0"
+_BEDROCK_BASE_ENV = {"AWS_BEARER_TOKEN_BEDROCK": "token", "AWS_REGION": "us-west-2"}
+
+
+def _wire_output_config_effort(reasoning_effort: str, env_override: str | None) -> str:
+    """Run the REAL litellm Bedrock Converse transform (with the benchflow
+    patch applied) and return the ``output_config.effort`` it emits.
+
+    ``reasoning_effort`` models the value baked into config.yaml from the route
+    (``litellm_config`` reads it from the run-level agent env). ``env_override``
+    models the proxy *process* environment, which is launched as
+    ``os.environ + agent_env`` — so a run-level value reaches it too. Either
+    path landing the effort in the wire payload proves the #599 fix end to end.
+    """
+    from litellm.llms.bedrock.chat.converse_transformation import AmazonConverseConfig
+
+    import benchflow.providers.litellm_bedrock_patch  # noqa: F401 — applies patch
+
+    saved = os.environ.get(BEDROCK_THINKING_EFFORT_ENV)
+    os.environ.pop(BEDROCK_THINKING_EFFORT_ENV, None)
+    if env_override is not None:
+        os.environ[BEDROCK_THINKING_EFFORT_ENV] = env_override
+    try:
+        optional_params: dict = {}
+        AmazonConverseConfig()._handle_reasoning_effort_parameter(
+            _BEDROCK_OPUS_48_WIRE_MODEL, reasoning_effort, optional_params
+        )
+    finally:
+        os.environ.pop(BEDROCK_THINKING_EFFORT_ENV, None)
+        if saved is not None:
+            os.environ[BEDROCK_THINKING_EFFORT_ENV] = saved
+    cfg = optional_params.get("output_config")
+    assert isinstance(cfg, dict), (
+        f"adaptive-thinking output_config not emitted (patch inactive?): "
+        f"{optional_params!r}"
+    )
+    return cfg["effort"]
+
+
+def test_run_level_effort_from_route_lands_in_bedrock_wire_payload(monkeypatch):
+    """#599 end-to-end (route path): a run-level effort baked into config.yaml
+    reaches the Bedrock Converse ``output_config.effort`` — with the host
+    process env empty, proving it is sourced from the run, not os.environ.
+
+    Uses ``medium`` (litellm-accepted and distinct from the ``high`` default)
+    so the assertion fails if the effort silently falls back.
+    """
+    monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False)
+    assert _wire_output_config_effort("medium", env_override=None) == "medium"
+
+
+def test_run_level_effort_via_proxy_process_env_overrides_stale_route(monkeypatch):
+    """#599 end-to-end (proxy-env path): even if config.yaml carried a stale
+    default (``high``), the run-level value present in the proxy process env
+    (os.environ + agent_env) overrides it in the wire payload — this is the
+    exact divergence the old Docker translator got wrong."""
+    monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False)
+    assert _wire_output_config_effort("high", env_override="medium") == "medium"
+
+
+def test_docker_and_daytona_resolve_identical_bedrock_effort_from_run_env(monkeypatch):
+    """#599 parity: Docker (host proxy) and Daytona (sandbox proxy) build the
+    route from the SAME ``resolve_litellm_route(model, agent_env)`` call, so the
+    effort in config.yaml is identical and independent of the host's os.environ.
+
+    The old bug was Docker-only because a host-side translator read process
+    os.environ; here, with that env scrubbed, the run-level value still flows
+    through — so neither lane can diverge.
+    """
+    monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False)
+    run_env = {**_BEDROCK_BASE_ENV, BEDROCK_THINKING_EFFORT_ENV: "medium"}
+
+    # Identical resolution path for both execution environments.
+    docker_route = resolve_litellm_route(
+        "aws-bedrock/us.anthropic.claude-opus-4-8-20251101-v1:0", run_env
+    )
+    daytona_route = resolve_litellm_route(
+        "aws-bedrock/us.anthropic.claude-opus-4-8-20251101-v1:0", dict(run_env)
+    )
+    assert (
+        docker_route.litellm_params["reasoning_effort"]
+        == daytona_route.litellm_params["reasoning_effort"]
+        == "medium"
+    )
+    # And the host env being empty did not strip it — it came from the run env.
+    assert BEDROCK_THINKING_EFFORT_ENV not in os.environ

From 143011865af981231f547f92257e805d343efaca Mon Sep 17 00:00:00 2001
From: Bingran You <bingran.you@berkeley.edu>
Date: Mon, 15 Jun 2026 01:19:36 -0400
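Subject: [PATCH 2/2] test: document Bedrock effort regression guards

Reword the docstrings of the three run-level Bedrock effort tests so each
opens with a one-line summary of what it guards (PR #613 against the #599
regression), followed by the existing details. Docstrings only; no test
logic changes.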
---
 tests/test_bedrock_thinking.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/tests/test_bedrock_thinking.py b/tests/test_bedrock_thinking.py
index 19950e7c1..1061698fc 100644
--- a/tests/test_bedrock_thinking.py
+++ b/tests/test_bedrock_thinking.py
@@ -130,9 +130,11 @@ def _wire_output_config_effort(reasoning_effort: str, env_override: str | None)
 
 
 def test_run_level_effort_from_route_lands_in_bedrock_wire_payload(monkeypatch):
-    """#599 end-to-end (route path): a run-level effort baked into config.yaml
-    reaches the Bedrock Converse ``output_config.effort`` — with the host
-    process env empty, proving it is sourced from the run, not os.environ.
+    """Guards PR #613 against #599's run-env regression on the route path.
+
+    A run-level effort baked into config.yaml reaches the Bedrock Converse
+    ``output_config.effort`` with the host process env empty, proving it is
+    sourced from the run, not os.environ.
 
     Uses ``medium`` (litellm-accepted and distinct from the ``high`` default)
     so the assertion fails if the effort silently falls back.
@@ -142,18 +144,23 @@ def test_run_level_effort_from_route_lands_in_bedrock_wire_payload(monkeypatch):
 
 
 def test_run_level_effort_via_proxy_process_env_overrides_stale_route(monkeypatch):
-    """#599 end-to-end (proxy-env path): even if config.yaml carried a stale
-    default (``high``), the run-level value present in the proxy process env
-    (os.environ + agent_env) overrides it in the wire payload — this is the
-    exact divergence the old Docker translator got wrong."""
+    """Guards PR #613 against #599's run-env regression on the proxy-env path.
+
+    Even if config.yaml carried a stale default (``high``), the run-level value
+    present in the proxy process env (os.environ + agent_env) overrides it in
+    the wire payload, matching the divergence the old Docker translator got
+    wrong.
+    """
     monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False)
     assert _wire_output_config_effort("high", env_override="medium") == "medium"
 
 
 def test_docker_and_daytona_resolve_identical_bedrock_effort_from_run_env(monkeypatch):
-    """#599 parity: Docker (host proxy) and Daytona (sandbox proxy) build the
-    route from the SAME ``resolve_litellm_route(model, agent_env)`` call, so the
-    effort in config.yaml is identical and independent of the host's os.environ.
+    """Guards PR #613 against #599's Docker/Daytona run-env parity regression.
+
+    Docker (host proxy) and Daytona (sandbox proxy) build the route from the
+    SAME ``resolve_litellm_route(model, agent_env)`` call, so the effort in
+    config.yaml is identical and independent of the host's os.environ.
 
     The old bug was Docker-only because a host-side translator read process
     os.environ; here, with that env scrubbed, the run-level value still flows