benchflow-ai · bingran-you · Jun 15, 2026 · Jun 13, 2026 · Jun 15, 2026
diff --git a/tests/test_bedrock_thinking.py b/tests/test_bedrock_thinking.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import os
+
 import pytest
 
 from benchflow.providers.litellm_bedrock_patch import (
@@ -75,3 +77,109 @@ def test_bedrock_thinking_effort_defaults_to_high_and_rejects_garbage():
         {**base_env, BEDROCK_THINKING_EFFORT_ENV: "turbo"},
     )
     assert garbage_route.litellm_params["reasoning_effort"] == "high"
+
+
+# --------------------------------------------------------------------------- #
+# End-to-end run-level effort -> Bedrock wire payload, Docker/Daytona parity   #
+# (issue #599: the old host BedrockProxyServer stored per-run env but the      #
+# translators read process-global os.environ, so Docker silently fell back to #
+# `high`. PR #613 deleted that proxy; this pins the run-level behavior in the  #
+# replacement LiteLLM runtime so the regression cannot return.)               #
+# --------------------------------------------------------------------------- #
+
+# Versioned Bedrock inference-profile ID — the real wire form. Stock litellm
+# 1.88.0rc1 does not classify it as adaptive-thinking; benchflow's bedrock patch
+# (anthropic gate) does, which is why output_config.effort is emitted at all.
+_BEDROCK_OPUS_48_WIRE_MODEL = "bedrock/us.anthropic.claude-opus-4-8-20251101-v1:0"
+_BEDROCK_BASE_ENV = {"AWS_BEARER_TOKEN_BEDROCK": "token", "AWS_REGION": "us-west-2"}
+
+
+def _wire_output_config_effort(reasoning_effort: str, env_override: str | None) -> str:
+    """Run litellm's REAL Bedrock Converse reasoning-effort handler (with the
+    benchflow patch applied) and return the ``output_config.effort`` it emits.
+
+    ``reasoning_effort`` models the value baked into config.yaml from the route
+    (``litellm_config`` reads it from the run-level agent env). ``env_override``
+    models the proxy *process* env (``os.environ + agent_env``), set only for
+    the call and then restored, so a run-level value reaches it too. Either
+    path landing the effort in the wire payload proves the #599 fix end to end.
+    """
+    from litellm.llms.bedrock.chat.converse_transformation import AmazonConverseConfig
+
+    import benchflow.providers.litellm_bedrock_patch  # noqa: F401 — applies patch
+
+    saved = os.environ.get(BEDROCK_THINKING_EFFORT_ENV)
+    os.environ.pop(BEDROCK_THINKING_EFFORT_ENV, None)  # start from a clean env
+    if env_override is not None:
+        os.environ[BEDROCK_THINKING_EFFORT_ENV] = env_override
+    try:
+        optional_params: dict = {}  # read back below for output_config
+        AmazonConverseConfig()._handle_reasoning_effort_parameter(
+            _BEDROCK_OPUS_48_WIRE_MODEL, reasoning_effort, optional_params
+        )
+    finally:
+        os.environ.pop(BEDROCK_THINKING_EFFORT_ENV, None)  # undo the override
+        if saved is not None:
+            os.environ[BEDROCK_THINKING_EFFORT_ENV] = saved
+    cfg = optional_params.get("output_config")
+    assert isinstance(cfg, dict), (
+        f"adaptive-thinking output_config not emitted (patch inactive?): "
+        f"{optional_params!r}"
+    )
+    return cfg["effort"]
+
+
+def test_run_level_effort_from_route_lands_in_bedrock_wire_payload(monkeypatch):
+    """Guards PR #613 against #599's run-env regression on the route path.
+
+    A run-level effort baked into config.yaml reaches the Bedrock Converse
+    ``output_config.effort`` with the override env var unset in this process,
+    proving it is sourced from the route value, not os.environ.
+
+    Uses ``medium`` (litellm-accepted and distinct from the ``high`` default)
+    so the assertion fails if the effort silently falls back.
+    """
+    monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False)
+    assert _wire_output_config_effort("medium", env_override=None) == "medium"
+
+
+def test_run_level_effort_via_proxy_process_env_overrides_stale_route(monkeypatch):
+    """Guards PR #613 against #599's run-env regression on the proxy-env path.
+
+    Even if config.yaml carried a stale default (``high``), the run-level value
+    present in the proxy process env (os.environ + agent_env) overrides it in
+    the wire payload. This stale-route/run-env divergence is the case the old
+    Docker translator got wrong.
+    """
+    monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False)
+    assert _wire_output_config_effort("high", env_override="medium") == "medium"
+
+
+def test_docker_and_daytona_resolve_identical_bedrock_effort_from_run_env(monkeypatch):
+    """Guards PR #613 against #599's Docker/Daytona run-env parity regression.
+
+    Docker (host proxy) and Daytona (sandbox proxy) build the route from the
+    SAME ``resolve_litellm_route(model, agent_env)`` call, so the effort in
+    config.yaml is identical and independent of the host's os.environ.
+
+    The old bug was Docker-only because a host-side translator read process
+    os.environ; here, with that env scrubbed, the run-level value still flows
+    through — so neither lane can diverge.
+    """
+    monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False)
+    run_env = {**_BEDROCK_BASE_ENV, BEDROCK_THINKING_EFFORT_ENV: "medium"}
+
+    # The same resolver call models both lanes; the second gets a copy of run_env.
+    docker_route = resolve_litellm_route(
+        "aws-bedrock/us.anthropic.claude-opus-4-8-20251101-v1:0", run_env
+    )
+    daytona_route = resolve_litellm_route(
+        "aws-bedrock/us.anthropic.claude-opus-4-8-20251101-v1:0", dict(run_env)
+    )
+    assert (
+        docker_route.litellm_params["reasoning_effort"]
+        == daytona_route.litellm_params["reasoning_effort"]
+        == "medium"
+    )
+    # The host env var is still unset, so ``medium`` can only come from run_env.
+    assert BEDROCK_THINKING_EFFORT_ENV not in os.environ