From 43c1d79f355bf210a4e6bceff134d5e958f99a7d Mon Sep 17 00:00:00 2001 From: ElegantLin Date: Sat, 13 Jun 2026 17:32:06 +0000 Subject: [PATCH 1/2] test: pin run-level Bedrock thinking-effort end-to-end (Docker/Daytona parity) (#599) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #599 reported that the old host BedrockProxyServer stored per-run env but the Bedrock translators resolved thinking effort from process-global os.environ, so Docker silently fell back to `high` while Daytona honored a run-level BENCHFLOW_BEDROCK_THINKING_EFFORT. PR #613 deleted that proxy (bedrock_proxy.py / bedrock_runtime.py no longer exist); the LiteLLM runtime now sources effort from the run-level agent env via two run-aware paths — route config (reasoning_effort baked into config.yaml from agent_env) and the proxy-process env (launched as os.environ + agent_env). The specific bug is unreachable, but nothing pinned the behavior end to end. Adds wire-level regression coverage (the issue's actual ask — assert output_config.effort, not just route params) by driving the REAL litellm Bedrock Converse transform with the benchflow patch applied: - route-config effort lands in the wire payload with host os.environ empty (sourced from the run, not the host process); - a run-level value in the proxy-process env overrides a stale-default route effort in the wire payload — the exact divergence the old Docker translator got wrong (verified to FAIL when the patch override is neutered); - Docker and Daytona resolve identical effort from the same agent_env, so neither lane can diverge. No production change — the architecture already behaves correctly; this guards the regression. Uses `medium` (litellm-accepted, distinct from the `high` default) so a silent fallback fails the assertion. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/test_bedrock_thinking.py | 101 +++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/tests/test_bedrock_thinking.py b/tests/test_bedrock_thinking.py index 6b860e91e..19950e7c1 100644 --- a/tests/test_bedrock_thinking.py +++ b/tests/test_bedrock_thinking.py @@ -1,5 +1,7 @@ from __future__ import annotations +import os + import pytest from benchflow.providers.litellm_bedrock_patch import ( @@ -75,3 +77,102 @@ def test_bedrock_thinking_effort_defaults_to_high_and_rejects_garbage(): {**base_env, BEDROCK_THINKING_EFFORT_ENV: "turbo"}, ) assert garbage_route.litellm_params["reasoning_effort"] == "high" + + +# --------------------------------------------------------------------------- # +# End-to-end run-level effort -> Bedrock wire payload, Docker/Daytona parity # +# (issue #599: the old host BedrockProxyServer stored per-run env but the # +# translators read process-global os.environ, so Docker silently fell back to # +# `high`. PR #613 deleted that proxy; this pins the run-level behavior in the # +# replacement LiteLLM runtime so the regression cannot return.) # +# --------------------------------------------------------------------------- # + +# Versioned Bedrock inference-profile ID — the real wire form. Stock litellm +# 1.88.0rc1 does not classify it as adaptive-thinking; the bedrock patch's +# anthropic gate does, which is why output_config.effort is emitted at all. +_BEDROCK_OPUS_48_WIRE_MODEL = "bedrock/us.anthropic.claude-opus-4-8-20251101-v1:0" +_BEDROCK_BASE_ENV = {"AWS_BEARER_TOKEN_BEDROCK": "token", "AWS_REGION": "us-west-2"} + + +def _wire_output_config_effort(reasoning_effort: str, env_override: str | None) -> str: + """Run the REAL litellm Bedrock Converse transform (with the benchflow + patch applied) and return the ``output_config.effort`` it emits. 
+ + ``reasoning_effort`` models the value baked into config.yaml from the route + (``litellm_config`` reads it from the run-level agent env). ``env_override`` + models the proxy *process* environment, which is launched as + ``os.environ + agent_env`` — so a run-level value reaches it too. Either + path landing the effort in the wire payload proves the #599 fix end to end. + """ + from litellm.llms.bedrock.chat.converse_transformation import AmazonConverseConfig + + import benchflow.providers.litellm_bedrock_patch # noqa: F401 — applies patch + + saved = os.environ.get(BEDROCK_THINKING_EFFORT_ENV) + os.environ.pop(BEDROCK_THINKING_EFFORT_ENV, None) + if env_override is not None: + os.environ[BEDROCK_THINKING_EFFORT_ENV] = env_override + try: + optional_params: dict = {} + AmazonConverseConfig()._handle_reasoning_effort_parameter( + _BEDROCK_OPUS_48_WIRE_MODEL, reasoning_effort, optional_params + ) + finally: + os.environ.pop(BEDROCK_THINKING_EFFORT_ENV, None) + if saved is not None: + os.environ[BEDROCK_THINKING_EFFORT_ENV] = saved + cfg = optional_params.get("output_config") + assert isinstance(cfg, dict), ( + f"adaptive-thinking output_config not emitted (patch inactive?): " + f"{optional_params!r}" + ) + return cfg["effort"] + + +def test_run_level_effort_from_route_lands_in_bedrock_wire_payload(monkeypatch): + """#599 end-to-end (route path): a run-level effort baked into config.yaml + reaches the Bedrock Converse ``output_config.effort`` — with the host + process env empty, proving it is sourced from the run, not os.environ. + + Uses ``medium`` (litellm-accepted and distinct from the ``high`` default) + so the assertion fails if the effort silently falls back. 
+ """ + monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False) + assert _wire_output_config_effort("medium", env_override=None) == "medium" + + +def test_run_level_effort_via_proxy_process_env_overrides_stale_route(monkeypatch): + """#599 end-to-end (proxy-env path): even if config.yaml carried a stale + default (``high``), the run-level value present in the proxy process env + (os.environ + agent_env) overrides it in the wire payload — this is the + exact divergence the old Docker translator got wrong.""" + monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False) + assert _wire_output_config_effort("high", env_override="medium") == "medium" + + +def test_docker_and_daytona_resolve_identical_bedrock_effort_from_run_env(monkeypatch): + """#599 parity: Docker (host proxy) and Daytona (sandbox proxy) build the + route from the SAME ``resolve_litellm_route(model, agent_env)`` call, so the + effort in config.yaml is identical and independent of the host's os.environ. + + The old bug was Docker-only because a host-side translator read process + os.environ; here, with that env scrubbed, the run-level value still flows + through — so neither lane can diverge. + """ + monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False) + run_env = {**_BEDROCK_BASE_ENV, BEDROCK_THINKING_EFFORT_ENV: "medium"} + + # Identical resolution path for both execution environments. + docker_route = resolve_litellm_route( + "aws-bedrock/us.anthropic.claude-opus-4-8-20251101-v1:0", run_env + ) + daytona_route = resolve_litellm_route( + "aws-bedrock/us.anthropic.claude-opus-4-8-20251101-v1:0", dict(run_env) + ) + assert ( + docker_route.litellm_params["reasoning_effort"] + == daytona_route.litellm_params["reasoning_effort"] + == "medium" + ) + # And the host env being empty did not strip it — it came from the run env. 
+ assert BEDROCK_THINKING_EFFORT_ENV not in os.environ From 143011865af981231f547f92257e805d343efaca Mon Sep 17 00:00:00 2001 From: Bingran You Date: Mon, 15 Jun 2026 01:19:36 -0400 Subject: [PATCH 2/2] test: document Bedrock effort regression guards --- tests/test_bedrock_thinking.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/tests/test_bedrock_thinking.py b/tests/test_bedrock_thinking.py index 19950e7c1..1061698fc 100644 --- a/tests/test_bedrock_thinking.py +++ b/tests/test_bedrock_thinking.py @@ -130,9 +130,11 @@ def _wire_output_config_effort(reasoning_effort: str, env_override: str | None) def test_run_level_effort_from_route_lands_in_bedrock_wire_payload(monkeypatch): - """#599 end-to-end (route path): a run-level effort baked into config.yaml - reaches the Bedrock Converse ``output_config.effort`` — with the host - process env empty, proving it is sourced from the run, not os.environ. + """Guards PR #613 against #599's run-env regression on the route path. + + A run-level effort baked into config.yaml reaches the Bedrock Converse + ``output_config.effort`` with the host process env empty, proving it is + sourced from the run, not os.environ. Uses ``medium`` (litellm-accepted and distinct from the ``high`` default) so the assertion fails if the effort silently falls back. @@ -142,18 +144,23 @@ def test_run_level_effort_from_route_lands_in_bedrock_wire_payload(monkeypatch): def test_run_level_effort_via_proxy_process_env_overrides_stale_route(monkeypatch): - """#599 end-to-end (proxy-env path): even if config.yaml carried a stale - default (``high``), the run-level value present in the proxy process env - (os.environ + agent_env) overrides it in the wire payload — this is the - exact divergence the old Docker translator got wrong.""" + """Guards PR #613 against #599's run-env regression on the proxy-env path. 
+ + Even if config.yaml carried a stale default (``high``), the run-level value + present in the proxy process env (os.environ + agent_env) overrides it in + the wire payload; this is the exact divergence the old Docker translator got + wrong. + """ monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False) assert _wire_output_config_effort("high", env_override="medium") == "medium" def test_docker_and_daytona_resolve_identical_bedrock_effort_from_run_env(monkeypatch): - """#599 parity: Docker (host proxy) and Daytona (sandbox proxy) build the - route from the SAME ``resolve_litellm_route(model, agent_env)`` call, so the - effort in config.yaml is identical and independent of the host's os.environ. + """Guards PR #613 against #599's Docker/Daytona run-env parity regression. + + Docker (host proxy) and Daytona (sandbox proxy) build the route from the + SAME ``resolve_litellm_route(model, agent_env)`` call, so the effort in + config.yaml is identical and independent of the host's os.environ. The old bug was Docker-only because a host-side translator read process os.environ; here, with that env scrubbed, the run-level value still flows