diff --git a/tests/test_bedrock_thinking.py b/tests/test_bedrock_thinking.py index 6b860e91..1061698f 100644 --- a/tests/test_bedrock_thinking.py +++ b/tests/test_bedrock_thinking.py @@ -1,5 +1,7 @@ from __future__ import annotations +import os + import pytest from benchflow.providers.litellm_bedrock_patch import ( @@ -75,3 +77,109 @@ def test_bedrock_thinking_effort_defaults_to_high_and_rejects_garbage(): {**base_env, BEDROCK_THINKING_EFFORT_ENV: "turbo"}, ) assert garbage_route.litellm_params["reasoning_effort"] == "high" + + +# --------------------------------------------------------------------------- # +# End-to-end run-level effort -> Bedrock wire payload, Docker/Daytona parity # +# (issue #599: the old host BedrockProxyServer stored per-run env but the # +# translators read process-global os.environ, so Docker silently fell back to # +# `high`. PR #613 deleted that proxy; this pins the run-level behavior in the # +# replacement LiteLLM runtime so the regression cannot return.) # +# --------------------------------------------------------------------------- # + +# Versioned Bedrock inference-profile ID — the real wire form. Stock litellm +# 1.88.0rc1 does not classify it as adaptive-thinking; the bedrock patch's +# anthropic gate does, which is why output_config.effort is emitted at all. +_BEDROCK_OPUS_48_WIRE_MODEL = "bedrock/us.anthropic.claude-opus-4-8-20251101-v1:0" +_BEDROCK_BASE_ENV = {"AWS_BEARER_TOKEN_BEDROCK": "token", "AWS_REGION": "us-west-2"} + + +def _wire_output_config_effort(reasoning_effort: str, env_override: str | None) -> str: + """Run the REAL litellm Bedrock Converse transform (with the benchflow + patch applied) and return the ``output_config.effort`` it emits. + + ``reasoning_effort`` models the value baked into config.yaml from the route + (``litellm_config`` reads it from the run-level agent env). ``env_override`` + models the proxy *process* environment, which is launched as + ``os.environ + agent_env`` — so a run-level value reaches it too. Either + path landing the effort in the wire payload proves the #599 fix end to end. + """ + from litellm.llms.bedrock.chat.converse_transformation import AmazonConverseConfig + + import benchflow.providers.litellm_bedrock_patch # noqa: F401 — applies patch + + saved = os.environ.get(BEDROCK_THINKING_EFFORT_ENV) + os.environ.pop(BEDROCK_THINKING_EFFORT_ENV, None) + if env_override is not None: + os.environ[BEDROCK_THINKING_EFFORT_ENV] = env_override + try: + optional_params: dict = {} + AmazonConverseConfig()._handle_reasoning_effort_parameter( + _BEDROCK_OPUS_48_WIRE_MODEL, reasoning_effort, optional_params + ) + finally: + os.environ.pop(BEDROCK_THINKING_EFFORT_ENV, None) + if saved is not None: + os.environ[BEDROCK_THINKING_EFFORT_ENV] = saved + cfg = optional_params.get("output_config") + assert isinstance(cfg, dict), ( + f"adaptive-thinking output_config not emitted (patch inactive?): " + f"{optional_params!r}" + ) + return cfg["effort"] + + +def test_run_level_effort_from_route_lands_in_bedrock_wire_payload(monkeypatch): + """Guards PR #613 against #599's run-env regression on the route path. + + A run-level effort baked into config.yaml reaches the Bedrock Converse + ``output_config.effort`` with the host process env empty, proving it is + sourced from the run, not os.environ. + + Uses ``medium`` (litellm-accepted and distinct from the ``high`` default) + so the assertion fails if the effort silently falls back. + """ + monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False) + assert _wire_output_config_effort("medium", env_override=None) == "medium" + + +def test_run_level_effort_via_proxy_process_env_overrides_stale_route(monkeypatch): + """Guards PR #613 against #599's run-env regression on the proxy-env path. + + Even if config.yaml carried a stale default (``high``), the run-level value + present in the proxy process env (os.environ + agent_env) overrides it in + the wire payload, matching the divergence the old Docker translator got + wrong. + """ + monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False) + assert _wire_output_config_effort("high", env_override="medium") == "medium" + + +def test_docker_and_daytona_resolve_identical_bedrock_effort_from_run_env(monkeypatch): + """Guards PR #613 against #599's Docker/Daytona run-env parity regression. + + Docker (host proxy) and Daytona (sandbox proxy) build the route from the + SAME ``resolve_litellm_route(model, agent_env)`` call, so the effort in + config.yaml is identical and independent of the host's os.environ. + + The old bug was Docker-only because a host-side translator read process + os.environ; here, with that env scrubbed, the run-level value still flows + through — so neither lane can diverge. + """ + monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False) + run_env = {**_BEDROCK_BASE_ENV, BEDROCK_THINKING_EFFORT_ENV: "medium"} + + # Identical resolution path for both execution environments. + docker_route = resolve_litellm_route( + "aws-bedrock/us.anthropic.claude-opus-4-8-20251101-v1:0", run_env + ) + daytona_route = resolve_litellm_route( + "aws-bedrock/us.anthropic.claude-opus-4-8-20251101-v1:0", dict(run_env) + ) + assert ( + docker_route.litellm_params["reasoning_effort"] + == daytona_route.litellm_params["reasoning_effort"] + == "medium" + ) + # And the host env being empty did not strip it — it came from the run env. + assert BEDROCK_THINKING_EFFORT_ENV not in os.environ