diff --git a/src/benchflow/providers/litellm_bedrock_patch.py b/src/benchflow/providers/litellm_bedrock_patch.py index b98f2c03..fc3cf3d6 100644 --- a/src/benchflow/providers/litellm_bedrock_patch.py +++ b/src/benchflow/providers/litellm_bedrock_patch.py @@ -17,8 +17,27 @@ BEDROCK_ADAPTIVE_THINKING_RE = re.compile( r"claude-(?:(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)|fable-5(?!\d))" ) +BEDROCK_LITELLM_EFFORT_LIMIT_RE = re.compile( + r"claude-(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)" +) + +# Requested efforts low→high. LiteLLM 1.88.0rc1 only accepts up to ``high`` for +# the Claude 4.x Bedrock IDs matched by ``BEDROCK_LITELLM_EFFORT_LIMIT_RE`` and +# raises on ``xhigh``/``max`` (#737), so those overrides are clamped before being +# handed to the transform. Other adaptive-thinking Bedrock models, such as Fable +# 5, keep their requested effort. This is the standalone (sandbox-deployed) copy +# of the ladder in ``litellm_config``; keep the two in sync. +_EFFORT_LADDER = ("minimal", "low", "medium", "high", "xhigh", "max") +_VALID_EFFORTS = set(_EFFORT_LADDER) +_LITELLM_MAX_EFFORT = "high" -_VALID_EFFORTS = {"minimal", "low", "medium", "high", "xhigh", "max"} + +def _clamp_effort(effort: str) -> str: + if effort not in _EFFORT_LADDER: + return effort + return _EFFORT_LADDER[ + min(_EFFORT_LADDER.index(effort), _EFFORT_LADDER.index(_LITELLM_MAX_EFFORT)) + ] def _is_new_bedrock_claude(model: str | None) -> bool: @@ -28,6 +47,13 @@ def _is_new_bedrock_claude(model: str | None) -> bool: return False +def _should_clamp_effort(model: str | None) -> bool: + try: + return bool(model and BEDROCK_LITELLM_EFFORT_LIMIT_RE.search(model.lower())) + except Exception: + return False + + def _patch_anthropic_gate() -> None: try: from litellm.llms.anthropic.chat.transformation import AnthropicConfig @@ -68,6 +94,10 @@ def handle( override = os.environ.get(BEDROCK_THINKING_EFFORT_ENV, "").strip().lower() if override in _VALID_EFFORTS: reasoning_effort = override + # Clamp xhigh/max (which litellm rejects) to the accepted ceiling so + # the request runs at the real maximum instead of raising (#737). + if _should_clamp_effort(model): + reasoning_effort = _clamp_effort(reasoning_effort) return original(self, model, reasoning_effort, optional_params) # Marker for the fail-closed startup preflight (#602): lets the runtime diff --git a/src/benchflow/providers/litellm_config.py b/src/benchflow/providers/litellm_config.py index 870b6d5f..c9ef3f46 100644 --- a/src/benchflow/providers/litellm_config.py +++ b/src/benchflow/providers/litellm_config.py @@ -68,7 +68,29 @@ def custom_cost_per_token(model: str) -> tuple[float, float] | None: r"claude-(?:(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)|fable-5(?!\d))", re.IGNORECASE, ) -_BEDROCK_THINKING_EFFORTS = {"minimal", "low", "medium", "high", "xhigh", "max"} +_BEDROCK_LITELLM_EFFORT_LIMIT_RE = re.compile( + r"claude-(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)", + re.IGNORECASE, +) +# Efforts a user may *request*, low→high. LiteLLM 1.88.0rc1's Bedrock Converse +# transform only accepts up to ``high`` for the Claude 4.x Bedrock IDs covered +# by ``_BEDROCK_LITELLM_EFFORT_LIMIT_RE`` and raises BadRequestError on +# ``xhigh``/``max`` (#737), so those requested values are clamped to the +# accepted ceiling below before they reach the wire. Other adaptive-thinking +# Bedrock models, such as Fable 5, keep their requested effort. Kept in sync +# with the standalone proxy patch ``litellm_bedrock_patch`` (which cannot import +# this module — it is deployed into the sandbox alone). +_BEDROCK_EFFORT_LADDER = ("minimal", "low", "medium", "high", "xhigh", "max") +_BEDROCK_THINKING_EFFORTS = set(_BEDROCK_EFFORT_LADDER) +_BEDROCK_LITELLM_MAX_EFFORT = "high" + + +def _clamp_bedrock_effort(effort: str) -> str: + """Clamp a requested effort to the highest LiteLLM-accepted rung (#737).""" + ladder = _BEDROCK_EFFORT_LADDER + if effort not in ladder: + return effort + return ladder[min(ladder.index(effort), ladder.index(_BEDROCK_LITELLM_MAX_EFFORT))] @dataclass(frozen=True) @@ -148,6 +170,8 @@ def _bedrock_thinking_effort(model: str, env: dict[str, str]) -> str | None: effort = (env.get(BEDROCK_THINKING_EFFORT_ENV) or "high").strip().lower() if effort not in _BEDROCK_THINKING_EFFORTS: effort = "high" + if _BEDROCK_LITELLM_EFFORT_LIMIT_RE.search(model): + return _clamp_bedrock_effort(effort) return effort diff --git a/tests/test_bedrock_thinking.py b/tests/test_bedrock_thinking.py index 1061698f..88ee550a 100644 --- a/tests/test_bedrock_thinking.py +++ b/tests/test_bedrock_thinking.py @@ -42,12 +42,32 @@ def test_bedrock_thinking_effort_is_threaded_into_route_params(): { "AWS_BEARER_TOKEN_BEDROCK": "token", "AWS_REGION": "us-west-2", - BEDROCK_THINKING_EFFORT_ENV: "max", + BEDROCK_THINKING_EFFORT_ENV: "medium", }, ) assert route.upstream_model == "bedrock/us.anthropic.claude-opus-4-8" - assert route.litellm_params["reasoning_effort"] == "max" + assert route.litellm_params["reasoning_effort"] == "medium" + + +@pytest.mark.parametrize("requested", ["xhigh", "max"]) +def test_bedrock_effort_clamps_unsupported_to_litellm_ceiling(requested): + """Guards PR #739 against #737's unsupported Bedrock effort regression. + + LiteLLM rejects xhigh/max for opus-4-8 with a BadRequestError, so BenchFlow + clamps a requested xhigh/max down to the accepted ceiling (`high`) -- a + MAX-thinking config runs at the real maximum instead of erroring mid-run, + matching the garbage->high default behavior. + """ + route = resolve_litellm_route( + "aws-bedrock/us.anthropic.claude-opus-4-8", + { + "AWS_BEARER_TOKEN_BEDROCK": "token", + "AWS_REGION": "us-west-2", + BEDROCK_THINKING_EFFORT_ENV: requested, + }, + ) + assert route.litellm_params["reasoning_effort"] == "high" def test_bedrock_fable5_thinking_effort_is_threaded_into_route_params(): @@ -183,3 +203,41 @@ def test_docker_and_daytona_resolve_identical_bedrock_effort_from_run_env(monkey ) # And the host env being empty did not strip it — it came from the run env. assert BEDROCK_THINKING_EFFORT_ENV not in os.environ + + +# --------------------------------------------------------------------------- # +# Clamp end-to-end + drift guard (issue #737) # +# --------------------------------------------------------------------------- # + + +def test_proxy_env_max_override_clamps_in_wire_payload(monkeypatch): + """Guards PR #739 against #737's proxy-process env clamp regression. + + A run-level `max` in the proxy process env is clamped to the LiteLLM ceiling + in the actual wire payload; it must not raise. + """ + monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False) + assert _wire_output_config_effort("high", env_override="max") == "high" + + +def test_every_requestable_bedrock_effort_survives_real_litellm_transform(monkeypatch): + """Guards PR #739 against #737's accepted-effort drift regression. + + Every effort a user may request must, after the route clamp, be accepted by + the REAL litellm Converse transform (no BadRequestError). If a future + litellm changes its accepted set, this fails and forces + _BEDROCK_LITELLM_MAX_EFFORT / the ladder to be revisited rather than + silently shipping a value the provider rejects mid-run. + """ + from benchflow.providers.litellm_config import _BEDROCK_EFFORT_LADDER + + monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False) + for requested in _BEDROCK_EFFORT_LADDER: + route = resolve_litellm_route( + "aws-bedrock/us.anthropic.claude-opus-4-8-20251101-v1:0", + {**_BEDROCK_BASE_ENV, BEDROCK_THINKING_EFFORT_ENV: requested}, + ) + clamped = route.litellm_params["reasoning_effort"] + # Must not raise — this is the whole point of the clamp. + effort = _wire_output_config_effort(clamped, env_override=None) + assert effort, f"{requested!r} -> {clamped!r} produced no effort" diff --git a/tests/test_litellm_config.py b/tests/test_litellm_config.py index 3d9d2eb5..bca13588 100644 --- a/tests/test_litellm_config.py +++ b/tests/test_litellm_config.py @@ -21,6 +21,7 @@ def test_bedrock_model_maps_to_litellm_bedrock_route(): def test_bedrock_model_honors_max_thinking_effort_env(): + """Guards PR #739 against #737's route-config effort ceiling regression.""" route = resolve_litellm_route( "aws-bedrock/us.anthropic.claude-opus-4-8", { @@ -30,7 +31,10 @@ def test_bedrock_model_honors_max_thinking_effort_env(): }, ) - assert route.litellm_params["reasoning_effort"] == "max" + # `max` is honored as "the highest supported effort": LiteLLM 1.88.0rc1 + # rejects `max`/`xhigh` for opus-4-8, so BenchFlow clamps to the accepted + # ceiling `high` rather than erroring at request time (#737). + assert route.litellm_params["reasoning_effort"] == "high" def test_azure_openai_route_uses_resource_and_preview_version():