From dd41cd9a3e61dd6317e588f61e369697018ef19c Mon Sep 17 00:00:00 2001 From: ElegantLin Date: Sat, 13 Jun 2026 18:00:32 +0000 Subject: [PATCH 1/2] fix: clamp Bedrock thinking effort to the LiteLLM-accepted ceiling (#737) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BenchFlow accepts BENCHFLOW_BEDROCK_THINKING_EFFORT values `xhigh`/`max`, but LiteLLM 1.88.0rc1's Bedrock Converse transform rejects them for adaptive Claude 4.8+ with a BadRequestError. So the #598 "MAX thinking" config errored at request time instead of running at the highest supported effort. Clamp a requested effort to the accepted ceiling (`high`) in both injection paths — the route config (litellm_config) and the standalone proxy patch (litellm_bedrock_patch, which cannot import benchflow, so it carries its own in-sync copy of the ladder). This matches the existing garbage->high default: a MAX-thinking run now proceeds at the real maximum rather than crashing. Tests: - clamp asserted on the route param (xhigh/max -> high) and updated the two pre-existing tests that encoded the bug (effort `max` threaded through); - proxy-env override `max` clamps in the real wire payload without raising; - drift guard: every requestable effort, after the clamp, is accepted by the REAL litellm Converse transform — if a future litellm changes its accepted set this fails loudly instead of shipping a rejected value mid-run. Both clamps verified to fail-then-pass when individually neutered. Closes #737 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../providers/litellm_bedrock_patch.py | 20 ++++++- src/benchflow/providers/litellm_config.py | 21 +++++++- tests/test_bedrock_thinking.py | 54 ++++++++++++++++++- tests/test_litellm_config.py | 5 +- 4 files changed, 94 insertions(+), 6 deletions(-) diff --git a/src/benchflow/providers/litellm_bedrock_patch.py b/src/benchflow/providers/litellm_bedrock_patch.py index b98f2c03..5ef6a1f6 100644 --- a/src/benchflow/providers/litellm_bedrock_patch.py +++ b/src/benchflow/providers/litellm_bedrock_patch.py @@ -18,7 +18,22 @@ r"claude-(?:(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)|fable-5(?!\d))" ) -_VALID_EFFORTS = {"minimal", "low", "medium", "high", "xhigh", "max"} +# Requested efforts low→high. LiteLLM 1.88.0rc1 only accepts up to ``high`` for +# adaptive Claude 4.8+ and raises on ``xhigh``/``max`` (#737), so an override is +# clamped to the accepted ceiling before being handed to the transform. This is +# the standalone (sandbox-deployed) copy of the ladder in ``litellm_config``; +# keep the two in sync. +_EFFORT_LADDER = ("minimal", "low", "medium", "high", "xhigh", "max") +_VALID_EFFORTS = set(_EFFORT_LADDER) +_LITELLM_MAX_EFFORT = "high" + + +def _clamp_effort(effort: str) -> str: + if effort not in _EFFORT_LADDER: + return effort + return _EFFORT_LADDER[ + min(_EFFORT_LADDER.index(effort), _EFFORT_LADDER.index(_LITELLM_MAX_EFFORT)) + ] def _is_new_bedrock_claude(model: str | None) -> bool: @@ -68,6 +83,9 @@ def handle( override = os.environ.get(BEDROCK_THINKING_EFFORT_ENV, "").strip().lower() if override in _VALID_EFFORTS: reasoning_effort = override + # Clamp xhigh/max (which litellm rejects) to the accepted ceiling so + # the request runs at the real maximum instead of raising (#737). + reasoning_effort = _clamp_effort(reasoning_effort) return original(self, model, reasoning_effort, optional_params) # Marker for the fail-closed startup preflight (#602): lets the runtime diff --git a/src/benchflow/providers/litellm_config.py b/src/benchflow/providers/litellm_config.py index 870b6d5f..810dab37 100644 --- a/src/benchflow/providers/litellm_config.py +++ b/src/benchflow/providers/litellm_config.py @@ -68,7 +68,24 @@ def custom_cost_per_token(model: str) -> tuple[float, float] | None: r"claude-(?:(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)|fable-5(?!\d))", re.IGNORECASE, ) -_BEDROCK_THINKING_EFFORTS = {"minimal", "low", "medium", "high", "xhigh", "max"} +# Efforts a user may *request*, low→high. LiteLLM 1.88.0rc1's Bedrock Converse +# transform only accepts up to ``high`` for adaptive Claude 4.8+ and raises +# BadRequestError on ``xhigh``/``max`` (#737), so requested values are clamped +# to the accepted ceiling below before they reach the wire — a MAX-thinking +# config then runs at the real maximum instead of erroring mid-run. Kept in +# sync with the standalone proxy patch ``litellm_bedrock_patch`` (which cannot +# import this module — it is deployed into the sandbox alone). +_BEDROCK_EFFORT_LADDER = ("minimal", "low", "medium", "high", "xhigh", "max") +_BEDROCK_THINKING_EFFORTS = set(_BEDROCK_EFFORT_LADDER) +_BEDROCK_LITELLM_MAX_EFFORT = "high" + + +def _clamp_bedrock_effort(effort: str) -> str: + """Clamp a requested effort to the highest LiteLLM-accepted rung (#737).""" + ladder = _BEDROCK_EFFORT_LADDER + if effort not in ladder: + return effort + return ladder[min(ladder.index(effort), ladder.index(_BEDROCK_LITELLM_MAX_EFFORT))] @dataclass(frozen=True) @@ -148,7 +165,7 @@ def _bedrock_thinking_effort(model: str, env: dict[str, str]) -> str | None: effort = (env.get(BEDROCK_THINKING_EFFORT_ENV) or "high").strip().lower() if effort not in _BEDROCK_THINKING_EFFORTS: effort = "high" - return effort + return _clamp_bedrock_effort(effort) def _route_registered_provider( diff --git a/tests/test_bedrock_thinking.py b/tests/test_bedrock_thinking.py index 1061698f..27cf8b0f 100644 --- a/tests/test_bedrock_thinking.py +++ b/tests/test_bedrock_thinking.py @@ -42,12 +42,29 @@ def test_bedrock_thinking_effort_is_threaded_into_route_params(): { "AWS_BEARER_TOKEN_BEDROCK": "token", "AWS_REGION": "us-west-2", - BEDROCK_THINKING_EFFORT_ENV: "max", + BEDROCK_THINKING_EFFORT_ENV: "medium", }, ) assert route.upstream_model == "bedrock/us.anthropic.claude-opus-4-8" - assert route.litellm_params["reasoning_effort"] == "max" + assert route.litellm_params["reasoning_effort"] == "medium" + + +@pytest.mark.parametrize("requested", ["xhigh", "max"]) +def test_bedrock_effort_clamps_unsupported_to_litellm_ceiling(requested): + """Guards #737: LiteLLM rejects xhigh/max for opus-4-8 with a + BadRequestError, so BenchFlow clamps a requested xhigh/max down to the + accepted ceiling (`high`) — a MAX-thinking config runs at the real maximum + instead of erroring mid-run, matching the garbage->high default behavior.""" + route = resolve_litellm_route( + "aws-bedrock/us.anthropic.claude-opus-4-8", + { + "AWS_BEARER_TOKEN_BEDROCK": "token", + "AWS_REGION": "us-west-2", + BEDROCK_THINKING_EFFORT_ENV: requested, + }, + ) + assert route.litellm_params["reasoning_effort"] == "high" def test_bedrock_fable5_thinking_effort_is_threaded_into_route_params(): @@ -183,3 +200,36 @@ def test_docker_and_daytona_resolve_identical_bedrock_effort_from_run_env(monkey ) # And the host env being empty did not strip it — it came from the run env. assert BEDROCK_THINKING_EFFORT_ENV not in os.environ + + +# --------------------------------------------------------------------------- # +# Clamp end-to-end + drift guard (issue #737) # +# --------------------------------------------------------------------------- # + + +def test_proxy_env_max_override_clamps_in_wire_payload(monkeypatch): + """#737 (patch path): a run-level `max` in the proxy process env is clamped + to the LiteLLM ceiling in the actual wire payload — it must NOT raise.""" + monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False) + assert _wire_output_config_effort("high", env_override="max") == "high" + + +def test_every_requestable_bedrock_effort_survives_real_litellm_transform(monkeypatch): + """Drift guard for #737: every effort a user may request must, after the + route clamp, be accepted by the REAL litellm Converse transform (no + BadRequestError). If a future litellm changes its accepted set, this fails + and forces _BEDROCK_LITELLM_MAX_EFFORT / the ladder to be revisited rather + than silently shipping a value the provider rejects mid-run. + """ + from benchflow.providers.litellm_config import _BEDROCK_EFFORT_LADDER + + monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False) + for requested in _BEDROCK_EFFORT_LADDER: + route = resolve_litellm_route( + "aws-bedrock/us.anthropic.claude-opus-4-8-20251101-v1:0", + {**_BEDROCK_BASE_ENV, BEDROCK_THINKING_EFFORT_ENV: requested}, + ) + clamped = route.litellm_params["reasoning_effort"] + # Must not raise — this is the whole point of the clamp. + effort = _wire_output_config_effort(clamped, env_override=None) + assert effort, f"{requested!r} -> {clamped!r} produced no effort" diff --git a/tests/test_litellm_config.py b/tests/test_litellm_config.py index 3d9d2eb5..a1310787 100644 --- a/tests/test_litellm_config.py +++ b/tests/test_litellm_config.py @@ -30,7 +30,10 @@ def test_bedrock_model_honors_max_thinking_effort_env(): }, ) - assert route.litellm_params["reasoning_effort"] == "max" + # `max` is honored as "the highest supported effort": LiteLLM 1.88.0rc1 + # rejects `max`/`xhigh` for opus-4-8, so BenchFlow clamps to the accepted + # ceiling `high` rather than erroring at request time (#737). + assert route.litellm_params["reasoning_effort"] == "high" def test_azure_openai_route_uses_resource_and_preview_version(): From be86095bc511d65d8e11e93d095e17028a91943e Mon Sep 17 00:00:00 2001 From: Bingran You Date: Mon, 15 Jun 2026 01:53:30 -0400 Subject: [PATCH 2/2] fix: scope Bedrock effort clamp to LiteLLM-limited models --- .../providers/litellm_bedrock_patch.py | 22 ++++++++++---- src/benchflow/providers/litellm_config.py | 21 ++++++++----- tests/test_bedrock_thinking.py | 30 ++++++++++++------- tests/test_litellm_config.py | 1 + 4 files changed, 51 insertions(+), 23 deletions(-) diff --git a/src/benchflow/providers/litellm_bedrock_patch.py b/src/benchflow/providers/litellm_bedrock_patch.py index 5ef6a1f6..fc3cf3d6 100644 --- a/src/benchflow/providers/litellm_bedrock_patch.py +++ b/src/benchflow/providers/litellm_bedrock_patch.py @@ -17,12 +17,16 @@ BEDROCK_ADAPTIVE_THINKING_RE = re.compile( r"claude-(?:(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)|fable-5(?!\d))" ) +BEDROCK_LITELLM_EFFORT_LIMIT_RE = re.compile( + r"claude-(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)" +) # Requested efforts low→high. LiteLLM 1.88.0rc1 only accepts up to ``high`` for -# adaptive Claude 4.8+ and raises on ``xhigh``/``max`` (#737), so an override is -# clamped to the accepted ceiling before being handed to the transform. This is -# the standalone (sandbox-deployed) copy of the ladder in ``litellm_config``; -# keep the two in sync. +# the Claude 4.x Bedrock IDs matched by ``BEDROCK_LITELLM_EFFORT_LIMIT_RE`` and +# raises on ``xhigh``/``max`` (#737), so those overrides are clamped before being +# handed to the transform. Other adaptive-thinking Bedrock models, such as Fable +# 5, keep their requested effort. This is the standalone (sandbox-deployed) copy +# of the ladder in ``litellm_config``; keep the two in sync. _EFFORT_LADDER = ("minimal", "low", "medium", "high", "xhigh", "max") _VALID_EFFORTS = set(_EFFORT_LADDER) _LITELLM_MAX_EFFORT = "high" @@ -43,6 +47,13 @@ def _is_new_bedrock_claude(model: str | None) -> bool: return False +def _should_clamp_effort(model: str | None) -> bool: + try: + return bool(model and BEDROCK_LITELLM_EFFORT_LIMIT_RE.search(model.lower())) + except Exception: + return False + + def _patch_anthropic_gate() -> None: try: from litellm.llms.anthropic.chat.transformation import AnthropicConfig @@ -85,7 +96,8 @@ def handle( reasoning_effort = override # Clamp xhigh/max (which litellm rejects) to the accepted ceiling so # the request runs at the real maximum instead of raising (#737). - reasoning_effort = _clamp_effort(reasoning_effort) + if _should_clamp_effort(model): + reasoning_effort = _clamp_effort(reasoning_effort) return original(self, model, reasoning_effort, optional_params) # Marker for the fail-closed startup preflight (#602): lets the runtime diff --git a/src/benchflow/providers/litellm_config.py b/src/benchflow/providers/litellm_config.py index 810dab37..c9ef3f46 100644 --- a/src/benchflow/providers/litellm_config.py +++ b/src/benchflow/providers/litellm_config.py @@ -68,13 +68,18 @@ def custom_cost_per_token(model: str) -> tuple[float, float] | None: r"claude-(?:(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)|fable-5(?!\d))", re.IGNORECASE, ) +_BEDROCK_LITELLM_EFFORT_LIMIT_RE = re.compile( + r"claude-(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)", + re.IGNORECASE, +) # Efforts a user may *request*, low→high. LiteLLM 1.88.0rc1's Bedrock Converse -# transform only accepts up to ``high`` for adaptive Claude 4.8+ and raises -# BadRequestError on ``xhigh``/``max`` (#737), so requested values are clamped -# to the accepted ceiling below before they reach the wire — a MAX-thinking -# config then runs at the real maximum instead of erroring mid-run. Kept in -# sync with the standalone proxy patch ``litellm_bedrock_patch`` (which cannot -# import this module — it is deployed into the sandbox alone). +# transform only accepts up to ``high`` for the Claude 4.x Bedrock IDs covered +# by ``_BEDROCK_LITELLM_EFFORT_LIMIT_RE`` and raises BadRequestError on +# ``xhigh``/``max`` (#737), so those requested values are clamped to the +# accepted ceiling below before they reach the wire. Other adaptive-thinking +# Bedrock models, such as Fable 5, keep their requested effort. Kept in sync +# with the standalone proxy patch ``litellm_bedrock_patch`` (which cannot import +# this module — it is deployed into the sandbox alone). _BEDROCK_EFFORT_LADDER = ("minimal", "low", "medium", "high", "xhigh", "max") _BEDROCK_THINKING_EFFORTS = set(_BEDROCK_EFFORT_LADDER) _BEDROCK_LITELLM_MAX_EFFORT = "high" @@ -165,7 +170,9 @@ def _bedrock_thinking_effort(model: str, env: dict[str, str]) -> str | None: effort = (env.get(BEDROCK_THINKING_EFFORT_ENV) or "high").strip().lower() if effort not in _BEDROCK_THINKING_EFFORTS: effort = "high" - return _clamp_bedrock_effort(effort) + if _BEDROCK_LITELLM_EFFORT_LIMIT_RE.search(model): + return _clamp_bedrock_effort(effort) + return effort def _route_registered_provider( diff --git a/tests/test_bedrock_thinking.py b/tests/test_bedrock_thinking.py index 27cf8b0f..88ee550a 100644 --- a/tests/test_bedrock_thinking.py +++ b/tests/test_bedrock_thinking.py @@ -52,10 +52,13 @@ def test_bedrock_thinking_effort_is_threaded_into_route_params(): @pytest.mark.parametrize("requested", ["xhigh", "max"]) def test_bedrock_effort_clamps_unsupported_to_litellm_ceiling(requested): - """Guards #737: LiteLLM rejects xhigh/max for opus-4-8 with a - BadRequestError, so BenchFlow clamps a requested xhigh/max down to the - accepted ceiling (`high`) — a MAX-thinking config runs at the real maximum - instead of erroring mid-run, matching the garbage->high default behavior.""" + """Guards PR #739 against #737's unsupported Bedrock effort regression. + + LiteLLM rejects xhigh/max for opus-4-8 with a BadRequestError, so BenchFlow + clamps a requested xhigh/max down to the accepted ceiling (`high`) -- a + MAX-thinking config runs at the real maximum instead of erroring mid-run, + matching the garbage->high default behavior. + """ route = resolve_litellm_route( "aws-bedrock/us.anthropic.claude-opus-4-8", { @@ -208,18 +211,23 @@ def test_docker_and_daytona_resolve_identical_bedrock_effort_from_run_env(monkey def test_proxy_env_max_override_clamps_in_wire_payload(monkeypatch): - """#737 (patch path): a run-level `max` in the proxy process env is clamped - to the LiteLLM ceiling in the actual wire payload — it must NOT raise.""" + """Guards PR #739 against #737's proxy-process env clamp regression. + + A run-level `max` in the proxy process env is clamped to the LiteLLM ceiling + in the actual wire payload; it must not raise. + """ monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False) assert _wire_output_config_effort("high", env_override="max") == "high" def test_every_requestable_bedrock_effort_survives_real_litellm_transform(monkeypatch): - """Drift guard for #737: every effort a user may request must, after the - route clamp, be accepted by the REAL litellm Converse transform (no - BadRequestError). If a future litellm changes its accepted set, this fails - and forces _BEDROCK_LITELLM_MAX_EFFORT / the ladder to be revisited rather - than silently shipping a value the provider rejects mid-run. + """Guards PR #739 against #737's accepted-effort drift regression. + + Every effort a user may request must, after the route clamp, be accepted by + the REAL litellm Converse transform (no BadRequestError). If a future + litellm changes its accepted set, this fails and forces + _BEDROCK_LITELLM_MAX_EFFORT / the ladder to be revisited rather than + silently shipping a value the provider rejects mid-run. """ from benchflow.providers.litellm_config import _BEDROCK_EFFORT_LADDER diff --git a/tests/test_litellm_config.py b/tests/test_litellm_config.py index a1310787..bca13588 100644 --- a/tests/test_litellm_config.py +++ b/tests/test_litellm_config.py @@ -21,6 +21,7 @@ def test_bedrock_model_maps_to_litellm_bedrock_route(): def test_bedrock_model_honors_max_thinking_effort_env(): + """Guards PR #739 against #737's route-config effort ceiling regression.""" route = resolve_litellm_route( "aws-bedrock/us.anthropic.claude-opus-4-8", {