benchflow-ai · bingran-you · Jun 15, 2026 · Jun 13, 2026 · Jun 15, 2026
diff --git a/src/benchflow/providers/litellm_bedrock_patch.py b/src/benchflow/providers/litellm_bedrock_patch.py
@@ -17,8 +17,27 @@
 BEDROCK_ADAPTIVE_THINKING_RE = re.compile(
     r"claude-(?:(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)|fable-5(?!\d))"
 )
+BEDROCK_LITELLM_EFFORT_LIMIT_RE = re.compile(
+    r"claude-(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)"
+)
+
+# Requested efforts low→high. LiteLLM 1.88.0rc1 only accepts up to ``high`` for
+# the Claude 4.x Bedrock IDs matched by ``BEDROCK_LITELLM_EFFORT_LIMIT_RE`` and
+# raises on ``xhigh``/``max`` (#737), so those overrides are clamped before being
+# handed to the transform. Other adaptive-thinking Bedrock models, such as Fable
+# 5, keep their requested effort. This is the standalone (sandbox-deployed) copy
+# of the ladder in ``litellm_config``; keep the two in sync.
+_EFFORT_LADDER = ("minimal", "low", "medium", "high", "xhigh", "max")
+_VALID_EFFORTS = set(_EFFORT_LADDER)
+_LITELLM_MAX_EFFORT = "high"
 
-_VALID_EFFORTS = {"minimal", "low", "medium", "high", "xhigh", "max"}
+
+def _clamp_effort(effort: str) -> str:
+    if effort not in _EFFORT_LADDER:
+        return effort
+    return _EFFORT_LADDER[
+        min(_EFFORT_LADDER.index(effort), _EFFORT_LADDER.index(_LITELLM_MAX_EFFORT))
+    ]
 
 
 def _is_new_bedrock_claude(model: str | None) -> bool:
@@ -28,6 +47,13 @@ def _is_new_bedrock_claude(model: str | None) -> bool:
         return False
 
 
+def _should_clamp_effort(model: str | None) -> bool:
+    try:
+        return bool(model and BEDROCK_LITELLM_EFFORT_LIMIT_RE.search(model.lower()))
+    except Exception:
+        return False
+
+
 def _patch_anthropic_gate() -> None:
     try:
         from litellm.llms.anthropic.chat.transformation import AnthropicConfig
@@ -68,6 +94,10 @@ def handle(
             override = os.environ.get(BEDROCK_THINKING_EFFORT_ENV, "").strip().lower()
             if override in _VALID_EFFORTS:
                 reasoning_effort = override
+            # Clamp xhigh/max (which litellm rejects) to the accepted ceiling so
+            # the request runs at the real maximum instead of raising (#737).
+            if _should_clamp_effort(model):
+                reasoning_effort = _clamp_effort(reasoning_effort)
         return original(self, model, reasoning_effort, optional_params)
 
     # Marker for the fail-closed startup preflight (#602): lets the runtime

diff --git a/src/benchflow/providers/litellm_config.py b/src/benchflow/providers/litellm_config.py
@@ -68,7 +68,29 @@ def custom_cost_per_token(model: str) -> tuple[float, float] | None:
     r"claude-(?:(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)|fable-5(?!\d))",
     re.IGNORECASE,
 )
-_BEDROCK_THINKING_EFFORTS = {"minimal", "low", "medium", "high", "xhigh", "max"}
+_BEDROCK_LITELLM_EFFORT_LIMIT_RE = re.compile(
+    r"claude-(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)",
+    re.IGNORECASE,
+)
+# Efforts a user may *request*, low→high. LiteLLM 1.88.0rc1's Bedrock Converse
+# transform only accepts up to ``high`` for the Claude 4.x Bedrock IDs covered
+# by ``_BEDROCK_LITELLM_EFFORT_LIMIT_RE`` and raises BadRequestError on
+# ``xhigh``/``max`` (#737), so those requested values are clamped to the
+# accepted ceiling below before they reach the wire. Other adaptive-thinking
+# Bedrock models, such as Fable 5, keep their requested effort. Kept in sync
+# with the standalone proxy patch ``litellm_bedrock_patch`` (which cannot import
+# this module — it is deployed into the sandbox alone).
+_BEDROCK_EFFORT_LADDER = ("minimal", "low", "medium", "high", "xhigh", "max")
+_BEDROCK_THINKING_EFFORTS = set(_BEDROCK_EFFORT_LADDER)
+_BEDROCK_LITELLM_MAX_EFFORT = "high"
+
+
+def _clamp_bedrock_effort(effort: str) -> str:
+    """Clamp a requested effort to the highest LiteLLM-accepted rung (#737)."""
+    ladder = _BEDROCK_EFFORT_LADDER
+    if effort not in ladder:
+        return effort
+    return ladder[min(ladder.index(effort), ladder.index(_BEDROCK_LITELLM_MAX_EFFORT))]
 
 
 @dataclass(frozen=True)
@@ -148,6 +170,8 @@ def _bedrock_thinking_effort(model: str, env: dict[str, str]) -> str | None:
     effort = (env.get(BEDROCK_THINKING_EFFORT_ENV) or "high").strip().lower()
     if effort not in _BEDROCK_THINKING_EFFORTS:
         effort = "high"
+    if _BEDROCK_LITELLM_EFFORT_LIMIT_RE.search(model):
+        return _clamp_bedrock_effort(effort)
     return effort
 
 

diff --git a/tests/test_bedrock_thinking.py b/tests/test_bedrock_thinking.py
@@ -42,12 +42,32 @@ def test_bedrock_thinking_effort_is_threaded_into_route_params():
         {
             "AWS_BEARER_TOKEN_BEDROCK": "token",
             "AWS_REGION": "us-west-2",
-            BEDROCK_THINKING_EFFORT_ENV: "max",
+            BEDROCK_THINKING_EFFORT_ENV: "medium",
         },
     )
 
     assert route.upstream_model == "bedrock/us.anthropic.claude-opus-4-8"
-    assert route.litellm_params["reasoning_effort"] == "max"
+    assert route.litellm_params["reasoning_effort"] == "medium"
+
+
+@pytest.mark.parametrize("requested", ["xhigh", "max"])
+def test_bedrock_effort_clamps_unsupported_to_litellm_ceiling(requested):
+    """Guards PR #739 against #737's unsupported Bedrock effort regression.
+
+    LiteLLM rejects xhigh/max for opus-4-8 with a BadRequestError, so BenchFlow
+    clamps a requested xhigh/max down to the accepted ceiling (`high`) -- a
+    MAX-thinking config runs at the real maximum instead of erroring mid-run,
+    matching the garbage->high default behavior.
+    """
+    route = resolve_litellm_route(
+        "aws-bedrock/us.anthropic.claude-opus-4-8",
+        {
+            "AWS_BEARER_TOKEN_BEDROCK": "token",
+            "AWS_REGION": "us-west-2",
+            BEDROCK_THINKING_EFFORT_ENV: requested,
+        },
+    )
+    assert route.litellm_params["reasoning_effort"] == "high"
 
 
 def test_bedrock_fable5_thinking_effort_is_threaded_into_route_params():
@@ -183,3 +203,41 @@ def test_docker_and_daytona_resolve_identical_bedrock_effort_from_run_env(monkey
     )
     # And the host env being empty did not strip it — it came from the run env.
     assert BEDROCK_THINKING_EFFORT_ENV not in os.environ
+
+
+# --------------------------------------------------------------------------- #
+# Clamp end-to-end + drift guard (issue #737)                                 #
+# --------------------------------------------------------------------------- #
+
+
+def test_proxy_env_max_override_clamps_in_wire_payload(monkeypatch):
+    """Guards PR #739 against #737's proxy-process env clamp regression.
+
+    A run-level `max` in the proxy process env is clamped to the LiteLLM ceiling
+    in the actual wire payload; it must not raise.
+    """
+    monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False)
+    assert _wire_output_config_effort("high", env_override="max") == "high"
+
+
+def test_every_requestable_bedrock_effort_survives_real_litellm_transform(monkeypatch):
+    """Guards PR #739 against #737's accepted-effort drift regression.
+
+    Every effort a user may request must, after the route clamp, be accepted by
+    the REAL litellm Converse transform (no BadRequestError). If a future
+    litellm changes its accepted set, this fails and forces
+    _BEDROCK_LITELLM_MAX_EFFORT / the ladder to be revisited rather than
+    silently shipping a value the provider rejects mid-run.
+    """
+    from benchflow.providers.litellm_config import _BEDROCK_EFFORT_LADDER
+
+    monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False)
+    for requested in _BEDROCK_EFFORT_LADDER:
+        route = resolve_litellm_route(
+            "aws-bedrock/us.anthropic.claude-opus-4-8-20251101-v1:0",
+            {**_BEDROCK_BASE_ENV, BEDROCK_THINKING_EFFORT_ENV: requested},
+        )
+        clamped = route.litellm_params["reasoning_effort"]
+        # Must not raise — this is the whole point of the clamp.
+        effort = _wire_output_config_effort(clamped, env_override=None)
+        assert effort, f"{requested!r} -> {clamped!r} produced no effort"
diff --git a/tests/test_litellm_config.py b/tests/test_litellm_config.py
@@ -21,6 +21,7 @@ def test_bedrock_model_maps_to_litellm_bedrock_route():
 
 
 def test_bedrock_model_honors_max_thinking_effort_env():
+    """Guards PR #739 against #737's route-config effort ceiling regression."""
     route = resolve_litellm_route(
         "aws-bedrock/us.anthropic.claude-opus-4-8",
         {
@@ -30,7 +31,10 @@ def test_bedrock_model_honors_max_thinking_effort_env():
         },
     )
 
-    assert route.litellm_params["reasoning_effort"] == "max"
+    # `max` is honored as "the highest supported effort": LiteLLM 1.88.0rc1
+    # rejects `max`/`xhigh` for opus-4-8, so BenchFlow clamps to the accepted
+    # ceiling `high` rather than erroring at request time (#737).
+    assert route.litellm_params["reasoning_effort"] == "high"
 
 
 def test_azure_openai_route_uses_resource_and_preview_version():