From dd41cd9a3e61dd6317e588f61e369697018ef19c Mon Sep 17 00:00:00 2001
From: ElegantLin <elegant.lin21@gmail.com>
Date: Sat, 13 Jun 2026 18:00:32 +0000
Subject: [PATCH 1/2] fix: clamp Bedrock thinking effort to the
 LiteLLM-accepted ceiling (#737)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

BenchFlow accepts BENCHFLOW_BEDROCK_THINKING_EFFORT values `xhigh`/`max`, but
LiteLLM 1.88.0rc1's Bedrock Converse transform rejects them for adaptive Claude
4.8+ with a BadRequestError. So the #598 "MAX thinking" config errored at
request time instead of running at the highest supported effort.

Clamp a requested effort to the accepted ceiling (`high`) in both injection
paths — the route config (litellm_config) and the standalone proxy patch
(litellm_bedrock_patch, which cannot import benchflow, so it carries its own
in-sync copy of the ladder). This matches the existing garbage->high default:
a MAX-thinking run now proceeds at the real maximum rather than crashing.

Tests:
- clamp asserted on the route param (xhigh/max -> high) and updated the two
  pre-existing tests that encoded the bug (effort `max` threaded through);
- proxy-env override `max` clamps in the real wire payload without raising;
- drift guard: every requestable effort, after the clamp, is accepted by the
  REAL litellm Converse transform — if a future litellm changes its accepted
  set this fails loudly instead of shipping a rejected value mid-run.
Both clamps verified to fail-then-pass when individually neutered.

Closes #737

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../providers/litellm_bedrock_patch.py        | 20 ++++++-
 src/benchflow/providers/litellm_config.py     | 21 +++++++-
 tests/test_bedrock_thinking.py                | 54 ++++++++++++++++++-
 tests/test_litellm_config.py                  |  5 +-
 4 files changed, 94 insertions(+), 6 deletions(-)

diff --git a/src/benchflow/providers/litellm_bedrock_patch.py b/src/benchflow/providers/litellm_bedrock_patch.py
index b98f2c03..5ef6a1f6 100644
--- a/src/benchflow/providers/litellm_bedrock_patch.py
+++ b/src/benchflow/providers/litellm_bedrock_patch.py
@@ -18,7 +18,22 @@
     r"claude-(?:(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)|fable-5(?!\d))"
 )
 
-_VALID_EFFORTS = {"minimal", "low", "medium", "high", "xhigh", "max"}
+# Requested efforts low→high. LiteLLM 1.88.0rc1 only accepts up to ``high`` for
+# adaptive Claude 4.8+ and raises on ``xhigh``/``max`` (#737), so an override is
+# clamped to the accepted ceiling before being handed to the transform. This is
+# the standalone (sandbox-deployed) copy of the ladder in ``litellm_config``;
+# keep the two in sync.
+_EFFORT_LADDER = ("minimal", "low", "medium", "high", "xhigh", "max")
+_VALID_EFFORTS = set(_EFFORT_LADDER)
+_LITELLM_MAX_EFFORT = "high"
+
+
+def _clamp_effort(effort: str) -> str:
+    if effort not in _EFFORT_LADDER:
+        return effort
+    return _EFFORT_LADDER[
+        min(_EFFORT_LADDER.index(effort), _EFFORT_LADDER.index(_LITELLM_MAX_EFFORT))
+    ]
 
 
 def _is_new_bedrock_claude(model: str | None) -> bool:
@@ -68,6 +83,9 @@ def handle(
             override = os.environ.get(BEDROCK_THINKING_EFFORT_ENV, "").strip().lower()
             if override in _VALID_EFFORTS:
                 reasoning_effort = override
+            # Clamp xhigh/max (which litellm rejects) to the accepted ceiling so
+            # the request runs at the real maximum instead of raising (#737).
+            reasoning_effort = _clamp_effort(reasoning_effort)
         return original(self, model, reasoning_effort, optional_params)
 
     # Marker for the fail-closed startup preflight (#602): lets the runtime
diff --git a/src/benchflow/providers/litellm_config.py b/src/benchflow/providers/litellm_config.py
index 870b6d5f..810dab37 100644
--- a/src/benchflow/providers/litellm_config.py
+++ b/src/benchflow/providers/litellm_config.py
@@ -68,7 +68,24 @@ def custom_cost_per_token(model: str) -> tuple[float, float] | None:
     r"claude-(?:(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)|fable-5(?!\d))",
     re.IGNORECASE,
 )
-_BEDROCK_THINKING_EFFORTS = {"minimal", "low", "medium", "high", "xhigh", "max"}
+# Efforts a user may *request*, low→high. LiteLLM 1.88.0rc1's Bedrock Converse
+# transform only accepts up to ``high`` for adaptive Claude 4.8+ and raises
+# BadRequestError on ``xhigh``/``max`` (#737), so requested values are clamped
+# to the accepted ceiling below before they reach the wire — a MAX-thinking
+# config then runs at the real maximum instead of erroring mid-run. Kept in
+# sync with the standalone proxy patch ``litellm_bedrock_patch`` (which cannot
+# import this module — it is deployed into the sandbox alone).
+_BEDROCK_EFFORT_LADDER = ("minimal", "low", "medium", "high", "xhigh", "max")
+_BEDROCK_THINKING_EFFORTS = set(_BEDROCK_EFFORT_LADDER)
+_BEDROCK_LITELLM_MAX_EFFORT = "high"
+
+
+def _clamp_bedrock_effort(effort: str) -> str:
+    """Clamp a requested effort to the highest LiteLLM-accepted rung (#737)."""
+    ladder = _BEDROCK_EFFORT_LADDER
+    if effort not in ladder:
+        return effort
+    return ladder[min(ladder.index(effort), ladder.index(_BEDROCK_LITELLM_MAX_EFFORT))]
 
 
 @dataclass(frozen=True)
@@ -148,7 +165,7 @@ def _bedrock_thinking_effort(model: str, env: dict[str, str]) -> str | None:
     effort = (env.get(BEDROCK_THINKING_EFFORT_ENV) or "high").strip().lower()
     if effort not in _BEDROCK_THINKING_EFFORTS:
         effort = "high"
-    return effort
+    return _clamp_bedrock_effort(effort)
 
 
 def _route_registered_provider(
diff --git a/tests/test_bedrock_thinking.py b/tests/test_bedrock_thinking.py
index 1061698f..27cf8b0f 100644
--- a/tests/test_bedrock_thinking.py
+++ b/tests/test_bedrock_thinking.py
@@ -42,12 +42,29 @@ def test_bedrock_thinking_effort_is_threaded_into_route_params():
         {
             "AWS_BEARER_TOKEN_BEDROCK": "token",
             "AWS_REGION": "us-west-2",
-            BEDROCK_THINKING_EFFORT_ENV: "max",
+            BEDROCK_THINKING_EFFORT_ENV: "medium",
         },
     )
 
     assert route.upstream_model == "bedrock/us.anthropic.claude-opus-4-8"
-    assert route.litellm_params["reasoning_effort"] == "max"
+    assert route.litellm_params["reasoning_effort"] == "medium"
+
+
+@pytest.mark.parametrize("requested", ["xhigh", "max"])
+def test_bedrock_effort_clamps_unsupported_to_litellm_ceiling(requested):
+    """Guards #737: LiteLLM rejects xhigh/max for opus-4-8 with a
+    BadRequestError, so BenchFlow clamps a requested xhigh/max down to the
+    accepted ceiling (`high`) — a MAX-thinking config runs at the real maximum
+    instead of erroring mid-run, matching the garbage->high default behavior."""
+    route = resolve_litellm_route(
+        "aws-bedrock/us.anthropic.claude-opus-4-8",
+        {
+            "AWS_BEARER_TOKEN_BEDROCK": "token",
+            "AWS_REGION": "us-west-2",
+            BEDROCK_THINKING_EFFORT_ENV: requested,
+        },
+    )
+    assert route.litellm_params["reasoning_effort"] == "high"
 
 
 def test_bedrock_fable5_thinking_effort_is_threaded_into_route_params():
@@ -183,3 +200,36 @@ def test_docker_and_daytona_resolve_identical_bedrock_effort_from_run_env(monkey
     )
     # And the host env being empty did not strip it — it came from the run env.
     assert BEDROCK_THINKING_EFFORT_ENV not in os.environ
+
+
+# --------------------------------------------------------------------------- #
+# Clamp end-to-end + drift guard (issue #737)                                 #
+# --------------------------------------------------------------------------- #
+
+
+def test_proxy_env_max_override_clamps_in_wire_payload(monkeypatch):
+    """#737 (patch path): a run-level `max` in the proxy process env is clamped
+    to the LiteLLM ceiling in the actual wire payload — it must NOT raise."""
+    monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False)
+    assert _wire_output_config_effort("high", env_override="max") == "high"
+
+
+def test_every_requestable_bedrock_effort_survives_real_litellm_transform(monkeypatch):
+    """Drift guard for #737: every effort a user may request must, after the
+    route clamp, be accepted by the REAL litellm Converse transform (no
+    BadRequestError). If a future litellm changes its accepted set, this fails
+    and forces _BEDROCK_LITELLM_MAX_EFFORT / the ladder to be revisited rather
+    than silently shipping a value the provider rejects mid-run.
+    """
+    from benchflow.providers.litellm_config import _BEDROCK_EFFORT_LADDER
+
+    monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False)
+    for requested in _BEDROCK_EFFORT_LADDER:
+        route = resolve_litellm_route(
+            "aws-bedrock/us.anthropic.claude-opus-4-8-20251101-v1:0",
+            {**_BEDROCK_BASE_ENV, BEDROCK_THINKING_EFFORT_ENV: requested},
+        )
+        clamped = route.litellm_params["reasoning_effort"]
+        # Must not raise — this is the whole point of the clamp.
+        effort = _wire_output_config_effort(clamped, env_override=None)
+        assert effort, f"{requested!r} -> {clamped!r} produced no effort"
diff --git a/tests/test_litellm_config.py b/tests/test_litellm_config.py
index 3d9d2eb5..a1310787 100644
--- a/tests/test_litellm_config.py
+++ b/tests/test_litellm_config.py
@@ -30,7 +30,10 @@ def test_bedrock_model_honors_max_thinking_effort_env():
         },
     )
 
-    assert route.litellm_params["reasoning_effort"] == "max"
+    # `max` is honored as "the highest supported effort": LiteLLM 1.88.0rc1
+    # rejects `max`/`xhigh` for opus-4-8, so BenchFlow clamps to the accepted
+    # ceiling `high` rather than erroring at request time (#737).
+    assert route.litellm_params["reasoning_effort"] == "high"
 
 
 def test_azure_openai_route_uses_resource_and_preview_version():

From be86095bc511d65d8e11e93d095e17028a91943e Mon Sep 17 00:00:00 2001
From: Bingran You <bingran.you@berkeley.edu>
Date: Mon, 15 Jun 2026 01:53:30 -0400
Subject: [PATCH 2/2] fix: scope Bedrock effort clamp to LiteLLM-limited models

---
 .../providers/litellm_bedrock_patch.py        | 22 ++++++++++----
 src/benchflow/providers/litellm_config.py     | 21 ++++++++-----
 tests/test_bedrock_thinking.py                | 30 ++++++++++++-------
 tests/test_litellm_config.py                  |  1 +
 4 files changed, 51 insertions(+), 23 deletions(-)

diff --git a/src/benchflow/providers/litellm_bedrock_patch.py b/src/benchflow/providers/litellm_bedrock_patch.py
index 5ef6a1f6..fc3cf3d6 100644
--- a/src/benchflow/providers/litellm_bedrock_patch.py
+++ b/src/benchflow/providers/litellm_bedrock_patch.py
@@ -17,12 +17,16 @@
 BEDROCK_ADAPTIVE_THINKING_RE = re.compile(
     r"claude-(?:(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)|fable-5(?!\d))"
 )
+BEDROCK_LITELLM_EFFORT_LIMIT_RE = re.compile(
+    r"claude-(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)"
+)
 
 # Requested efforts low→high. LiteLLM 1.88.0rc1 only accepts up to ``high`` for
-# adaptive Claude 4.8+ and raises on ``xhigh``/``max`` (#737), so an override is
-# clamped to the accepted ceiling before being handed to the transform. This is
-# the standalone (sandbox-deployed) copy of the ladder in ``litellm_config``;
-# keep the two in sync.
+# the Claude 4.x Bedrock IDs matched by ``BEDROCK_LITELLM_EFFORT_LIMIT_RE`` and
+# raises on ``xhigh``/``max`` (#737), so those overrides are clamped before being
+# handed to the transform. Other adaptive-thinking Bedrock models, such as Fable
+# 5, keep their requested effort. This is the standalone (sandbox-deployed) copy
+# of the ladder in ``litellm_config``; keep the two in sync.
 _EFFORT_LADDER = ("minimal", "low", "medium", "high", "xhigh", "max")
 _VALID_EFFORTS = set(_EFFORT_LADDER)
 _LITELLM_MAX_EFFORT = "high"
@@ -43,6 +47,13 @@ def _is_new_bedrock_claude(model: str | None) -> bool:
         return False
 
 
+def _should_clamp_effort(model: str | None) -> bool:
+    try:
+        return bool(model and BEDROCK_LITELLM_EFFORT_LIMIT_RE.search(model.lower()))
+    except Exception:
+        return False
+
+
 def _patch_anthropic_gate() -> None:
     try:
         from litellm.llms.anthropic.chat.transformation import AnthropicConfig
@@ -85,7 +96,8 @@ def handle(
                 reasoning_effort = override
             # Clamp xhigh/max (which litellm rejects) to the accepted ceiling so
             # the request runs at the real maximum instead of raising (#737).
-            reasoning_effort = _clamp_effort(reasoning_effort)
+            if _should_clamp_effort(model):
+                reasoning_effort = _clamp_effort(reasoning_effort)
         return original(self, model, reasoning_effort, optional_params)
 
     # Marker for the fail-closed startup preflight (#602): lets the runtime
diff --git a/src/benchflow/providers/litellm_config.py b/src/benchflow/providers/litellm_config.py
index 810dab37..c9ef3f46 100644
--- a/src/benchflow/providers/litellm_config.py
+++ b/src/benchflow/providers/litellm_config.py
@@ -68,13 +68,18 @@ def custom_cost_per_token(model: str) -> tuple[float, float] | None:
     r"claude-(?:(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)|fable-5(?!\d))",
     re.IGNORECASE,
 )
+_BEDROCK_LITELLM_EFFORT_LIMIT_RE = re.compile(
+    r"claude-(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)",
+    re.IGNORECASE,
+)
 # Efforts a user may *request*, low→high. LiteLLM 1.88.0rc1's Bedrock Converse
-# transform only accepts up to ``high`` for adaptive Claude 4.8+ and raises
-# BadRequestError on ``xhigh``/``max`` (#737), so requested values are clamped
-# to the accepted ceiling below before they reach the wire — a MAX-thinking
-# config then runs at the real maximum instead of erroring mid-run. Kept in
-# sync with the standalone proxy patch ``litellm_bedrock_patch`` (which cannot
-# import this module — it is deployed into the sandbox alone).
+# transform only accepts up to ``high`` for the Claude 4.x Bedrock IDs covered
+# by ``_BEDROCK_LITELLM_EFFORT_LIMIT_RE`` and raises BadRequestError on
+# ``xhigh``/``max`` (#737), so those requested values are clamped to the
+# accepted ceiling below before they reach the wire. Other adaptive-thinking
+# Bedrock models, such as Fable 5, keep their requested effort. Kept in sync
+# with the standalone proxy patch ``litellm_bedrock_patch`` (which cannot import
+# this module — it is deployed into the sandbox alone).
 _BEDROCK_EFFORT_LADDER = ("minimal", "low", "medium", "high", "xhigh", "max")
 _BEDROCK_THINKING_EFFORTS = set(_BEDROCK_EFFORT_LADDER)
 _BEDROCK_LITELLM_MAX_EFFORT = "high"
@@ -165,7 +170,9 @@ def _bedrock_thinking_effort(model: str, env: dict[str, str]) -> str | None:
     effort = (env.get(BEDROCK_THINKING_EFFORT_ENV) or "high").strip().lower()
     if effort not in _BEDROCK_THINKING_EFFORTS:
         effort = "high"
-    return _clamp_bedrock_effort(effort)
+    if _BEDROCK_LITELLM_EFFORT_LIMIT_RE.search(model):
+        return _clamp_bedrock_effort(effort)
+    return effort
 
 
 def _route_registered_provider(
diff --git a/tests/test_bedrock_thinking.py b/tests/test_bedrock_thinking.py
index 27cf8b0f..88ee550a 100644
--- a/tests/test_bedrock_thinking.py
+++ b/tests/test_bedrock_thinking.py
@@ -52,10 +52,13 @@ def test_bedrock_thinking_effort_is_threaded_into_route_params():
 
 @pytest.mark.parametrize("requested", ["xhigh", "max"])
 def test_bedrock_effort_clamps_unsupported_to_litellm_ceiling(requested):
-    """Guards #737: LiteLLM rejects xhigh/max for opus-4-8 with a
-    BadRequestError, so BenchFlow clamps a requested xhigh/max down to the
-    accepted ceiling (`high`) — a MAX-thinking config runs at the real maximum
-    instead of erroring mid-run, matching the garbage->high default behavior."""
+    """Guards PR #739 against #737's unsupported Bedrock effort regression.
+
+    LiteLLM rejects xhigh/max for opus-4-8 with a BadRequestError, so BenchFlow
+    clamps a requested xhigh/max down to the accepted ceiling (`high`) -- a
+    MAX-thinking config runs at the real maximum instead of erroring mid-run,
+    matching the garbage->high default behavior.
+    """
     route = resolve_litellm_route(
         "aws-bedrock/us.anthropic.claude-opus-4-8",
         {
@@ -208,18 +211,23 @@ def test_docker_and_daytona_resolve_identical_bedrock_effort_from_run_env(monkey
 
 
 def test_proxy_env_max_override_clamps_in_wire_payload(monkeypatch):
-    """#737 (patch path): a run-level `max` in the proxy process env is clamped
-    to the LiteLLM ceiling in the actual wire payload — it must NOT raise."""
+    """Guards PR #739 against #737's proxy-process env clamp regression.
+
+    A run-level `max` in the proxy process env is clamped to the LiteLLM ceiling
+    in the actual wire payload; it must not raise.
+    """
     monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False)
     assert _wire_output_config_effort("high", env_override="max") == "high"
 
 
 def test_every_requestable_bedrock_effort_survives_real_litellm_transform(monkeypatch):
-    """Drift guard for #737: every effort a user may request must, after the
-    route clamp, be accepted by the REAL litellm Converse transform (no
-    BadRequestError). If a future litellm changes its accepted set, this fails
-    and forces _BEDROCK_LITELLM_MAX_EFFORT / the ladder to be revisited rather
-    than silently shipping a value the provider rejects mid-run.
+    """Guards PR #739 against #737's accepted-effort drift regression.
+
+    Every effort a user may request must, after the route clamp, be accepted by
+    the REAL litellm Converse transform (no BadRequestError). If a future
+    litellm changes its accepted set, this fails and forces
+    _BEDROCK_LITELLM_MAX_EFFORT / the ladder to be revisited rather than
+    silently shipping a value the provider rejects mid-run.
     """
     from benchflow.providers.litellm_config import _BEDROCK_EFFORT_LADDER
 
diff --git a/tests/test_litellm_config.py b/tests/test_litellm_config.py
index a1310787..bca13588 100644
--- a/tests/test_litellm_config.py
+++ b/tests/test_litellm_config.py
@@ -21,6 +21,7 @@ def test_bedrock_model_maps_to_litellm_bedrock_route():
 
 
 def test_bedrock_model_honors_max_thinking_effort_env():
+    """Guards PR #739 against #737's route-config effort ceiling regression."""
     route = resolve_litellm_route(
         "aws-bedrock/us.anthropic.claude-opus-4-8",
         {