Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 31 additions & 1 deletion src/benchflow/providers/litellm_bedrock_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,27 @@
BEDROCK_ADAPTIVE_THINKING_RE = re.compile(
r"claude-(?:(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)|fable-5(?!\d))"
)
BEDROCK_LITELLM_EFFORT_LIMIT_RE = re.compile(
r"claude-(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)"
)

# Requested efforts low→high. LiteLLM 1.88.0rc1 only accepts up to ``high`` for
# the Claude 4.x Bedrock IDs matched by ``BEDROCK_LITELLM_EFFORT_LIMIT_RE`` and
# raises on ``xhigh``/``max`` (#737), so those overrides are clamped before being
# handed to the transform. Other adaptive-thinking Bedrock models, such as Fable
# 5, keep their requested effort. This is the standalone (sandbox-deployed) copy
# of the ladder in ``litellm_config``; keep the two in sync.
_EFFORT_LADDER = ("minimal", "low", "medium", "high", "xhigh", "max")
_VALID_EFFORTS = set(_EFFORT_LADDER)
_LITELLM_MAX_EFFORT = "high"

_VALID_EFFORTS = {"minimal", "low", "medium", "high", "xhigh", "max"}

def _clamp_effort(effort: str) -> str:
if effort not in _EFFORT_LADDER:
return effort
return _EFFORT_LADDER[
min(_EFFORT_LADDER.index(effort), _EFFORT_LADDER.index(_LITELLM_MAX_EFFORT))
]


def _is_new_bedrock_claude(model: str | None) -> bool:
Expand All @@ -28,6 +47,13 @@ def _is_new_bedrock_claude(model: str | None) -> bool:
return False


def _should_clamp_effort(model: str | None) -> bool:
try:
return bool(model and BEDROCK_LITELLM_EFFORT_LIMIT_RE.search(model.lower()))
except Exception:
return False


def _patch_anthropic_gate() -> None:
try:
from litellm.llms.anthropic.chat.transformation import AnthropicConfig
Expand Down Expand Up @@ -68,6 +94,10 @@ def handle(
override = os.environ.get(BEDROCK_THINKING_EFFORT_ENV, "").strip().lower()
if override in _VALID_EFFORTS:
reasoning_effort = override
# Clamp xhigh/max (which litellm rejects) to the accepted ceiling so
# the request runs at the real maximum instead of raising (#737).
if _should_clamp_effort(model):
reasoning_effort = _clamp_effort(reasoning_effort)
return original(self, model, reasoning_effort, optional_params)

# Marker for the fail-closed startup preflight (#602): lets the runtime
Expand Down
26 changes: 25 additions & 1 deletion src/benchflow/providers/litellm_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,29 @@ def custom_cost_per_token(model: str) -> tuple[float, float] | None:
r"claude-(?:(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)|fable-5(?!\d))",
re.IGNORECASE,
)
_BEDROCK_THINKING_EFFORTS = {"minimal", "low", "medium", "high", "xhigh", "max"}
_BEDROCK_LITELLM_EFFORT_LIMIT_RE = re.compile(
r"claude-(?:opus|sonnet|haiku)-4-(?:8|9|1\d)(?!\d)",
re.IGNORECASE,
)
# Efforts a user may *request*, low→high. LiteLLM 1.88.0rc1's Bedrock Converse
# transform only accepts up to ``high`` for the Claude 4.x Bedrock IDs covered
# by ``_BEDROCK_LITELLM_EFFORT_LIMIT_RE`` and raises BadRequestError on
# ``xhigh``/``max`` (#737), so those requested values are clamped to the
# accepted ceiling below before they reach the wire. Other adaptive-thinking
# Bedrock models, such as Fable 5, keep their requested effort. Kept in sync
# with the standalone proxy patch ``litellm_bedrock_patch`` (which cannot import
# this module — it is deployed into the sandbox alone).
_BEDROCK_EFFORT_LADDER = ("minimal", "low", "medium", "high", "xhigh", "max")
_BEDROCK_THINKING_EFFORTS = set(_BEDROCK_EFFORT_LADDER)
_BEDROCK_LITELLM_MAX_EFFORT = "high"


def _clamp_bedrock_effort(effort: str) -> str:
"""Clamp a requested effort to the highest LiteLLM-accepted rung (#737)."""
ladder = _BEDROCK_EFFORT_LADDER
if effort not in ladder:
return effort
return ladder[min(ladder.index(effort), ladder.index(_BEDROCK_LITELLM_MAX_EFFORT))]


@dataclass(frozen=True)
Expand Down Expand Up @@ -148,6 +170,8 @@ def _bedrock_thinking_effort(model: str, env: dict[str, str]) -> str | None:
effort = (env.get(BEDROCK_THINKING_EFFORT_ENV) or "high").strip().lower()
if effort not in _BEDROCK_THINKING_EFFORTS:
effort = "high"
if _BEDROCK_LITELLM_EFFORT_LIMIT_RE.search(model):
return _clamp_bedrock_effort(effort)
return effort


Expand Down
62 changes: 60 additions & 2 deletions tests/test_bedrock_thinking.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,32 @@ def test_bedrock_thinking_effort_is_threaded_into_route_params():
{
"AWS_BEARER_TOKEN_BEDROCK": "token",
"AWS_REGION": "us-west-2",
BEDROCK_THINKING_EFFORT_ENV: "max",
BEDROCK_THINKING_EFFORT_ENV: "medium",
},
)

assert route.upstream_model == "bedrock/us.anthropic.claude-opus-4-8"
assert route.litellm_params["reasoning_effort"] == "max"
assert route.litellm_params["reasoning_effort"] == "medium"


@pytest.mark.parametrize("requested", ["xhigh", "max"])
def test_bedrock_effort_clamps_unsupported_to_litellm_ceiling(requested):
"""Guards PR #739 against #737's unsupported Bedrock effort regression.

LiteLLM rejects xhigh/max for opus-4-8 with a BadRequestError, so BenchFlow
clamps a requested xhigh/max down to the accepted ceiling (`high`) -- a
MAX-thinking config runs at the real maximum instead of erroring mid-run,
matching the garbage->high default behavior.
"""
route = resolve_litellm_route(
"aws-bedrock/us.anthropic.claude-opus-4-8",
{
"AWS_BEARER_TOKEN_BEDROCK": "token",
"AWS_REGION": "us-west-2",
BEDROCK_THINKING_EFFORT_ENV: requested,
},
)
assert route.litellm_params["reasoning_effort"] == "high"


def test_bedrock_fable5_thinking_effort_is_threaded_into_route_params():
Expand Down Expand Up @@ -183,3 +203,41 @@ def test_docker_and_daytona_resolve_identical_bedrock_effort_from_run_env(monkey
)
# And the host env being empty did not strip it — it came from the run env.
assert BEDROCK_THINKING_EFFORT_ENV not in os.environ


# --------------------------------------------------------------------------- #
# Clamp end-to-end + drift guard (issue #737) #
# --------------------------------------------------------------------------- #


def test_proxy_env_max_override_clamps_in_wire_payload(monkeypatch):
"""Guards PR #739 against #737's proxy-process env clamp regression.

A run-level `max` in the proxy process env is clamped to the LiteLLM ceiling
in the actual wire payload; it must not raise.
"""
monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False)
assert _wire_output_config_effort("high", env_override="max") == "high"


def test_every_requestable_bedrock_effort_survives_real_litellm_transform(monkeypatch):
"""Guards PR #739 against #737's accepted-effort drift regression.

Every effort a user may request must, after the route clamp, be accepted by
the REAL litellm Converse transform (no BadRequestError). If a future
litellm changes its accepted set, this fails and forces
_BEDROCK_LITELLM_MAX_EFFORT / the ladder to be revisited rather than
silently shipping a value the provider rejects mid-run.
"""
from benchflow.providers.litellm_config import _BEDROCK_EFFORT_LADDER

monkeypatch.delenv(BEDROCK_THINKING_EFFORT_ENV, raising=False)
for requested in _BEDROCK_EFFORT_LADDER:
route = resolve_litellm_route(
"aws-bedrock/us.anthropic.claude-opus-4-8-20251101-v1:0",
{**_BEDROCK_BASE_ENV, BEDROCK_THINKING_EFFORT_ENV: requested},
)
clamped = route.litellm_params["reasoning_effort"]
# Must not raise — this is the whole point of the clamp.
effort = _wire_output_config_effort(clamped, env_override=None)
assert effort, f"{requested!r} -> {clamped!r} produced no effort"
6 changes: 5 additions & 1 deletion tests/test_litellm_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ def test_bedrock_model_maps_to_litellm_bedrock_route():


def test_bedrock_model_honors_max_thinking_effort_env():
"""Guards PR #739 against #737's route-config effort ceiling regression."""
route = resolve_litellm_route(
"aws-bedrock/us.anthropic.claude-opus-4-8",
{
Expand All @@ -30,7 +31,10 @@ def test_bedrock_model_honors_max_thinking_effort_env():
},
)

assert route.litellm_params["reasoning_effort"] == "max"
# `max` is honored as "the highest supported effort": LiteLLM 1.88.0rc1
# rejects `max`/`xhigh` for opus-4-8, so BenchFlow clamps to the accepted
# ceiling `high` rather than erroring at request time (#737).
assert route.litellm_params["reasoning_effort"] == "high"


def test_azure_openai_route_uses_resource_and_preview_version():
Expand Down
Loading