From b4012401cd8c9c422b132d219173ff6a7924ddfe Mon Sep 17 00:00:00 2001 From: Kenny Wong Date: Fri, 29 May 2026 07:56:54 -0400 Subject: [PATCH 1/3] [OAI] Allow forcing Responses API for non-gpt-5 model names Proxy/internal setups can serve a GPT-5 model under a name that doesn't start with "gpt-5", so the name-based isGPT5Model() check alone can't route them to the Responses API. Add a per-call use_responses_api / useResponsesApi flag (camelCase at the scorer layer, snake_case in CachedLLMParams) so callers can force it; the flag is stripped before the request is sent. --- js/llm.test.ts | 155 +++++++++++++++++++++++++++++++++++++++ js/llm.ts | 14 ++++ js/oai.ts | 28 +++++-- py/autoevals/llm.py | 6 ++ py/autoevals/oai.py | 8 +- py/autoevals/test_llm.py | 49 +++++++++++++ 6 files changed, 253 insertions(+), 7 deletions(-) diff --git a/js/llm.test.ts b/js/llm.test.ts index 7390d44..0c9ee7d 100644 --- a/js/llm.test.ts +++ b/js/llm.test.ts @@ -338,6 +338,161 @@ Issue Description: {{page_content}} expect(capturedRequestBody.max_tokens).toBeUndefined(); }); + test("useResponsesApi forces the Responses API for a non-gpt-5 model", async () => { + let responsesHit = false; + let chatCompletionsHit = false; + + server.use( + http.post("https://api.openai.com/v1/responses", async ({ request }) => { + responsesHit = true; + const body = (await request.json()) as any; + // The control flag must be stripped before reaching the API. + expect(body.use_responses_api).toBeUndefined(); + expect(body.useResponsesApi).toBeUndefined(); + return HttpResponse.json({ + id: "resp-test", + object: "response", + created: 1234567890, + model: body.model, + output: [ + { + type: "function_call", + call_id: "call_test", + name: "select_choice", + arguments: JSON.stringify({ choice: "1" }), + }, + ], + }); + }), + http.post( + "https://api.openai.com/v1/chat/completions", + async ({ request }) => { + chatCompletionsHit = true; + const body = (await request.json()) as any; + return HttpResponse.json({ + id: "chatcmpl-test", + object: "chat.completion", + created: 1234567890, + model: body.model, + choices: [ + { + index: 0, + message: { + role: "assistant", + content: null, + tool_calls: [ + { + id: "call_test", + type: "function", + function: { + name: "select_choice", + arguments: JSON.stringify({ choice: "1" }), + }, + }, + ], + }, + finish_reason: "stop", + }, + ], + }); + }, + ), + ); + + init({ + client: new OpenAI({ + apiKey: "test-api-key", + baseURL: "https://api.openai.com/v1", + }), + }); + + const classifier = LLMClassifierFromTemplate({ + name: "test", + promptTemplate: "Test prompt: {{output}} vs {{expected}}", + choiceScores: { "1": 1, "2": 0 }, + }); + + // A proxy-served model that does NOT start with "gpt-5". + const result = await classifier({ + output: "test output", + expected: "test expected", + model: "internal-proxy-model", + useResponsesApi: true, + }); + + expect(result.error).toBeUndefined(); + expect(responsesHit).toBe(true); + expect(chatCompletionsHit).toBe(false); + }); + + test("non-gpt-5 model uses Chat Completions when useResponsesApi is not set", async () => { + let responsesHit = false; + let chatCompletionsHit = false; + + server.use( + http.post("https://api.openai.com/v1/responses", async () => { + responsesHit = true; + return HttpResponse.json({}); + }), + http.post( + "https://api.openai.com/v1/chat/completions", + async ({ request }) => { + chatCompletionsHit = true; + const body = (await request.json()) as any; + return HttpResponse.json({ + id: "chatcmpl-test", + object: "chat.completion", + created: 1234567890, + model: body.model, + choices: [ + { + index: 0, + message: { + role: "assistant", + content: null, + tool_calls: [ + { + id: "call_test", + type: "function", + function: { + name: "select_choice", + arguments: JSON.stringify({ choice: "1" }), + }, + }, + ], + }, + finish_reason: "stop", + }, + ], + }); + }, + ), + ); + + init({ + client: new OpenAI({ + apiKey: "test-api-key", + baseURL: "https://api.openai.com/v1", + }), + }); + + const classifier = LLMClassifierFromTemplate({ + name: "test", + promptTemplate: "Test prompt: {{output}} vs {{expected}}", + choiceScores: { "1": 1, "2": 0 }, + }); + + const result = await classifier({ + output: "test output", + expected: "test expected", + model: "gpt-4o-mini", + }); + + expect(result.error).toBeUndefined(); + expect(chatCompletionsHit).toBe(true); + expect(responsesHit).toBe(false); + }); + test("LLMClassifierFromTemplate uses configured default model", async () => { let capturedModel: string | undefined; diff --git a/js/llm.ts b/js/llm.ts index bb50b30..2826fab 100644 --- a/js/llm.ts +++ b/js/llm.ts @@ -73,6 +73,12 @@ export type LLMArgs = { reasoningEffort?: ReasoningEffort; reasoningEnabled?: boolean; reasoningBudget?: number; + /** + * Force the request to use the Responses API, even when the model name does + * not start with "gpt-5". Useful for proxy/internal setups that serve a + * Responses-only model under a non-matching name. + */ + useResponsesApi?: boolean; } & OpenAIAuth; /** @@ -166,6 +172,7 @@ export async function OpenAIClassifier( reasoningEffort, reasoningEnabled, reasoningBudget, + useResponsesApi, cache, ...remainingRenderArgs } = remaining; @@ -176,6 +183,7 @@ export async function OpenAIClassifier( reasoning_effort?: ReasoningEffort; reasoning_enabled?: boolean; reasoning_budget?: number; + use_responses_api?: boolean; } = {}; if (temperature !== undefined) { extraArgs.temperature = temperature; @@ -192,6 +200,9 @@ export async function OpenAIClassifier( if (reasoningBudget !== undefined) { extraArgs.reasoning_budget = reasoningBudget; } + if (useResponsesApi !== undefined) { + extraArgs.use_responses_api = useResponsesApi; + } const renderArgs = { output, @@ -293,6 +304,7 @@ export function LLMClassifierFromTemplate({ reasoningEffort, reasoningEnabled, reasoningBudget, + useResponsesApi, }: { name: string; promptTemplate: string; @@ -304,6 +316,7 @@ export function LLMClassifierFromTemplate({ reasoningEffort?: ReasoningEffort; reasoningEnabled?: boolean; reasoningBudget?: number; + useResponsesApi?: boolean; }): Scorer> { const choiceStrings = Object.keys(choiceScores); const ret = async ( @@ -352,6 +365,7 @@ export function LLMClassifierFromTemplate({ reasoningEffort, reasoningEnabled, reasoningBudget, + useResponsesApi, __choices: choiceStrings, // Thread template vars come first so explicit args can override ...threadVars, diff --git a/js/oai.ts b/js/oai.ts index 3161467..bd29315 100644 --- a/js/oai.ts +++ b/js/oai.ts @@ -19,6 +19,12 @@ export interface CachedLLMParams { temperature?: number; max_tokens?: number; reasoning_effort?: ReasoningEffort; + /** + * Force the request to use the Responses API, even when the model name does + * not start with "gpt-5". Useful for proxy/internal setups that serve a + * Responses-only model under a name that doesn't match {@link isGPT5Model}. + */ + use_responses_api?: boolean; span_info?: { spanAttributes?: Record; }; @@ -295,26 +301,38 @@ function isGPT5Model(model: string): boolean { return model.startsWith("gpt-5"); } +/** + * Whether to route the request through the Responses API. GPT-5 models require + * it, and callers can force it via `useResponsesApi` for proxy/internal setups + * that serve a Responses-only model under a name that doesn't start with "gpt-5". + */ +function isForcedResponsesMode(params: CachedLLMParams): boolean { + return isGPT5Model(params.model) || params.use_responses_api === true; +} + export async function cachedChatCompletion( params: CachedLLMParams, options: { cache?: ChatCache } & OpenAIAuth, ): Promise { const openai = buildOpenAIClient(options); + // Strip use_responses_api so it is never forwarded to either API. + const { use_responses_api: _useResponsesApi, ...completionParams } = params; + const fullParams = globalThis.__inherited_braintrust_wrap_openai ? { - ...params, + ...completionParams, span_info: { spanAttributes: { - ...params.span_info?.spanAttributes, + ...completionParams.span_info?.spanAttributes, purpose: "scorer", }, }, } - : params; + : completionParams; - // GPT-5 models require the Responses API - if (isGPT5Model(params.model)) { + // GPT-5 models require the Responses API; callers may also force it. + if (isForcedResponsesMode(params)) { // Convert Chat Completions API params to Responses API params const responsesParams: any = { model: fullParams.model, diff --git a/py/autoevals/llm.py b/py/autoevals/llm.py index e12baea..e79a45e 100644 --- a/py/autoevals/llm.py +++ b/py/autoevals/llm.py @@ -180,6 +180,7 @@ def __init__( reasoning_effort=None, reasoning_enabled=None, reasoning_budget=None, + use_responses_api=None, engine=None, api_key=None, base_url=None, @@ -210,6 +211,9 @@ def __init__( if reasoning_budget is not None: self.extra_args["reasoning_budget"] = reasoning_budget + if use_responses_api is not None: + self.extra_args["use_responses_api"] = use_responses_api + self.render_args = {} if render_args: self.render_args.update(render_args) @@ -366,6 +370,7 @@ def __init__( reasoning_effort=None, reasoning_enabled=None, reasoning_budget=None, + use_responses_api=None, engine=None, api_key=None, base_url=None, @@ -397,6 +402,7 @@ def __init__( reasoning_effort=reasoning_effort, reasoning_enabled=reasoning_enabled, reasoning_budget=reasoning_budget, + use_responses_api=use_responses_api, engine=engine, api_key=api_key, base_url=base_url, diff --git a/py/autoevals/oai.py b/py/autoevals/oai.py index fa21554..56b5ed2 100644 --- a/py/autoevals/oai.py +++ b/py/autoevals/oai.py @@ -320,7 +320,9 @@ def prepare_responses_params(kwargs: dict[str, Any]) -> dict[str, Any]: async def complete_wrapper(**kwargs: Any) -> Any: model = kwargs.get("model", "") - if is_gpt5_model(model): + # Strip use_responses_api so it is never forwarded to either API. + use_responses_api = kwargs.pop("use_responses_api", False) + if is_gpt5_model(model) or use_responses_api: responses_params = prepare_responses_params(kwargs) response = await responses_create(**responses_params) return convert_responses_to_chat_completion(response) @@ -330,7 +332,9 @@ async def complete_wrapper(**kwargs: Any) -> Any: def complete_wrapper(**kwargs: Any) -> Any: model = kwargs.get("model", "") - if is_gpt5_model(model): + # Strip use_responses_api so it is never forwarded to either API. + use_responses_api = kwargs.pop("use_responses_api", False) + if is_gpt5_model(model) or use_responses_api: responses_params = prepare_responses_params(kwargs) response = responses_create(**responses_params) return convert_responses_to_chat_completion(response) diff --git a/py/autoevals/test_llm.py b/py/autoevals/test_llm.py index ba97b94..b37c395 100644 --- a/py/autoevals/test_llm.py +++ b/py/autoevals/test_llm.py @@ -489,6 +489,55 @@ def capture_model(request): init(None) +@respx.mock +def test_use_responses_api_forces_responses_for_non_gpt5_model(): + """use_responses_api should route a non-gpt-5 model through the Responses API.""" + responses_route = respx.route(method="POST", path__regex=r".*/responses$").mock( + return_value=Response( + 200, + json={ + "id": "resp-test", + "object": "response", + "created": 1234567890, + "model": "internal-proxy-model", + "output": [ + { + "type": "function_call", + "call_id": "call_test", + "name": "select_choice", + "arguments": '{"choice": "1"}', + } + ], + }, + ) + ) + chat_route = respx.route(method="POST", path__regex=r".*/chat/completions$").mock( + return_value=Response(200, json={}) + ) + + client = OpenAI(api_key="test-api-key", base_url="https://api.openai.com/v1") + init(client) + + classifier = LLMClassifier( + name="test", + prompt_template="Test prompt: {{output}}", + choice_scores={"1": 1, "2": 0}, + model="internal-proxy-model", + use_responses_api=True, + ) + + result = classifier.eval(output="test output", expected="test expected") + + assert result.score == 1 + assert responses_route.called + assert not chat_route.called + # use_responses_api must be stripped before reaching the API. + body = json.loads(responses_route.calls[0].request.content.decode("utf-8")) + assert "use_responses_api" not in body + + init(None) + + @respx.mock def test_llm_classifier_injects_thread_vars_from_trace(): captured_request_body = None From ad9d5a54b028f1a2a2a54adcbc41fb356c352611 Mon Sep 17 00:00:00 2001 From: Kenny Wong Date: Fri, 29 May 2026 08:05:56 -0400 Subject: [PATCH 2/3] [OAI] Thread use_responses_api through built-in named scorers SpecFileClassifier.__new__ has a fixed kwarg list, so Factuality(use_responses_api=True) and the other named scorers raised TypeError. Forward the flag like the other model knobs. --- py/autoevals/llm.py | 3 +++ py/autoevals/test_llm.py | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/py/autoevals/llm.py b/py/autoevals/llm.py index e79a45e..a027d42 100644 --- a/py/autoevals/llm.py +++ b/py/autoevals/llm.py @@ -504,6 +504,7 @@ def __new__( use_cot=None, max_tokens=None, temperature=None, + use_responses_api=None, api_key=None, base_url=None, client: Client | None = None, @@ -519,6 +520,8 @@ def __new__( kwargs["max_tokens"] = max_tokens if temperature is not None: kwargs["temperature"] = temperature + if use_responses_api is not None: + kwargs["use_responses_api"] = use_responses_api if api_key is not None: kwargs["api_key"] = api_key if base_url is not None: diff --git a/py/autoevals/test_llm.py b/py/autoevals/test_llm.py index b37c395..ddc0a9e 100644 --- a/py/autoevals/test_llm.py +++ b/py/autoevals/test_llm.py @@ -538,6 +538,45 @@ def test_use_responses_api_forces_responses_for_non_gpt5_model(): init(None) +@respx.mock +def test_use_responses_api_on_builtin_scorer(): + """Built-in named scorers (SpecFileClassifier) should accept use_responses_api.""" + responses_route = respx.route(method="POST", path__regex=r".*/responses$").mock( + return_value=Response( + 200, + json={ + "id": "resp-test", + "object": "response", + "created": 1234567890, + "model": "gpt-4.1", + "output": [ + { + "type": "function_call", + "call_id": "call_test", + "name": "select_choice", + "arguments": '{"reasons": "same", "choice": "C"}', + } + ], + }, + ) + ) + chat_route = respx.route(method="POST", path__regex=r".*/chat/completions$").mock( + return_value=Response(200, json={}) + ) + + init(OpenAI(api_key="test-api-key", base_url="https://api.openai.com/v1")) + + result = Factuality(model="gpt-4.1", use_responses_api=True).eval( + output="6", expected="6", input="Add the numbers 1, 2, 3" + ) + + assert result.score == 1 + assert responses_route.called + assert not chat_route.called + + init(None) + + @respx.mock def test_llm_classifier_injects_thread_vars_from_trace(): captured_request_body = None From a780f0aaaccee81d9f317bedc18bc65a00f21af6 Mon Sep 17 00:00:00 2001 From: Kenny Wong Date: Fri, 29 May 2026 08:19:50 -0400 Subject: [PATCH 3/3] [OAI] Map reasoning_effort to reasoning.effort for the Responses API The Responses API rejects a top-level reasoning_effort param ("moved to reasoning.effort"), so reasoning calls routed to it 400'd. Nest it correctly in both languages. --- js/llm.test.ts | 4 ++++ js/oai.ts | 3 ++- py/autoevals/oai.py | 8 +++++--- py/autoevals/test_llm.py | 6 +++++- 4 files changed, 16 insertions(+), 5 deletions(-) diff --git a/js/llm.test.ts b/js/llm.test.ts index 0c9ee7d..2b1b18b 100644 --- a/js/llm.test.ts +++ b/js/llm.test.ts @@ -329,6 +329,7 @@ Issue Description: {{page_content}} choiceScores: { "1": 1, "2": 0 }, maxTokens: 256, temperature: 0.5, + reasoningEffort: "medium", }); await classifier({ output: "test output", expected: "test expected" }); @@ -336,6 +337,9 @@ Issue Description: {{page_content}} // Verify that temperature is in the request (max_tokens not supported by Responses API) expect(capturedRequestBody.temperature).toBe(0.5); expect(capturedRequestBody.max_tokens).toBeUndefined(); + // The Responses API nests reasoning effort under reasoning.effort. + expect(capturedRequestBody.reasoning).toEqual({ effort: "medium" }); + expect(capturedRequestBody.reasoning_effort).toBeUndefined(); }); test("useResponsesApi forces the Responses API for a non-gpt-5 model", async () => { diff --git a/js/oai.ts b/js/oai.ts index bd29315..1515718 100644 --- a/js/oai.ts +++ b/js/oai.ts @@ -380,7 +380,8 @@ export async function cachedChatCompletion( } // Note: max_tokens is not supported by Responses API if (fullParams.reasoning_effort) { - responsesParams.reasoning_effort = fullParams.reasoning_effort; + // The Responses API nests this under reasoning.effort, unlike Chat Completions. + responsesParams.reasoning = { effort: fullParams.reasoning_effort }; } const response: any = await openai.responses.create(responsesParams); diff --git a/py/autoevals/oai.py b/py/autoevals/oai.py index 56b5ed2..2ccb50f 100644 --- a/py/autoevals/oai.py +++ b/py/autoevals/oai.py @@ -310,9 +310,11 @@ def prepare_responses_params(kwargs: dict[str, Any]) -> dict[str, Any]: responses_params["tool_choice"] = "required" # Copy supported parameters - for key in ["temperature", "reasoning_effort"]: - if key in kwargs: - responses_params[key] = kwargs[key] + if "temperature" in kwargs: + responses_params["temperature"] = kwargs["temperature"] + # The Responses API nests this under reasoning.effort, unlike Chat Completions. + if "reasoning_effort" in kwargs: + responses_params["reasoning"] = {"effort": kwargs["reasoning_effort"]} return responses_params diff --git a/py/autoevals/test_llm.py b/py/autoevals/test_llm.py index ddc0a9e..a1c2e01 100644 --- a/py/autoevals/test_llm.py +++ b/py/autoevals/test_llm.py @@ -396,13 +396,14 @@ def capture_request(request): client = OpenAI(api_key="test-api-key", base_url="https://api.openai.com/v1") init(client) - # Create classifier with max_tokens and temperature specified + # Create classifier with max_tokens, temperature and reasoning_effort specified classifier = LLMClassifier( "test", "Test prompt: {{output}} vs {{expected}}", {"1": 1, "2": 0}, max_tokens=256, temperature=0.5, + reasoning_effort="medium", ) classifier.eval(output="test output", expected="test expected") @@ -410,6 +411,9 @@ def capture_request(request): # Verify that temperature is in the request with correct value (max_tokens not supported by Responses API) assert captured_request_body["temperature"] == 0.5 assert "max_tokens" not in captured_request_body + # The Responses API nests reasoning effort under reasoning.effort. + assert captured_request_body["reasoning"] == {"effort": "medium"} + assert "reasoning_effort" not in captured_request_body @respx.mock