From b4012401cd8c9c422b132d219173ff6a7924ddfe Mon Sep 17 00:00:00 2001
From: Kenny Wong <wong@coda.io>
Date: Fri, 29 May 2026 07:56:54 -0400
Subject: [PATCH 1/3] [OAI] Allow forcing Responses API for non-gpt-5 model
 names

Proxy/internal setups can serve a GPT-5 model under a name that doesn't
start with "gpt-5", so the name-based isGPT5Model() check alone can't route
it to the Responses API. Add a per-call use_responses_api / useResponsesApi
flag (camelCase at the scorer layer, snake_case in CachedLLMParams) so callers
can force that routing; the flag is stripped before the request is sent.
---
 js/llm.test.ts           | 155 +++++++++++++++++++++++++++++++++++++++
 js/llm.ts                |  14 ++++
 js/oai.ts                |  28 +++++--
 py/autoevals/llm.py      |   6 ++
 py/autoevals/oai.py      |   8 +-
 py/autoevals/test_llm.py |  49 +++++++++++++
 6 files changed, 253 insertions(+), 7 deletions(-)

diff --git a/js/llm.test.ts b/js/llm.test.ts
index 7390d44..0c9ee7d 100644
--- a/js/llm.test.ts
+++ b/js/llm.test.ts
@@ -338,6 +338,161 @@ Issue Description: {{page_content}}
     expect(capturedRequestBody.max_tokens).toBeUndefined();
   });
 
+  test("useResponsesApi forces the Responses API for a non-gpt-5 model", async () => {
+    let responsesHit = false;
+    let chatCompletionsHit = false;
+
+    server.use(
+      http.post("https://api.openai.com/v1/responses", async ({ request }) => {
+        responsesHit = true;
+        const body = (await request.json()) as any;
+        // The control flag must be stripped before reaching the API.
+        expect(body.use_responses_api).toBeUndefined();
+        expect(body.useResponsesApi).toBeUndefined();
+        return HttpResponse.json({
+          id: "resp-test",
+          object: "response",
+          created: 1234567890,
+          model: body.model,
+          output: [
+            {
+              type: "function_call",
+              call_id: "call_test",
+              name: "select_choice",
+              arguments: JSON.stringify({ choice: "1" }),
+            },
+          ],
+        });
+      }),
+      http.post(
+        "https://api.openai.com/v1/chat/completions",
+        async ({ request }) => {
+          chatCompletionsHit = true;
+          const body = (await request.json()) as any;
+          return HttpResponse.json({
+            id: "chatcmpl-test",
+            object: "chat.completion",
+            created: 1234567890,
+            model: body.model,
+            choices: [
+              {
+                index: 0,
+                message: {
+                  role: "assistant",
+                  content: null,
+                  tool_calls: [
+                    {
+                      id: "call_test",
+                      type: "function",
+                      function: {
+                        name: "select_choice",
+                        arguments: JSON.stringify({ choice: "1" }),
+                      },
+                    },
+                  ],
+                },
+                finish_reason: "stop",
+              },
+            ],
+          });
+        },
+      ),
+    );
+
+    init({
+      client: new OpenAI({
+        apiKey: "test-api-key",
+        baseURL: "https://api.openai.com/v1",
+      }),
+    });
+
+    const classifier = LLMClassifierFromTemplate({
+      name: "test",
+      promptTemplate: "Test prompt: {{output}} vs {{expected}}",
+      choiceScores: { "1": 1, "2": 0 },
+    });
+
+    // A proxy-served model that does NOT start with "gpt-5".
+    const result = await classifier({
+      output: "test output",
+      expected: "test expected",
+      model: "internal-proxy-model",
+      useResponsesApi: true,
+    });
+
+    expect(result.error).toBeUndefined();
+    expect(responsesHit).toBe(true);
+    expect(chatCompletionsHit).toBe(false);
+  });
+
+  test("non-gpt-5 model uses Chat Completions when useResponsesApi is not set", async () => {
+    let responsesHit = false;
+    let chatCompletionsHit = false;
+
+    server.use(
+      http.post("https://api.openai.com/v1/responses", async () => {
+        responsesHit = true;
+        return HttpResponse.json({});
+      }),
+      http.post(
+        "https://api.openai.com/v1/chat/completions",
+        async ({ request }) => {
+          chatCompletionsHit = true;
+          const body = (await request.json()) as any;
+          return HttpResponse.json({
+            id: "chatcmpl-test",
+            object: "chat.completion",
+            created: 1234567890,
+            model: body.model,
+            choices: [
+              {
+                index: 0,
+                message: {
+                  role: "assistant",
+                  content: null,
+                  tool_calls: [
+                    {
+                      id: "call_test",
+                      type: "function",
+                      function: {
+                        name: "select_choice",
+                        arguments: JSON.stringify({ choice: "1" }),
+                      },
+                    },
+                  ],
+                },
+                finish_reason: "stop",
+              },
+            ],
+          });
+        },
+      ),
+    );
+
+    init({
+      client: new OpenAI({
+        apiKey: "test-api-key",
+        baseURL: "https://api.openai.com/v1",
+      }),
+    });
+
+    const classifier = LLMClassifierFromTemplate({
+      name: "test",
+      promptTemplate: "Test prompt: {{output}} vs {{expected}}",
+      choiceScores: { "1": 1, "2": 0 },
+    });
+
+    const result = await classifier({
+      output: "test output",
+      expected: "test expected",
+      model: "gpt-4o-mini",
+    });
+
+    expect(result.error).toBeUndefined();
+    expect(chatCompletionsHit).toBe(true);
+    expect(responsesHit).toBe(false);
+  });
+
   test("LLMClassifierFromTemplate uses configured default model", async () => {
     let capturedModel: string | undefined;
 
diff --git a/js/llm.ts b/js/llm.ts
index bb50b30..2826fab 100644
--- a/js/llm.ts
+++ b/js/llm.ts
@@ -73,6 +73,12 @@ export type LLMArgs = {
   reasoningEffort?: ReasoningEffort;
   reasoningEnabled?: boolean;
   reasoningBudget?: number;
+  /**
+   * Force the request to use the Responses API, even when the model name does
+   * not start with "gpt-5". Useful for proxy/internal setups that serve a
+   * Responses-only model under a non-matching name.
+   */
+  useResponsesApi?: boolean;
 } & OpenAIAuth;
 
 /**
@@ -166,6 +172,7 @@ export async function OpenAIClassifier<RenderArgs, Output>(
     reasoningEffort,
     reasoningEnabled,
     reasoningBudget,
+    useResponsesApi,
     cache,
     ...remainingRenderArgs
   } = remaining;
@@ -176,6 +183,7 @@ export async function OpenAIClassifier<RenderArgs, Output>(
     reasoning_effort?: ReasoningEffort;
     reasoning_enabled?: boolean;
     reasoning_budget?: number;
+    use_responses_api?: boolean;
   } = {};
   if (temperature !== undefined) {
     extraArgs.temperature = temperature;
@@ -192,6 +200,9 @@ export async function OpenAIClassifier<RenderArgs, Output>(
   if (reasoningBudget !== undefined) {
     extraArgs.reasoning_budget = reasoningBudget;
   }
+  if (useResponsesApi !== undefined) {
+    extraArgs.use_responses_api = useResponsesApi;
+  }
 
   const renderArgs = {
     output,
@@ -293,6 +304,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
   reasoningEffort,
   reasoningEnabled,
   reasoningBudget,
+  useResponsesApi,
 }: {
   name: string;
   promptTemplate: string;
@@ -304,6 +316,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
   reasoningEffort?: ReasoningEffort;
   reasoningEnabled?: boolean;
   reasoningBudget?: number;
+  useResponsesApi?: boolean;
 }): Scorer<string, LLMClassifierArgs<RenderArgs>> {
   const choiceStrings = Object.keys(choiceScores);
   const ret = async (
@@ -352,6 +365,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
       reasoningEffort,
       reasoningEnabled,
       reasoningBudget,
+      useResponsesApi,
       __choices: choiceStrings,
       // Thread template vars come first so explicit args can override
       ...threadVars,
diff --git a/js/oai.ts b/js/oai.ts
index 3161467..bd29315 100644
--- a/js/oai.ts
+++ b/js/oai.ts
@@ -19,6 +19,12 @@ export interface CachedLLMParams {
   temperature?: number;
   max_tokens?: number;
   reasoning_effort?: ReasoningEffort;
+  /**
+   * Force the request to use the Responses API, even when the model name does
+   * not start with "gpt-5". Useful for proxy/internal setups that serve a
+   * Responses-only model under a name that doesn't match {@link isGPT5Model}.
+   */
+  use_responses_api?: boolean;
   span_info?: {
     spanAttributes?: Record<string, string>;
   };
@@ -295,26 +301,38 @@ function isGPT5Model(model: string): boolean {
   return model.startsWith("gpt-5");
 }
 
+/**
+ * Whether to route the request through the Responses API. GPT-5 models require
+ * it, and callers can force it via `use_responses_api` for proxy/internal setups
+ * that serve a Responses-only model under a name that doesn't start with "gpt-5".
+ */
+function isForcedResponsesMode(params: CachedLLMParams): boolean {
+  return isGPT5Model(params.model) || params.use_responses_api === true;
+}
+
 export async function cachedChatCompletion(
   params: CachedLLMParams,
   options: { cache?: ChatCache } & OpenAIAuth,
 ): Promise<ChatCompletion> {
   const openai = buildOpenAIClient(options);
 
+  // Strip use_responses_api so it is never forwarded to either API.
+  const { use_responses_api: _useResponsesApi, ...completionParams } = params;
+
   const fullParams = globalThis.__inherited_braintrust_wrap_openai
     ? {
-        ...params,
+        ...completionParams,
         span_info: {
           spanAttributes: {
-            ...params.span_info?.spanAttributes,
+            ...completionParams.span_info?.spanAttributes,
             purpose: "scorer",
           },
         },
       }
-    : params;
+    : completionParams;
 
-  // GPT-5 models require the Responses API
-  if (isGPT5Model(params.model)) {
+  // GPT-5 models require the Responses API; callers may also force it.
+  if (isForcedResponsesMode(params)) {
     // Convert Chat Completions API params to Responses API params
     const responsesParams: any = {
       model: fullParams.model,
diff --git a/py/autoevals/llm.py b/py/autoevals/llm.py
index e12baea..e79a45e 100644
--- a/py/autoevals/llm.py
+++ b/py/autoevals/llm.py
@@ -180,6 +180,7 @@ def __init__(
         reasoning_effort=None,
         reasoning_enabled=None,
         reasoning_budget=None,
+        use_responses_api=None,
         engine=None,
         api_key=None,
         base_url=None,
@@ -210,6 +211,9 @@ def __init__(
         if reasoning_budget is not None:
             self.extra_args["reasoning_budget"] = reasoning_budget
 
+        if use_responses_api is not None:
+            self.extra_args["use_responses_api"] = use_responses_api
+
         self.render_args = {}
         if render_args:
             self.render_args.update(render_args)
@@ -366,6 +370,7 @@ def __init__(
         reasoning_effort=None,
         reasoning_enabled=None,
         reasoning_budget=None,
+        use_responses_api=None,
         engine=None,
         api_key=None,
         base_url=None,
@@ -397,6 +402,7 @@ def __init__(
             reasoning_effort=reasoning_effort,
             reasoning_enabled=reasoning_enabled,
             reasoning_budget=reasoning_budget,
+            use_responses_api=use_responses_api,
             engine=engine,
             api_key=api_key,
             base_url=base_url,
diff --git a/py/autoevals/oai.py b/py/autoevals/oai.py
index fa21554..56b5ed2 100644
--- a/py/autoevals/oai.py
+++ b/py/autoevals/oai.py
@@ -320,7 +320,9 @@ def prepare_responses_params(kwargs: dict[str, Any]) -> dict[str, Any]:
 
                 async def complete_wrapper(**kwargs: Any) -> Any:
                     model = kwargs.get("model", "")
-                    if is_gpt5_model(model):
+                    # Strip use_responses_api so it is never forwarded to either API.
+                    use_responses_api = kwargs.pop("use_responses_api", False)
+                    if is_gpt5_model(model) or use_responses_api:
                         responses_params = prepare_responses_params(kwargs)
                         response = await responses_create(**responses_params)
                         return convert_responses_to_chat_completion(response)
@@ -330,7 +332,9 @@ async def complete_wrapper(**kwargs: Any) -> Any:
 
                 def complete_wrapper(**kwargs: Any) -> Any:
                     model = kwargs.get("model", "")
-                    if is_gpt5_model(model):
+                    # Strip use_responses_api so it is never forwarded to either API.
+                    use_responses_api = kwargs.pop("use_responses_api", False)
+                    if is_gpt5_model(model) or use_responses_api:
                         responses_params = prepare_responses_params(kwargs)
                         response = responses_create(**responses_params)
                         return convert_responses_to_chat_completion(response)
diff --git a/py/autoevals/test_llm.py b/py/autoevals/test_llm.py
index ba97b94..b37c395 100644
--- a/py/autoevals/test_llm.py
+++ b/py/autoevals/test_llm.py
@@ -489,6 +489,55 @@ def capture_model(request):
     init(None)
 
 
+@respx.mock
+def test_use_responses_api_forces_responses_for_non_gpt5_model():
+    """use_responses_api should route a non-gpt-5 model through the Responses API."""
+    responses_route = respx.route(method="POST", path__regex=r".*/responses$").mock(
+        return_value=Response(
+            200,
+            json={
+                "id": "resp-test",
+                "object": "response",
+                "created": 1234567890,
+                "model": "internal-proxy-model",
+                "output": [
+                    {
+                        "type": "function_call",
+                        "call_id": "call_test",
+                        "name": "select_choice",
+                        "arguments": '{"choice": "1"}',
+                    }
+                ],
+            },
+        )
+    )
+    chat_route = respx.route(method="POST", path__regex=r".*/chat/completions$").mock(
+        return_value=Response(200, json={})
+    )
+
+    client = OpenAI(api_key="test-api-key", base_url="https://api.openai.com/v1")
+    init(client)
+
+    classifier = LLMClassifier(
+        name="test",
+        prompt_template="Test prompt: {{output}}",
+        choice_scores={"1": 1, "2": 0},
+        model="internal-proxy-model",
+        use_responses_api=True,
+    )
+
+    result = classifier.eval(output="test output", expected="test expected")
+
+    assert result.score == 1
+    assert responses_route.called
+    assert not chat_route.called
+    # use_responses_api must be stripped before reaching the API.
+    body = json.loads(responses_route.calls[0].request.content.decode("utf-8"))
+    assert "use_responses_api" not in body
+
+    init(None)
+
+
 @respx.mock
 def test_llm_classifier_injects_thread_vars_from_trace():
     captured_request_body = None

From ad9d5a54b028f1a2a2a54adcbc41fb356c352611 Mon Sep 17 00:00:00 2001
From: Kenny Wong <wong@coda.io>
Date: Fri, 29 May 2026 08:05:56 -0400
Subject: [PATCH 2/3] [OAI] Thread use_responses_api through built-in named
 scorers

SpecFileClassifier.__new__ has a fixed kwarg list, so Factuality(use_responses_api=True)
and the other named scorers raised TypeError. Forward the flag like the other model knobs.
---
 py/autoevals/llm.py      |  3 +++
 py/autoevals/test_llm.py | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/py/autoevals/llm.py b/py/autoevals/llm.py
index e79a45e..a027d42 100644
--- a/py/autoevals/llm.py
+++ b/py/autoevals/llm.py
@@ -504,6 +504,7 @@ def __new__(
         use_cot=None,
         max_tokens=None,
         temperature=None,
+        use_responses_api=None,
         api_key=None,
         base_url=None,
         client: Client | None = None,
@@ -519,6 +520,8 @@ def __new__(
             kwargs["max_tokens"] = max_tokens
         if temperature is not None:
             kwargs["temperature"] = temperature
+        if use_responses_api is not None:
+            kwargs["use_responses_api"] = use_responses_api
         if api_key is not None:
             kwargs["api_key"] = api_key
         if base_url is not None:
diff --git a/py/autoevals/test_llm.py b/py/autoevals/test_llm.py
index b37c395..ddc0a9e 100644
--- a/py/autoevals/test_llm.py
+++ b/py/autoevals/test_llm.py
@@ -538,6 +538,45 @@ def test_use_responses_api_forces_responses_for_non_gpt5_model():
     init(None)
 
 
+@respx.mock
+def test_use_responses_api_on_builtin_scorer():
+    """Built-in named scorers (SpecFileClassifier) should accept use_responses_api."""
+    responses_route = respx.route(method="POST", path__regex=r".*/responses$").mock(
+        return_value=Response(
+            200,
+            json={
+                "id": "resp-test",
+                "object": "response",
+                "created": 1234567890,
+                "model": "gpt-4.1",
+                "output": [
+                    {
+                        "type": "function_call",
+                        "call_id": "call_test",
+                        "name": "select_choice",
+                        "arguments": '{"reasons": "same", "choice": "C"}',
+                    }
+                ],
+            },
+        )
+    )
+    chat_route = respx.route(method="POST", path__regex=r".*/chat/completions$").mock(
+        return_value=Response(200, json={})
+    )
+
+    init(OpenAI(api_key="test-api-key", base_url="https://api.openai.com/v1"))
+
+    result = Factuality(model="gpt-4.1", use_responses_api=True).eval(
+        output="6", expected="6", input="Add the numbers 1, 2, 3"
+    )
+
+    assert result.score == 1
+    assert responses_route.called
+    assert not chat_route.called
+
+    init(None)
+
+
 @respx.mock
 def test_llm_classifier_injects_thread_vars_from_trace():
     captured_request_body = None

From a780f0aaaccee81d9f317bedc18bc65a00f21af6 Mon Sep 17 00:00:00 2001
From: Kenny Wong <wong@coda.io>
Date: Fri, 29 May 2026 08:19:50 -0400
Subject: [PATCH 3/3] [OAI] Map reasoning_effort to reasoning.effort for the
 Responses API

The Responses API rejects a top-level reasoning_effort param ("moved to
reasoning.effort"), so reasoning calls routed to it 400'd. Nest it correctly
in both languages.
---
 js/llm.test.ts           | 4 ++++
 js/oai.ts                | 3 ++-
 py/autoevals/oai.py      | 8 +++++---
 py/autoevals/test_llm.py | 6 +++++-
 4 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/js/llm.test.ts b/js/llm.test.ts
index 0c9ee7d..2b1b18b 100644
--- a/js/llm.test.ts
+++ b/js/llm.test.ts
@@ -329,6 +329,7 @@ Issue Description: {{page_content}}
       choiceScores: { "1": 1, "2": 0 },
       maxTokens: 256,
       temperature: 0.5,
+      reasoningEffort: "medium",
     });
 
     await classifier({ output: "test output", expected: "test expected" });
@@ -336,6 +337,9 @@ Issue Description: {{page_content}}
     // Verify that temperature is in the request (max_tokens not supported by Responses API)
     expect(capturedRequestBody.temperature).toBe(0.5);
     expect(capturedRequestBody.max_tokens).toBeUndefined();
+    // The Responses API nests reasoning effort under reasoning.effort.
+    expect(capturedRequestBody.reasoning).toEqual({ effort: "medium" });
+    expect(capturedRequestBody.reasoning_effort).toBeUndefined();
   });
 
   test("useResponsesApi forces the Responses API for a non-gpt-5 model", async () => {
diff --git a/js/oai.ts b/js/oai.ts
index bd29315..1515718 100644
--- a/js/oai.ts
+++ b/js/oai.ts
@@ -380,7 +380,8 @@ export async function cachedChatCompletion(
     }
     // Note: max_tokens is not supported by Responses API
     if (fullParams.reasoning_effort) {
-      responsesParams.reasoning_effort = fullParams.reasoning_effort;
+      // The Responses API nests this under reasoning.effort, unlike Chat Completions.
+      responsesParams.reasoning = { effort: fullParams.reasoning_effort };
     }
     const response: any = await openai.responses.create(responsesParams);
 
diff --git a/py/autoevals/oai.py b/py/autoevals/oai.py
index 56b5ed2..2ccb50f 100644
--- a/py/autoevals/oai.py
+++ b/py/autoevals/oai.py
@@ -310,9 +310,11 @@ def prepare_responses_params(kwargs: dict[str, Any]) -> dict[str, Any]:
                         responses_params["tool_choice"] = "required"
 
                 # Copy supported parameters
-                for key in ["temperature", "reasoning_effort"]:
-                    if key in kwargs:
-                        responses_params[key] = kwargs[key]
+                if "temperature" in kwargs:
+                    responses_params["temperature"] = kwargs["temperature"]
+                # The Responses API nests this under reasoning.effort, unlike Chat Completions.
+                if "reasoning_effort" in kwargs:
+                    responses_params["reasoning"] = {"effort": kwargs["reasoning_effort"]}
 
                 return responses_params
 
diff --git a/py/autoevals/test_llm.py b/py/autoevals/test_llm.py
index ddc0a9e..a1c2e01 100644
--- a/py/autoevals/test_llm.py
+++ b/py/autoevals/test_llm.py
@@ -396,13 +396,14 @@ def capture_request(request):
     client = OpenAI(api_key="test-api-key", base_url="https://api.openai.com/v1")
     init(client)
 
-    # Create classifier with max_tokens and temperature specified
+    # Create classifier with max_tokens, temperature and reasoning_effort specified
     classifier = LLMClassifier(
         "test",
         "Test prompt: {{output}} vs {{expected}}",
         {"1": 1, "2": 0},
         max_tokens=256,
         temperature=0.5,
+        reasoning_effort="medium",
     )
 
     classifier.eval(output="test output", expected="test expected")
@@ -410,6 +411,9 @@ def capture_request(request):
     # Verify that temperature is in the request with correct value (max_tokens not supported by Responses API)
     assert captured_request_body["temperature"] == 0.5
     assert "max_tokens" not in captured_request_body
+    # The Responses API nests reasoning effort under reasoning.effort.
+    assert captured_request_body["reasoning"] == {"effort": "medium"}
+    assert "reasoning_effort" not in captured_request_body
 
 
 @respx.mock