braintrustdata · Kenny Wong (wong-codaio) · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/js/llm.test.ts b/js/llm.test.ts
@@ -329,13 +329,172 @@ Issue Description: {{page_content}}
       choiceScores: { "1": 1, "2": 0 },
       maxTokens: 256,
       temperature: 0.5,
+      reasoningEffort: "medium",
     });
 
     await classifier({ output: "test output", expected: "test expected" });
 
     // Verify that temperature is in the request (max_tokens not supported by Responses API)
     expect(capturedRequestBody.temperature).toBe(0.5);
     expect(capturedRequestBody.max_tokens).toBeUndefined();
+    // The Responses API nests reasoning effort under reasoning.effort.
+    expect(capturedRequestBody.reasoning).toEqual({ effort: "medium" });
+    expect(capturedRequestBody.reasoning_effort).toBeUndefined();
+  });
+
+  test("useResponsesApi forces the Responses API for a non-gpt-5 model", async () => {
+    let responsesBody: any;
+    let chatCompletionsHit = false;
+
+    server.use(
+      http.post("https://api.openai.com/v1/responses", async ({ request }) => {
+        // Capture only; the assertions on this body run in the test itself.
+        responsesBody = (await request.json()) as any;
+        return HttpResponse.json({
+          id: "resp-test",
+          object: "response",
+          created: 1234567890,
+          model: responsesBody.model,
+          output: [
+            {
+              type: "function_call",
+              call_id: "call_test",
+              name: "select_choice",
+              arguments: JSON.stringify({ choice: "1" }),
+            },
+          ],
+        });
+      }),
+      http.post(
+        "https://api.openai.com/v1/chat/completions",
+        async ({ request }) => {
+          chatCompletionsHit = true;
+          const body = (await request.json()) as any;
+          return HttpResponse.json({
+            id: "chatcmpl-test",
+            object: "chat.completion",
+            created: 1234567890,
+            model: body.model,
+            choices: [
+              {
+                index: 0,
+                message: {
+                  role: "assistant",
+                  content: null,
+                  tool_calls: [
+                    {
+                      id: "call_test",
+                      type: "function",
+                      function: {
+                        name: "select_choice",
+                        arguments: JSON.stringify({ choice: "1" }),
+                      },
+                    },
+                  ],
+                },
+                finish_reason: "stop",
+              },
+            ],
+          });
+        },
+      ),
+    );
+
+    init({
+      client: new OpenAI({
+        apiKey: "test-api-key",
+        baseURL: "https://api.openai.com/v1",
+      }),
+    });
+
+    const classifier = LLMClassifierFromTemplate({
+      name: "test",
+      promptTemplate: "Test prompt: {{output}} vs {{expected}}",
+      choiceScores: { "1": 1, "2": 0 },
+    });
+
+    // A proxy-served model that does NOT start with "gpt-5".
+    const result = await classifier({
+      output: "test output",
+      expected: "test expected",
+      model: "internal-proxy-model",
+      useResponsesApi: true,
+    });
+
+    expect(result.error).toBeUndefined();
+    expect(responsesBody).toBeDefined();
+    expect(chatCompletionsHit).toBe(false);
+    // The control flag must be stripped before reaching the API.
+    expect(responsesBody.use_responses_api).toBeUndefined();
+    expect(responsesBody.useResponsesApi).toBeUndefined();
+  });
+
+  test("non-gpt-5 model uses Chat Completions when useResponsesApi is not set", async () => {
+    let responsesHit = false;
+    let chatCompletionsHit = false;
+    // Each handler records that its route was hit; only chat/completions should be.
+    server.use(
+      http.post("https://api.openai.com/v1/responses", async () => {
+        responsesHit = true;
+        return HttpResponse.json({}); // placeholder; this route must stay unused
+      }),
+      http.post(
+        "https://api.openai.com/v1/chat/completions",
+        async ({ request }) => {
+          chatCompletionsHit = true;
+          const body = (await request.json()) as any;
+          return HttpResponse.json({
+            id: "chatcmpl-test",
+            object: "chat.completion",
+            created: 1234567890,
+            model: body.model,
+            choices: [
+              {
+                index: 0,
+                message: {
+                  role: "assistant",
+                  content: null,
+                  tool_calls: [
+                    {
+                      id: "call_test",
+                      type: "function",
+                      function: {
+                        name: "select_choice",
+                        arguments: JSON.stringify({ choice: "1" }),
+                      },
+                    },
+                  ],
+                },
+                finish_reason: "stop",
+              },
+            ],
+          });
+        },
+      ),
+    );
+
+    init({
+      client: new OpenAI({
+        apiKey: "test-api-key",
+        baseURL: "https://api.openai.com/v1",
+      }),
+    });
+
+    const classifier = LLMClassifierFromTemplate({
+      name: "test",
+      promptTemplate: "Test prompt: {{output}} vs {{expected}}",
+      choiceScores: { "1": 1, "2": 0 },
+    });
+
+    const result = await classifier({
+      output: "test output",
+      expected: "test expected",
+      model: "gpt-4o-mini", // not a "gpt-5*" name, and useResponsesApi is omitted
+    });
+
+    expect(result.error).toBeUndefined();
+    expect(chatCompletionsHit).toBe(true);
+    expect(responsesHit).toBe(false);
   });
 
   test("LLMClassifierFromTemplate uses configured default model", async () => {

diff --git a/js/llm.ts b/js/llm.ts
@@ -73,6 +73,12 @@ export type LLMArgs = {
   reasoningEffort?: ReasoningEffort;
   reasoningEnabled?: boolean;
   reasoningBudget?: number;
+  /**
+   * Force the request to use the Responses API, even when the model name does
+   * not start with "gpt-5". Useful for proxy/internal setups that serve a
+   * Responses-only model under a non-matching name.
+   */
+  useResponsesApi?: boolean;
 } & OpenAIAuth;
 
 /**
@@ -166,6 +172,7 @@ export async function OpenAIClassifier<RenderArgs, Output>(
     reasoningEffort,
     reasoningEnabled,
     reasoningBudget,
+    useResponsesApi,
     cache,
     ...remainingRenderArgs
   } = remaining;
@@ -176,6 +183,7 @@ export async function OpenAIClassifier<RenderArgs, Output>(
     reasoning_effort?: ReasoningEffort;
     reasoning_enabled?: boolean;
     reasoning_budget?: number;
+    use_responses_api?: boolean;
   } = {};
   if (temperature !== undefined) {
     extraArgs.temperature = temperature;
@@ -192,6 +200,9 @@ export async function OpenAIClassifier<RenderArgs, Output>(
   if (reasoningBudget !== undefined) {
     extraArgs.reasoning_budget = reasoningBudget;
   }
+  if (useResponsesApi !== undefined) {
+    extraArgs.use_responses_api = useResponsesApi;
+  }
 
   const renderArgs = {
     output,
@@ -293,6 +304,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
   reasoningEffort,
   reasoningEnabled,
   reasoningBudget,
+  useResponsesApi,
 }: {
   name: string;
   promptTemplate: string;
@@ -304,6 +316,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
   reasoningEffort?: ReasoningEffort;
   reasoningEnabled?: boolean;
   reasoningBudget?: number;
+  useResponsesApi?: boolean;
 }): Scorer<string, LLMClassifierArgs<RenderArgs>> {
   const choiceStrings = Object.keys(choiceScores);
   const ret = async (
@@ -352,6 +365,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
       reasoningEffort,
       reasoningEnabled,
       reasoningBudget,
+      useResponsesApi,
       __choices: choiceStrings,
       // Thread template vars come first so explicit args can override
       ...threadVars,

diff --git a/js/oai.ts b/js/oai.ts
@@ -19,6 +19,12 @@ export interface CachedLLMParams {
   temperature?: number;
   max_tokens?: number;
   reasoning_effort?: ReasoningEffort;
+  /**
+   * Force the request to use the Responses API, even when the model name does
+   * not start with "gpt-5". Useful for proxy/internal setups that serve a
+   * Responses-only model under a name that doesn't match {@link isGPT5Model}.
+   */
+  use_responses_api?: boolean;
   span_info?: {
     spanAttributes?: Record<string, string>;
   };
@@ -295,26 +301,38 @@ function isGPT5Model(model: string): boolean {
   return model.startsWith("gpt-5");
 }
 
+/**
+ * Whether to route the request through the Responses API. True for GPT-5 models
+ * (which require it, so not only "forced" cases) and when the caller sets
+ * `use_responses_api` to `true`, e.g. a proxy serving a model under another name.
+ */
+function isForcedResponsesMode(params: CachedLLMParams): boolean {
+  return isGPT5Model(params.model) || params.use_responses_api === true; // strict: only literal true forces it
+}
+
 export async function cachedChatCompletion(
   params: CachedLLMParams,
   options: { cache?: ChatCache } & OpenAIAuth,
 ): Promise<ChatCompletion> {
   const openai = buildOpenAIClient(options);
 
+  // Strip use_responses_api so it is never forwarded to either API.
+  const { use_responses_api: _useResponsesApi, ...completionParams } = params;
+
   const fullParams = globalThis.__inherited_braintrust_wrap_openai
     ? {
-        ...params,
+        ...completionParams,
         span_info: {
           spanAttributes: {
-            ...params.span_info?.spanAttributes,
+            ...completionParams.span_info?.spanAttributes,
             purpose: "scorer",
           },
         },
       }
-    : params;
+    : completionParams;
 
-  // GPT-5 models require the Responses API
-  if (isGPT5Model(params.model)) {
+  // GPT-5 models require the Responses API; callers may also force it.
+  if (isForcedResponsesMode(params)) {
     // Convert Chat Completions API params to Responses API params
     const responsesParams: any = {
       model: fullParams.model,
@@ -362,7 +380,8 @@ export async function cachedChatCompletion(
     }
     // Note: max_tokens is not supported by Responses API
     if (fullParams.reasoning_effort) {
-      responsesParams.reasoning_effort = fullParams.reasoning_effort;
+      // The Responses API nests this under reasoning.effort, unlike Chat Completions.
+      responsesParams.reasoning = { effort: fullParams.reasoning_effort };
     }
     const response: any = await openai.responses.create(responsesParams);
 

diff --git a/py/autoevals/llm.py b/py/autoevals/llm.py
@@ -180,6 +180,7 @@ def __init__(
         reasoning_effort=None,
         reasoning_enabled=None,
         reasoning_budget=None,
+        use_responses_api=None,
         engine=None,
         api_key=None,
         base_url=None,
@@ -210,6 +211,9 @@ def __init__(
         if reasoning_budget is not None:
             self.extra_args["reasoning_budget"] = reasoning_budget
 
+        if use_responses_api is not None:
+            self.extra_args["use_responses_api"] = use_responses_api
+
         self.render_args = {}
         if render_args:
             self.render_args.update(render_args)
@@ -366,6 +370,7 @@ def __init__(
         reasoning_effort=None,
         reasoning_enabled=None,
         reasoning_budget=None,
+        use_responses_api=None,
         engine=None,
         api_key=None,
         base_url=None,
@@ -397,6 +402,7 @@ def __init__(
             reasoning_effort=reasoning_effort,
             reasoning_enabled=reasoning_enabled,
             reasoning_budget=reasoning_budget,
+            use_responses_api=use_responses_api,
             engine=engine,
             api_key=api_key,
             base_url=base_url,
@@ -498,6 +504,7 @@ def __new__(
         use_cot=None,
         max_tokens=None,
         temperature=None,
+        use_responses_api=None,
         api_key=None,
         base_url=None,
         client: Client | None = None,
@@ -513,6 +520,8 @@ def __new__(
             kwargs["max_tokens"] = max_tokens
         if temperature is not None:
             kwargs["temperature"] = temperature
+        if use_responses_api is not None:
+            kwargs["use_responses_api"] = use_responses_api
         if api_key is not None:
             kwargs["api_key"] = api_key
         if base_url is not None:

diff --git a/py/autoevals/oai.py b/py/autoevals/oai.py
@@ -310,17 +310,21 @@ def prepare_responses_params(kwargs: dict[str, Any]) -> dict[str, Any]:
                         responses_params["tool_choice"] = "required"
 
                 # Copy supported parameters
-                for key in ["temperature", "reasoning_effort"]:
-                    if key in kwargs:
-                        responses_params[key] = kwargs[key]
+                if "temperature" in kwargs:
+                    responses_params["temperature"] = kwargs["temperature"]
+                # The Responses API nests this under reasoning.effort, unlike Chat Completions.
+                if "reasoning_effort" in kwargs:
+                    responses_params["reasoning"] = {"effort": kwargs["reasoning_effort"]}
 
                 return responses_params
 
             if self.is_async:
 
                 async def complete_wrapper(**kwargs: Any) -> Any:
                     model = kwargs.get("model", "")
-                    if is_gpt5_model(model):
+                    # Strip use_responses_api so it is never forwarded to either API.
+                    use_responses_api = kwargs.pop("use_responses_api", False)
+                    if is_gpt5_model(model) or use_responses_api:
                         responses_params = prepare_responses_params(kwargs)
                         response = await responses_create(**responses_params)
                         return convert_responses_to_chat_completion(response)
@@ -330,7 +334,9 @@ async def complete_wrapper(**kwargs: Any) -> Any:
 
                 def complete_wrapper(**kwargs: Any) -> Any:
                     model = kwargs.get("model", "")
-                    if is_gpt5_model(model):
+                    # Strip use_responses_api so it is never forwarded to either API.
+                    use_responses_api = kwargs.pop("use_responses_api", False)
+                    if is_gpt5_model(model) or use_responses_api:
                         responses_params = prepare_responses_params(kwargs)
                         response = responses_create(**responses_params)
                         return convert_responses_to_chat_completion(response)