Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 159 additions & 0 deletions js/llm.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -329,13 +329,172 @@ Issue Description: {{page_content}}
choiceScores: { "1": 1, "2": 0 },
maxTokens: 256,
temperature: 0.5,
reasoningEffort: "medium",
});

await classifier({ output: "test output", expected: "test expected" });

// Verify that temperature is in the request (max_tokens not supported by Responses API)
expect(capturedRequestBody.temperature).toBe(0.5);
expect(capturedRequestBody.max_tokens).toBeUndefined();
// The Responses API nests reasoning effort under reasoning.effort.
expect(capturedRequestBody.reasoning).toEqual({ effort: "medium" });
expect(capturedRequestBody.reasoning_effort).toBeUndefined();
});

test("useResponsesApi forces the Responses API for a non-gpt-5 model", async () => {
  // Flags recording which mocked endpoint the classifier actually called.
  let responsesHit = false;
  let chatCompletionsHit = false;
  // Body of the request that reached the mocked Responses endpoint. It is
  // captured here and asserted on in the test body: an expectation that throws
  // inside a request handler is handled by the mock server and would surface
  // as a failed HTTP request instead of as the failed assertion.
  let responsesRequestBody: any;

  server.use(
    http.post("https://api.openai.com/v1/responses", async ({ request }) => {
      responsesHit = true;
      const body = (await request.json()) as any;
      responsesRequestBody = body;
      // Minimal Responses API payload carrying a single select_choice call.
      return HttpResponse.json({
        id: "resp-test",
        object: "response",
        created: 1234567890,
        model: body.model,
        output: [
          {
            type: "function_call",
            call_id: "call_test",
            name: "select_choice",
            arguments: JSON.stringify({ choice: "1" }),
          },
        ],
      });
    }),
    // Also mocked so that a wrongly routed request is recorded by the flag
    // below rather than failing as an unhandled request.
    http.post(
      "https://api.openai.com/v1/chat/completions",
      async ({ request }) => {
        chatCompletionsHit = true;
        const body = (await request.json()) as any;
        return HttpResponse.json({
          id: "chatcmpl-test",
          object: "chat.completion",
          created: 1234567890,
          model: body.model,
          choices: [
            {
              index: 0,
              message: {
                role: "assistant",
                content: null,
                tool_calls: [
                  {
                    id: "call_test",
                    type: "function",
                    function: {
                      name: "select_choice",
                      arguments: JSON.stringify({ choice: "1" }),
                    },
                  },
                ],
              },
              finish_reason: "stop",
            },
          ],
        });
      },
    ),
  );

  init({
    client: new OpenAI({
      apiKey: "test-api-key",
      baseURL: "https://api.openai.com/v1",
    }),
  });

  const classifier = LLMClassifierFromTemplate({
    name: "test",
    promptTemplate: "Test prompt: {{output}} vs {{expected}}",
    choiceScores: { "1": 1, "2": 0 },
  });

  // A proxy-served model that does NOT start with "gpt-5".
  const result = await classifier({
    output: "test output",
    expected: "test expected",
    model: "internal-proxy-model",
    useResponsesApi: true,
  });

  expect(result.error).toBeUndefined();
  expect(responsesHit).toBe(true);
  expect(chatCompletionsHit).toBe(false);
  // The control flag must be stripped before reaching the API.
  expect(responsesRequestBody.use_responses_api).toBeUndefined();
  expect(responsesRequestBody.useResponsesApi).toBeUndefined();
});

test("non-gpt-5 model uses Chat Completions when useResponsesApi is not set", async () => {
  // Records which mocked endpoint the classifier called.
  const hits = { responses: false, chatCompletions: false };

  // Chat Completions payload whose only choice is a select_choice tool call.
  const chatCompletionFor = (model: any) => ({
    id: "chatcmpl-test",
    object: "chat.completion",
    created: 1234567890,
    model,
    choices: [
      {
        index: 0,
        message: {
          role: "assistant",
          content: null,
          tool_calls: [
            {
              id: "call_test",
              type: "function",
              function: {
                name: "select_choice",
                arguments: JSON.stringify({ choice: "1" }),
              },
            },
          ],
        },
        finish_reason: "stop",
      },
    ],
  });

  server.use(
    // The Responses endpoint is mocked only to detect an unexpected call.
    http.post("https://api.openai.com/v1/responses", async () => {
      hits.responses = true;
      return HttpResponse.json({});
    }),
    http.post(
      "https://api.openai.com/v1/chat/completions",
      async ({ request }) => {
        hits.chatCompletions = true;
        const { model } = (await request.json()) as any;
        return HttpResponse.json(chatCompletionFor(model));
      },
    ),
  );

  const client = new OpenAI({
    apiKey: "test-api-key",
    baseURL: "https://api.openai.com/v1",
  });
  init({ client });

  const classifier = LLMClassifierFromTemplate({
    name: "test",
    promptTemplate: "Test prompt: {{output}} vs {{expected}}",
    choiceScores: { "1": 1, "2": 0 },
  });

  // No useResponsesApi flag, and the model name does not start with "gpt-5".
  const { error } = await classifier({
    output: "test output",
    expected: "test expected",
    model: "gpt-4o-mini",
  });

  expect(error).toBeUndefined();
  expect(hits.chatCompletions).toBe(true);
  expect(hits.responses).toBe(false);
});

test("LLMClassifierFromTemplate uses configured default model", async () => {
Expand Down
14 changes: 14 additions & 0 deletions js/llm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ export type LLMArgs = {
reasoningEffort?: ReasoningEffort;
reasoningEnabled?: boolean;
reasoningBudget?: number;
/**
* Force the request to use the Responses API, even when the model name does
* not start with "gpt-5". Useful for proxy/internal setups that serve a
* Responses-only model under a non-matching name.
*/
useResponsesApi?: boolean;
} & OpenAIAuth;

/**
Expand Down Expand Up @@ -166,6 +172,7 @@ export async function OpenAIClassifier<RenderArgs, Output>(
reasoningEffort,
reasoningEnabled,
reasoningBudget,
useResponsesApi,
cache,
...remainingRenderArgs
} = remaining;
Expand All @@ -176,6 +183,7 @@ export async function OpenAIClassifier<RenderArgs, Output>(
reasoning_effort?: ReasoningEffort;
reasoning_enabled?: boolean;
reasoning_budget?: number;
use_responses_api?: boolean;
} = {};
if (temperature !== undefined) {
extraArgs.temperature = temperature;
Expand All @@ -192,6 +200,9 @@ export async function OpenAIClassifier<RenderArgs, Output>(
if (reasoningBudget !== undefined) {
extraArgs.reasoning_budget = reasoningBudget;
}
if (useResponsesApi !== undefined) {
extraArgs.use_responses_api = useResponsesApi;
}

const renderArgs = {
output,
Expand Down Expand Up @@ -293,6 +304,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
reasoningEffort,
reasoningEnabled,
reasoningBudget,
useResponsesApi,
}: {
name: string;
promptTemplate: string;
Expand All @@ -304,6 +316,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
reasoningEffort?: ReasoningEffort;
reasoningEnabled?: boolean;
reasoningBudget?: number;
useResponsesApi?: boolean;
}): Scorer<string, LLMClassifierArgs<RenderArgs>> {
const choiceStrings = Object.keys(choiceScores);
const ret = async (
Expand Down Expand Up @@ -352,6 +365,7 @@ export function LLMClassifierFromTemplate<RenderArgs>({
reasoningEffort,
reasoningEnabled,
reasoningBudget,
useResponsesApi,
__choices: choiceStrings,
// Thread template vars come first so explicit args can override
...threadVars,
Expand Down
31 changes: 25 additions & 6 deletions js/oai.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ export interface CachedLLMParams {
temperature?: number;
max_tokens?: number;
reasoning_effort?: ReasoningEffort;
/**
* Force the request to use the Responses API, even when the model name does
* not start with "gpt-5". Useful for proxy/internal setups that serve a
* Responses-only model under a name that doesn't match {@link isGPT5Model}.
*/
use_responses_api?: boolean;
span_info?: {
spanAttributes?: Record<string, string>;
};
Expand Down Expand Up @@ -295,26 +301,38 @@ function isGPT5Model(model: string): boolean {
return model.startsWith("gpt-5");
}

/**
 * Decides whether a request should be sent through the Responses API.
 *
 * @param params - Request parameters; only `model` and `use_responses_api`
 *   are read.
 * @returns `true` when `isGPT5Model` accepts `params.model`, or when
 *   `params.use_responses_api` is strictly `true` (the opt-in for
 *   proxy/internal setups serving a Responses-only model under a name that
 *   does not start with "gpt-5"); `false` otherwise.
 */
function isForcedResponsesMode(params: CachedLLMParams): boolean {
  // The model check runs first, exactly as in a short-circuiting `||`.
  if (isGPT5Model(params.model)) {
    return true;
  }
  const explicitlyRequested = params.use_responses_api === true;
  return explicitlyRequested;
}

export async function cachedChatCompletion(
params: CachedLLMParams,
options: { cache?: ChatCache } & OpenAIAuth,
): Promise<ChatCompletion> {
const openai = buildOpenAIClient(options);

// Strip use_responses_api so it is never forwarded to either API.
const { use_responses_api: _useResponsesApi, ...completionParams } = params;

const fullParams = globalThis.__inherited_braintrust_wrap_openai
? {
...params,
...completionParams,
span_info: {
spanAttributes: {
...params.span_info?.spanAttributes,
...completionParams.span_info?.spanAttributes,
purpose: "scorer",
},
},
}
: params;
: completionParams;

// GPT-5 models require the Responses API
if (isGPT5Model(params.model)) {
// GPT-5 models require the Responses API; callers may also force it.
if (isForcedResponsesMode(params)) {
// Convert Chat Completions API params to Responses API params
const responsesParams: any = {
model: fullParams.model,
Expand Down Expand Up @@ -362,7 +380,8 @@ export async function cachedChatCompletion(
}
// Note: max_tokens is not supported by Responses API
if (fullParams.reasoning_effort) {
responsesParams.reasoning_effort = fullParams.reasoning_effort;
// The Responses API nests this under reasoning.effort, unlike Chat Completions.
responsesParams.reasoning = { effort: fullParams.reasoning_effort };
}
const response: any = await openai.responses.create(responsesParams);

Expand Down
9 changes: 9 additions & 0 deletions py/autoevals/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,7 @@ def __init__(
reasoning_effort=None,
reasoning_enabled=None,
reasoning_budget=None,
use_responses_api=None,
engine=None,
api_key=None,
base_url=None,
Expand Down Expand Up @@ -210,6 +211,9 @@ def __init__(
if reasoning_budget is not None:
self.extra_args["reasoning_budget"] = reasoning_budget

if use_responses_api is not None:
self.extra_args["use_responses_api"] = use_responses_api

self.render_args = {}
if render_args:
self.render_args.update(render_args)
Expand Down Expand Up @@ -366,6 +370,7 @@ def __init__(
reasoning_effort=None,
reasoning_enabled=None,
reasoning_budget=None,
use_responses_api=None,
engine=None,
api_key=None,
base_url=None,
Expand Down Expand Up @@ -397,6 +402,7 @@ def __init__(
reasoning_effort=reasoning_effort,
reasoning_enabled=reasoning_enabled,
reasoning_budget=reasoning_budget,
use_responses_api=use_responses_api,
engine=engine,
api_key=api_key,
base_url=base_url,
Expand Down Expand Up @@ -498,6 +504,7 @@ def __new__(
use_cot=None,
max_tokens=None,
temperature=None,
use_responses_api=None,
api_key=None,
base_url=None,
client: Client | None = None,
Expand All @@ -513,6 +520,8 @@ def __new__(
kwargs["max_tokens"] = max_tokens
if temperature is not None:
kwargs["temperature"] = temperature
if use_responses_api is not None:
kwargs["use_responses_api"] = use_responses_api
if api_key is not None:
kwargs["api_key"] = api_key
if base_url is not None:
Expand Down
16 changes: 11 additions & 5 deletions py/autoevals/oai.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,17 +310,21 @@ def prepare_responses_params(kwargs: dict[str, Any]) -> dict[str, Any]:
responses_params["tool_choice"] = "required"

# Copy supported parameters
for key in ["temperature", "reasoning_effort"]:
if key in kwargs:
responses_params[key] = kwargs[key]
if "temperature" in kwargs:
responses_params["temperature"] = kwargs["temperature"]
# The Responses API nests this under reasoning.effort, unlike Chat Completions.
if "reasoning_effort" in kwargs:
responses_params["reasoning"] = {"effort": kwargs["reasoning_effort"]}

return responses_params

if self.is_async:

async def complete_wrapper(**kwargs: Any) -> Any:
model = kwargs.get("model", "")
if is_gpt5_model(model):
# Strip use_responses_api so it is never forwarded to either API.
use_responses_api = kwargs.pop("use_responses_api", False)
if is_gpt5_model(model) or use_responses_api:
responses_params = prepare_responses_params(kwargs)
response = await responses_create(**responses_params)
return convert_responses_to_chat_completion(response)
Expand All @@ -330,7 +334,9 @@ async def complete_wrapper(**kwargs: Any) -> Any:

def complete_wrapper(**kwargs: Any) -> Any:
model = kwargs.get("model", "")
if is_gpt5_model(model):
# Strip use_responses_api so it is never forwarded to either API.
use_responses_api = kwargs.pop("use_responses_api", False)
if is_gpt5_model(model) or use_responses_api:
responses_params = prepare_responses_params(kwargs)
response = responses_create(**responses_params)
return convert_responses_to_chat_completion(response)
Expand Down
Loading