Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 16 additions & 10 deletions packages/core/src/agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -239,20 +239,26 @@ function openAIChatCompatForBaseUrl(
}

/**
 * Decides whether image input should be advertised for a wire/model pair.
 *
 * The `anthropic`, `openai-responses` and `openai-codex-responses` wires are
 * treated as image-capable regardless of the model id. For any other wire
 * (including `undefined`) the lower-cased model id is searched for a known
 * marker substring.
 *
 * @param wire - Wire format in use, if known.
 * @param modelId - Model identifier; matched case-insensitively.
 * @returns `true` when the wire or the model id indicates image support.
 */
function supportsImageInput(wire: WireApi | undefined, modelId: string): boolean {
  // Wire formats that universally support image input.
  if (wire === 'anthropic' || wire === 'openai-responses' || wire === 'openai-codex-responses') {
    return true;
  }

  // A single marker list is consulted. The previous early `return (...)` over
  // the legacy markers made every family check placed after it unreachable.
  const imageMarkers: readonly string[] = [
    // OpenAI family
    'gpt-4o',
    'gpt-4-turbo',
    'gpt-5',
    // Anthropic family
    'claude-3',
    'claude-sonnet-4',
    'claude-opus-4',
    // Google Gemini family
    'gemini',
    // Qwen family (matched as a whole, not per model)
    'qwen',
    // Meta Llama 4 / Llama 3.2 vision / Llama Scout
    'llama-4',
    'llama-3.2-vision',
    'llama-scout',
    // Mistral Pixtral
    'pixtral',
    // Generic markers; 'vl' is a plain substring match
    'vision',
    'vl',
    'multimodal',
  ];

  const lower = modelId.toLowerCase();
  return imageMarkers.some((marker) => lower.includes(marker));
}

const BUILTIN_PUBLIC_BASE_URLS: Record<string, string> = {
Expand Down
85 changes: 85 additions & 0 deletions packages/providers/src/index.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -858,3 +858,88 @@ describe('inferReasoning', () => {
);
});
});

describe('supportsImageInputFromModelId', () => {
  // synthesizeWireModel is private, so the heuristic is reproduced here rather
  // than imported. The tables below mirror the production code in index.ts.
  const imageWires = ['openai-codex-responses', 'anthropic', 'openai-responses'];
  const imageMarkers = [
    'gpt-4o',
    'gpt-4-turbo',
    'gpt-5',
    'claude-3',
    'claude-sonnet-4',
    'claude-opus-4',
    'gemini',
    'qwen',
    'llama-4',
    'llama-3.2-vision',
    'llama-scout',
    'pixtral',
    'vision',
    'vl',
    'multimodal',
  ];

  /** Returns the input modalities the heuristic assigns to a model/wire pair. */
  function modelInput(modelId: string, wire: string = 'openai-chat'): string[] {
    const id = modelId.toLowerCase();
    const acceptsImages =
      imageWires.includes(wire) || imageMarkers.some((marker) => id.includes(marker));
    if (acceptsImages) {
      return ['text', 'image'];
    }
    return ['text'];
  }

  /** Asserts that every id (on the default wire) is reported as text + image. */
  const expectImageCapable = (...modelIds: string[]): void => {
    for (const modelId of modelIds) {
      expect(modelInput(modelId)).toEqual(['text', 'image']);
    }
  };

  /** Asserts that every id (on the default wire) is reported as text-only. */
  const expectTextOnly = (...modelIds: string[]): void => {
    for (const modelId of modelIds) {
      expect(modelInput(modelId)).toEqual(['text']);
    }
  };

  it('recognises Qwen models as vision-capable', () => {
    expectImageCapable(
      'qwen3.6-plus',
      'qwen/qwen3-235b-a22b',
      'Qwen2.5-VL-72B-Instruct',
      'qwen2.5-72b-instruct',
    );
  });

  it('recognises Gemini models as vision-capable', () => {
    expectImageCapable('gemini-2.5-pro', 'google/gemini-2.5-flash');
  });

  it('recognises Llama-4 and Llama Scout as vision-capable', () => {
    expectImageCapable('llama-4-maverick', 'meta-llama/llama-4-scout-17b', 'llama-3.2-vision');
  });

  it('recognises Pixtral as vision-capable', () => {
    expectImageCapable('pixtral-large', 'mistralai/pixtral-12b');
  });

  it('recognises legacy vision/VL/multimodal tags', () => {
    expectImageCapable('some-model-vision', 'llava-v1.6-vl', 'fuyu-multimodal');
  });

  it('still recognises original model families', () => {
    expectImageCapable(
      'gpt-4o',
      'gpt-4-turbo',
      'gpt-5.4',
      'claude-3-5-sonnet',
      'claude-sonnet-4-6',
      'claude-opus-4',
    );
  });

  it('does not falsely recognise text-only models', () => {
    expectTextOnly('deepseek-chat', 'llama-3.1-70b-instruct', 'mistral-small', 'gpt-3.5-turbo');
  });

  it('auto-enables vision for anthropic wire regardless of model name', () => {
    expect(modelInput('claude-3-haiku-20240307', 'anthropic')).toEqual(['text', 'image']);
  });

  it('auto-enables vision for openai-responses wire regardless of model name', () => {
    expect(modelInput('some-unknown-model', 'openai-responses')).toEqual(['text', 'image']);
  });

  it('auto-enables vision for openai-codex-responses wire regardless of model name', () => {
    expect(modelInput('gpt-5.5', 'openai-codex-responses')).toEqual(['text', 'image']);
  });
});
);
24 changes: 23 additions & 1 deletion packages/providers/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -302,13 +302,35 @@ function openAIChatCompatForBaseUrl(
* (DeepSeek, Ollama, LiteLLM, Azure, …) route to the correct pi-ai adapter
* without being in pi-ai's model registry.
*/
/**
 * Guesses from the model id alone whether a model accepts image input.
 *
 * The id is lower-cased and searched for a fixed set of marker substrings;
 * a hit on any marker yields `true`. This is a plain substring test, so an
 * id that merely contains a marker (for example the letters "vl") matches.
 *
 * @param modelId - Model identifier; matched case-insensitively.
 * @returns `true` when any marker occurs in the id, otherwise `false`.
 */
function supportsImageInputFromModelId(modelId: string): boolean {
  const markers: readonly string[] = [
    // OpenAI family
    'gpt-4o',
    'gpt-4-turbo',
    'gpt-5',
    // Anthropic family
    'claude-3',
    'claude-sonnet-4',
    'claude-opus-4',
    // Google Gemini family
    'gemini',
    // Qwen family (matched as a whole, not per model)
    'qwen',
    // Meta Llama 4 / Llama 3.2 vision / Llama Scout
    'llama-4',
    'llama-3.2-vision',
    'llama-scout',
    // Mistral Pixtral
    'pixtral',
    // Generic vision markers
    'vision',
    'vl',
    'multimodal',
  ];

  const id = modelId.toLowerCase();
  return markers.some((marker) => id.includes(marker));
}

function synthesizeWireModel(
provider: string,
modelId: string,
wire: GenerateOptions['wire'],
baseUrl: string | undefined,
): PiModel {
const supportsImageInput = wire === 'openai-codex-responses';
const supportsImageInput = wire === 'openai-codex-responses'
|| wire === 'anthropic'
|| wire === 'openai-responses'
|| supportsImageInputFromModelId(modelId);
const api =
wire === 'anthropic'
? 'anthropic-messages'
Expand Down
Loading