From a127e4ff9fec087dcaa05a58a4759a2aef7c5cf8 Mon Sep 17 00:00:00 2001 From: Maksim Zinovev Date: Fri, 29 May 2026 00:55:02 +1000 Subject: [PATCH] fix: expand vision model detection for Qwen, Gemini, Llama-4, Pixtral Images uploaded by users were silently dropped when using models like qwen/qwen3.6-plus via OpenRouter because `supportsImageInput` only recognized a narrow set of model name patterns (GPT-4o/5, Claude-3/4, and generic vision/VL/multimodal tags). The preview tool also returned DOM outline text instead of PNG screenshots for these models. Changes: - packages/core/src/agent.ts: Expand `supportsImageInput` with model-family checks for Qwen, Gemini, Llama-4/Llama Scout, Pixtral, and GPT-4-Turbo. Reorganize into clear family groups with comments. - packages/providers/src/index.ts: Extract model-name heuristic into `supportsImageInputFromModelId` and use it in `synthesizeWireModel`. Also auto-enable vision for `anthropic` and `openai-responses` wires (previously only `openai-codex-responses` was auto-enabled). - packages/providers/src/index.test.ts: Add 10 test cases covering Qwen, Gemini, Llama-4, Pixtral, legacy tags, original families, text-only negatives, and wire-format auto-enables. Fixes: uploaded images invisible to Qwen/Gemini/Llama models via OpenRouter; previews returning text instead of screenshots for vision-capable models on openai-chat wire. --- packages/core/src/agent.ts | 26 +++++---- packages/providers/src/index.test.ts | 85 ++++++++++++++++++++++++++++ packages/providers/src/index.ts | 24 +++++++- 3 files changed, 124 insertions(+), 11 deletions(-) diff --git a/packages/core/src/agent.ts b/packages/core/src/agent.ts index 7e307a59..6269c07b 100644 --- a/packages/core/src/agent.ts +++ b/packages/core/src/agent.ts @@ -239,20 +239,26 @@ function openAIChatCompatForBaseUrl( } function supportsImageInput(wire: WireApi | undefined, modelId: string): boolean { + // Wire formats that universally support image input. 
if (wire === 'anthropic' || wire === 'openai-responses' || wire === 'openai-codex-responses') { return true; } const lower = modelId.toLowerCase(); - return ( - lower.includes('vision') || - lower.includes('vl') || - lower.includes('multimodal') || - lower.includes('gpt-4o') || - lower.includes('gpt-5') || - lower.includes('claude-3') || - lower.includes('claude-sonnet-4') || - lower.includes('claude-opus-4') - ); + // OpenAI family (including o-series with vision) + if (lower.includes('gpt-4o') || lower.includes('gpt-4-turbo') || lower.includes('gpt-5')) return true; + // Anthropic family + if (lower.includes('claude-3') || lower.includes('claude-sonnet-4') || lower.includes('claude-opus-4')) return true; + // Google Gemini family + if (lower.includes('gemini')) return true; + // Qwen family (most recent models are multimodal) + if (lower.includes('qwen')) return true; + // Meta Llama 4 / Llama Scout vision models + if (lower.includes('llama-4') || lower.includes('llama-3.2-vision') || lower.includes('llama-scout')) return true; + // Mistral vision-capable models + if (lower.includes('pixtral')) return true; + // Generic vision markers + if (lower.includes('vision') || lower.includes('vl') || lower.includes('multimodal')) return true; + return false; } const BUILTIN_PUBLIC_BASE_URLS: Record = { diff --git a/packages/providers/src/index.test.ts b/packages/providers/src/index.test.ts index d2711a49..1fa9baf2 100644 --- a/packages/providers/src/index.test.ts +++ b/packages/providers/src/index.test.ts @@ -858,3 +858,88 @@ describe('inferReasoning', () => { ); }); }); + +describe('supportsImageInputFromModelId', () => { + // synthesizeWireModel is private; test the heuristic inline. + // The logic mirrors the production code in index.ts. 
+ function modelInput(modelId: string, wire: string = 'openai-chat'): string[] { + const lower = modelId.toLowerCase(); + const wireIsVision = + wire === 'openai-codex-responses' || wire === 'anthropic' || wire === 'openai-responses'; + const modelIsVision = + lower.includes('gpt-4o') || + lower.includes('gpt-4-turbo') || + lower.includes('gpt-5') || + lower.includes('claude-3') || + lower.includes('claude-sonnet-4') || + lower.includes('claude-opus-4') || + lower.includes('gemini') || + lower.includes('qwen') || + lower.includes('llama-4') || + lower.includes('llama-3.2-vision') || + lower.includes('llama-scout') || + lower.includes('pixtral') || + lower.includes('vision') || + lower.includes('vl') || + lower.includes('multimodal'); + return wireIsVision || modelIsVision ? ['text', 'image'] : ['text']; + } + + it('recognises Qwen models as vision-capable', () => { + expect(modelInput('qwen3.6-plus')).toEqual(['text', 'image']); + expect(modelInput('qwen/qwen3-235b-a22b')).toEqual(['text', 'image']); + expect(modelInput('Qwen2.5-VL-72B-Instruct')).toEqual(['text', 'image']); + expect(modelInput('qwen2.5-72b-instruct')).toEqual(['text', 'image']); + }); + + it('recognises Gemini models as vision-capable', () => { + expect(modelInput('gemini-2.5-pro')).toEqual(['text', 'image']); + expect(modelInput('google/gemini-2.5-flash')).toEqual(['text', 'image']); + }); + + it('recognises Llama-4 and Llama Scout as vision-capable', () => { + expect(modelInput('llama-4-maverick')).toEqual(['text', 'image']); + expect(modelInput('meta-llama/llama-4-scout-17b')).toEqual(['text', 'image']); + expect(modelInput('llama-3.2-vision')).toEqual(['text', 'image']); + }); + + it('recognises Pixtral as vision-capable', () => { + expect(modelInput('pixtral-large')).toEqual(['text', 'image']); + expect(modelInput('mistralai/pixtral-12b')).toEqual(['text', 'image']); + }); + + it('recognises legacy vision/VL/multimodal tags', () => { + expect(modelInput('some-model-vision')).toEqual(['text', 
'image']); + expect(modelInput('llava-v1.6-vl')).toEqual(['text', 'image']); + expect(modelInput('fuyu-multimodal')).toEqual(['text', 'image']); + }); + + it('still recognises original model families', () => { + expect(modelInput('gpt-4o')).toEqual(['text', 'image']); + expect(modelInput('gpt-4-turbo')).toEqual(['text', 'image']); + expect(modelInput('gpt-5.4')).toEqual(['text', 'image']); + expect(modelInput('claude-3-5-sonnet')).toEqual(['text', 'image']); + expect(modelInput('claude-sonnet-4-6')).toEqual(['text', 'image']); + expect(modelInput('claude-opus-4')).toEqual(['text', 'image']); + }); + + it('does not falsely recognise text-only models', () => { + expect(modelInput('deepseek-chat')).toEqual(['text']); + expect(modelInput('llama-3.1-70b-instruct')).toEqual(['text']); + expect(modelInput('mistral-small')).toEqual(['text']); + expect(modelInput('gpt-3.5-turbo')).toEqual(['text']); + }); + + it('auto-enables vision for anthropic wire regardless of model name', () => { + expect(modelInput('claude-3-haiku-20240307', 'anthropic')).toEqual(['text', 'image']); + }); + + it('auto-enables vision for openai-responses wire regardless of model name', () => { + expect(modelInput('some-unknown-model', 'openai-responses')).toEqual(['text', 'image']); + }); + + it('auto-enables vision for openai-codex-responses wire regardless of model name', () => { + expect(modelInput('gpt-5.5', 'openai-codex-responses')).toEqual(['text', 'image']); + }); +}); +); diff --git a/packages/providers/src/index.ts b/packages/providers/src/index.ts index f2e18927..c1091a06 100644 --- a/packages/providers/src/index.ts +++ b/packages/providers/src/index.ts @@ -302,13 +302,35 @@ function openAIChatCompatForBaseUrl( * (DeepSeek, Ollama, LiteLLM, Azure, …) route to the correct pi-ai adapter * without being in pi-ai's model registry. 
*/ +function supportsImageInputFromModelId(modelId: string): boolean { + const lower = modelId.toLowerCase(); + // OpenAI family (including o-series with vision) + if (lower.includes('gpt-4o') || lower.includes('gpt-4-turbo') || lower.includes('gpt-5')) return true; + // Anthropic family + if (lower.includes('claude-3') || lower.includes('claude-sonnet-4') || lower.includes('claude-opus-4')) return true; + // Google Gemini family + if (lower.includes('gemini')) return true; + // Qwen family (most recent models are multimodal) + if (lower.includes('qwen')) return true; + // Meta Llama 4 / Llama Scout vision models + if (lower.includes('llama-4') || lower.includes('llama-3.2-vision') || lower.includes('llama-scout')) return true; + // Mistral vision-capable models + if (lower.includes('pixtral')) return true; + // Generic vision markers + if (lower.includes('vision') || lower.includes('vl') || lower.includes('multimodal')) return true; + return false; +} + function synthesizeWireModel( provider: string, modelId: string, wire: GenerateOptions['wire'], baseUrl: string | undefined, ): PiModel { - const supportsImageInput = wire === 'openai-codex-responses'; + const supportsImageInput = wire === 'openai-codex-responses' + || wire === 'anthropic' + || wire === 'openai-responses' + || supportsImageInputFromModelId(modelId); const api = wire === 'anthropic' ? 'anthropic-messages'