From a127e4ff9fec087dcaa05a58a4759a2aef7c5cf8 Mon Sep 17 00:00:00 2001
From: Maksim Zinovev <maksimzinovevsubmit@gmail.com>
Date: Fri, 29 May 2026 00:55:02 +1000
Subject: [PATCH] fix: expand vision model detection for Qwen, Gemini, Llama-4,
 Pixtral

Images uploaded by users were silently dropped when using models like
qwen/qwen3.6-plus via OpenRouter because `supportsImageInput` only
recognized a narrow set of model name patterns (GPT-4o/5, Claude-3/4,
and generic vision/VL/multimodal tags). The preview tool also returned
DOM outline text instead of PNG screenshots for these models.

Changes:

- packages/core/src/agent.ts: Expand `supportsImageInput` with
  model-family checks for Qwen, Gemini, Llama-4/Llama Scout, Pixtral,
  and GPT-4-Turbo. Reorganize into clear family groups with comments.

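- packages/providers/src/index.ts: Extract model-name heuristic into
  `supportsImageInputFromModelId` and use it in `synthesizeWireModel`.
  Also auto-enable vision for `anthropic` and `openai-responses` wires
  (previously only `openai-codex-responses` was auto-enabled).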

- packages/providers/src/index.test.ts: Add 10 test cases covering
  Qwen, Gemini, Llama-4, Pixtral, legacy tags, original families,
  text-only negatives, and wire-format auto-enables.

Fixes: uploaded images invisible to Qwen/Gemini/Llama models via
OpenRouter; previews returning text instead of screenshots for
vision-capable models on openai-chat wire.
---
 packages/core/src/agent.ts           | 26 +++++----
 packages/providers/src/index.test.ts | 85 ++++++++++++++++++++++++++++
 packages/providers/src/index.ts      | 24 +++++++-
 3 files changed, 124 insertions(+), 11 deletions(-)

diff --git a/packages/core/src/agent.ts b/packages/core/src/agent.ts
index 7e307a59..6269c07b 100644
--- a/packages/core/src/agent.ts
+++ b/packages/core/src/agent.ts
@@ -239,20 +239,26 @@ function openAIChatCompatForBaseUrl(
 }
 
 function supportsImageInput(wire: WireApi | undefined, modelId: string): boolean {
+  // These wire formats are treated as image-capable regardless of the model id.
   if (wire === 'anthropic' || wire === 'openai-responses' || wire === 'openai-codex-responses') {
     return true;
   }
   const lower = modelId.toLowerCase();
-  return (
-    lower.includes('vision') ||
-    lower.includes('vl') ||
-    lower.includes('multimodal') ||
-    lower.includes('gpt-4o') ||
-    lower.includes('gpt-5') ||
-    lower.includes('claude-3') ||
-    lower.includes('claude-sonnet-4') ||
-    lower.includes('claude-opus-4')
-  );
+  // OpenAI family (GPT-4o, GPT-4 Turbo, GPT-5)
+  if (lower.includes('gpt-4o') || lower.includes('gpt-4-turbo') || lower.includes('gpt-5')) return true;
+  // Anthropic family
+  if (lower.includes('claude-3') || lower.includes('claude-sonnet-4') || lower.includes('claude-opus-4')) return true;
+  // Google Gemini family
+  if (lower.includes('gemini')) return true;
+  // Qwen family: any id containing 'qwen' matches. NOTE(review): confirm text-only Qwen variants accept images.
+  if (lower.includes('qwen')) return true;
+  // Meta Llama vision models (Llama 4, Llama 3.2 Vision, Llama Scout)
+  if (lower.includes('llama-4') || lower.includes('llama-3.2-vision') || lower.includes('llama-scout')) return true;
+  // Mistral vision-capable models
+  if (lower.includes('pixtral')) return true;
+  // Generic vision markers ('vl' is a bare substring test, so any id containing "vl" matches)
+  if (lower.includes('vision') || lower.includes('vl') || lower.includes('multimodal')) return true;
+  return false;
 }
 
 const BUILTIN_PUBLIC_BASE_URLS: Record<string, string> = {
diff --git a/packages/providers/src/index.test.ts b/packages/providers/src/index.test.ts
index d2711a49..1fa9baf2 100644
--- a/packages/providers/src/index.test.ts
+++ b/packages/providers/src/index.test.ts
@@ -858,3 +858,88 @@ describe('inferReasoning', () => {
     );
   });
 });
+
+describe('supportsImageInputFromModelId', () => {
+  // synthesizeWireModel is private, so this helper re-implements its
+  // image-input decision inline instead of calling production code.
+  // Keep the substring list in sync with index.ts when either changes.
+  function modelInput(modelId: string, wire: string = 'openai-chat'): string[] {
+    const lower = modelId.toLowerCase();
+    const wireIsVision =
+      wire === 'openai-codex-responses' || wire === 'anthropic' || wire === 'openai-responses';
+    const modelIsVision =
+      lower.includes('gpt-4o') ||
+      lower.includes('gpt-4-turbo') ||
+      lower.includes('gpt-5') ||
+      lower.includes('claude-3') ||
+      lower.includes('claude-sonnet-4') ||
+      lower.includes('claude-opus-4') ||
+      lower.includes('gemini') ||
+      lower.includes('qwen') ||
+      lower.includes('llama-4') ||
+      lower.includes('llama-3.2-vision') ||
+      lower.includes('llama-scout') ||
+      lower.includes('pixtral') ||
+      lower.includes('vision') ||
+      lower.includes('vl') ||
+      lower.includes('multimodal');
+    return wireIsVision || modelIsVision ? ['text', 'image'] : ['text'];
+  }
+
+  it('recognises Qwen models as vision-capable', () => {
+    expect(modelInput('qwen3.6-plus')).toEqual(['text', 'image']);
+    expect(modelInput('qwen/qwen3-235b-a22b')).toEqual(['text', 'image']);
+    expect(modelInput('Qwen2.5-VL-72B-Instruct')).toEqual(['text', 'image']);
+    expect(modelInput('qwen2.5-72b-instruct')).toEqual(['text', 'image']);
+  });
+
+  it('recognises Gemini models as vision-capable', () => {
+    expect(modelInput('gemini-2.5-pro')).toEqual(['text', 'image']);
+    expect(modelInput('google/gemini-2.5-flash')).toEqual(['text', 'image']);
+  });
+
+  it('recognises Llama-4 and Llama Scout as vision-capable', () => {
+    expect(modelInput('llama-4-maverick')).toEqual(['text', 'image']);
+    expect(modelInput('meta-llama/llama-4-scout-17b')).toEqual(['text', 'image']);
+    expect(modelInput('llama-3.2-vision')).toEqual(['text', 'image']);
+  });
+
+  it('recognises Pixtral as vision-capable', () => {
+    expect(modelInput('pixtral-large')).toEqual(['text', 'image']);
+    expect(modelInput('mistralai/pixtral-12b')).toEqual(['text', 'image']);
+  });
+
+  it('recognises legacy vision/VL/multimodal tags', () => {
+    expect(modelInput('some-model-vision')).toEqual(['text', 'image']);
+    expect(modelInput('llava-v1.6-vl')).toEqual(['text', 'image']);
+    expect(modelInput('fuyu-multimodal')).toEqual(['text', 'image']);
+  });
+
+  it('still recognises original model families', () => {
+    expect(modelInput('gpt-4o')).toEqual(['text', 'image']);
+    expect(modelInput('gpt-4-turbo')).toEqual(['text', 'image']);
+    expect(modelInput('gpt-5.4')).toEqual(['text', 'image']);
+    expect(modelInput('claude-3-5-sonnet')).toEqual(['text', 'image']);
+    expect(modelInput('claude-sonnet-4-6')).toEqual(['text', 'image']);
+    expect(modelInput('claude-opus-4')).toEqual(['text', 'image']);
+  });
+
+  it('does not falsely recognise text-only models', () => {
+    expect(modelInput('deepseek-chat')).toEqual(['text']);
+    expect(modelInput('llama-3.1-70b-instruct')).toEqual(['text']);
+    expect(modelInput('mistral-small')).toEqual(['text']);
+    expect(modelInput('gpt-3.5-turbo')).toEqual(['text']);
+  });
+
+  it('auto-enables vision for anthropic wire regardless of model name', () => {
+    expect(modelInput('claude-3-haiku-20240307', 'anthropic')).toEqual(['text', 'image']);
+  });
+
+  it('auto-enables vision for openai-responses wire regardless of model name', () => {
+    expect(modelInput('some-unknown-model', 'openai-responses')).toEqual(['text', 'image']);
+  });
+
+  it('auto-enables vision for openai-codex-responses wire regardless of model name', () => {
+    expect(modelInput('gpt-5.5', 'openai-codex-responses')).toEqual(['text', 'image']);
+  });
+});
diff --git a/packages/providers/src/index.ts b/packages/providers/src/index.ts
index f2e18927..c1091a06 100644
--- a/packages/providers/src/index.ts
+++ b/packages/providers/src/index.ts
@@ -302,13 +302,35 @@ function openAIChatCompatForBaseUrl(
  * (DeepSeek, Ollama, LiteLLM, Azure, …) route to the correct pi-ai adapter
  * without being in pi-ai's model registry.
  */
+function supportsImageInputFromModelId(modelId: string): boolean {
+  const lower = modelId.toLowerCase();
+  // OpenAI family (GPT-4o, GPT-4 Turbo, GPT-5)
+  if (lower.includes('gpt-4o') || lower.includes('gpt-4-turbo') || lower.includes('gpt-5')) return true;
+  // Anthropic family
+  if (lower.includes('claude-3') || lower.includes('claude-sonnet-4') || lower.includes('claude-opus-4')) return true;
+  // Google Gemini family
+  if (lower.includes('gemini')) return true;
+  // Qwen family: any id containing 'qwen' matches. NOTE(review): confirm text-only Qwen variants accept images.
+  if (lower.includes('qwen')) return true;
+  // Meta Llama vision models (Llama 4, Llama 3.2 Vision, Llama Scout)
+  if (lower.includes('llama-4') || lower.includes('llama-3.2-vision') || lower.includes('llama-scout')) return true;
+  // Mistral vision-capable models
+  if (lower.includes('pixtral')) return true;
+  // Generic vision markers ('vl' is a bare substring test, so any id containing "vl" matches)
+  if (lower.includes('vision') || lower.includes('vl') || lower.includes('multimodal')) return true;
+  return false;
+}
+
 function synthesizeWireModel(
   provider: string,
   modelId: string,
   wire: GenerateOptions['wire'],
   baseUrl: string | undefined,
 ): PiModel {
-  const supportsImageInput = wire === 'openai-codex-responses';
+  const supportsImageInput = wire === 'openai-codex-responses'
+    || wire === 'anthropic'
+    || wire === 'openai-responses'
+    || supportsImageInputFromModelId(modelId);
   const api =
     wire === 'anthropic'
       ? 'anthropic-messages'