Skip to content

Commit 498b7a5

Browse files
committed
fix(main): harden vision resolution
1 parent 31af7b3 commit 498b7a5

10 files changed

Lines changed: 292 additions & 31 deletions

File tree

src/main/presenter/configPresenter/index.ts

Lines changed: 50 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,15 @@ const isModelSelection = (value: unknown): value is ModelSelection => {
153153
return typeof record.providerId === 'string' && typeof record.modelId === 'string'
154154
}
155155

156+
const normalizeKnownModelId = (modelId: string): string => {
157+
const normalizedModelId = modelId.trim().toLowerCase()
158+
return normalizedModelId.replace(/^models\//, '')
159+
}
160+
161+
const normalizeKnownProviderId = (providerId: string): string =>
162+
modelCapabilities.resolveProviderId(providerId.trim().toLowerCase()) ||
163+
providerId.trim().toLowerCase()
164+
156165
export const getAnthropicModelSelectionKeysToClear = (
157166
settings: Partial<
158167
Record<
@@ -398,16 +407,20 @@ export class ConfigPresenter implements IConfigPresenter {
398407
}
399408

400409
private migrateLegacyDefaultVisionModelToBuiltinAgent(): void {
401-
const legacySelection = this.store.get('defaultVisionModel') as ModelSelection | undefined
402-
if (!legacySelection) {
410+
const legacySelection = this.store.get('defaultVisionModel') as unknown
411+
if (legacySelection === undefined) {
403412
return
404413
}
405414

406-
const providerId = legacySelection.providerId?.trim()
407-
const modelId = legacySelection.modelId?.trim()
408415
const builtinVisionModel = this.getBuiltinDeepChatConfig().visionModel
409416

410-
if (!builtinVisionModel?.providerId || !builtinVisionModel?.modelId) {
417+
if (
418+
isModelSelection(legacySelection) &&
419+
(!builtinVisionModel?.providerId || !builtinVisionModel?.modelId)
420+
) {
421+
const providerId = legacySelection.providerId.trim()
422+
const modelId = legacySelection.modelId.trim()
423+
411424
if (providerId && modelId) {
412425
this.updateBuiltinDeepChatConfig({
413426
visionModel: {
@@ -1036,6 +1049,26 @@ export class ConfigPresenter implements IConfigPresenter {
10361049
return this.providerModelHelper.getCustomModels(providerId)
10371050
}
10381051

1052+
isKnownModel(providerId: string, modelId: string): boolean {
1053+
const normalizedProviderId = normalizeKnownProviderId(providerId)
1054+
const normalizedModelId = normalizeKnownModelId(modelId)
1055+
1056+
if (!normalizedProviderId || !normalizedModelId) {
1057+
return false
1058+
}
1059+
1060+
const hasKnownModel = (models: Array<{ id: string }> | undefined): boolean =>
1061+
Array.isArray(models) &&
1062+
models.some((model) => normalizeKnownModelId(model.id) === normalizedModelId)
1063+
1064+
return (
1065+
this.hasUserModelConfig(normalizedModelId, normalizedProviderId) ||
1066+
hasKnownModel(this.getProviderModels(normalizedProviderId)) ||
1067+
hasKnownModel(this.getCustomModels(normalizedProviderId)) ||
1068+
hasKnownModel(this.getDbProviderModels(normalizedProviderId))
1069+
)
1070+
}
1071+
10391072
setCustomModels(providerId: string, models: MODEL_META[]): void {
10401073
this.providerModelHelper.setCustomModels(providerId, models)
10411074
}
@@ -1709,6 +1742,18 @@ export class ConfigPresenter implements IConfigPresenter {
17091742
)
17101743
}
17111744

1745+
async agentSupportsCapability(agentId: string, capability: 'vision'): Promise<boolean> {
1746+
if (capability !== 'vision') {
1747+
return false
1748+
}
1749+
1750+
const agentConfig = await this.resolveDeepChatAgentConfig(agentId)
1751+
const providerId = agentConfig.visionModel?.providerId?.trim()
1752+
const modelId = agentConfig.visionModel?.modelId?.trim()
1753+
1754+
return Boolean(providerId && modelId && this.getModelConfig(modelId, providerId)?.vision)
1755+
}
1756+
17121757
async createDeepChatAgent(input: CreateDeepChatAgentInput): Promise<Agent> {
17131758
const created = this.getAgentRepositoryOrThrow().createDeepChatAgent(input)
17141759
this.notifyAcpAgentsChanged()

src/main/presenter/deepchatAgentPresenter/index.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3114,10 +3114,11 @@ export class DeepChatAgentPresenter implements IAgentImplementation {
31143114
): Promise<{ providerId: string; modelId: string } | null> {
31153115
const state = this.runtimeState.get(sessionId)
31163116
const dbSession = this.sessionStore.get(sessionId)
3117+
const agentId = this.getSessionAgentId(sessionId) ?? 'deepchat'
31173118
const resolved = await resolveSessionVisionTarget({
31183119
providerId: state?.providerId ?? dbSession?.provider_id,
31193120
modelId: state?.modelId ?? dbSession?.model_id,
3120-
agentId: this.getSessionAgentId(sessionId) ?? 'deepchat',
3121+
agentId,
31213122
configPresenter: this.configPresenter,
31223123
logLabel: `screenshot:${sessionId}`
31233124
})
@@ -3126,6 +3127,14 @@ export class DeepChatAgentPresenter implements IAgentImplementation {
31263127
return null
31273128
}
31283129

3130+
if (resolved.source === 'agent-vision-model') {
3131+
const agentSupportsVision =
3132+
(await this.configPresenter.agentSupportsCapability?.(agentId, 'vision')) === true
3133+
if (!agentSupportsVision) {
3134+
return null
3135+
}
3136+
}
3137+
31293138
return {
31303139
providerId: resolved.providerId,
31313140
modelId: resolved.modelId

src/main/presenter/index.ts

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -260,24 +260,16 @@ export class Presenter implements IPresenter {
260260
return null
261261
},
262262
resolveConversationSessionInfo: async (conversationId) => {
263-
try {
264-
const session = await this.newAgentPresenter?.getSession(conversationId)
265-
if (!session) {
266-
return null
267-
}
268-
269-
return {
270-
agentId: session.agentId,
271-
providerId: session.providerId,
272-
modelId: session.modelId
273-
}
274-
} catch (error) {
275-
console.warn('[Presenter] Failed to resolve new session info:', {
276-
conversationId,
277-
error
278-
})
263+
const session = await this.newAgentPresenter?.getSession(conversationId)
264+
if (!session) {
279265
return null
280266
}
267+
268+
return {
269+
agentId: session.agentId,
270+
providerId: session.providerId,
271+
modelId: session.modelId
272+
}
281273
},
282274
getSkillPresenter: () => this.skillPresenter,
283275
getYoBrowserToolHandler: () => this.yoBrowserPresenter.toolHandler,

src/main/presenter/toolPresenter/agentTools/agentToolManager.ts

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1071,7 +1071,18 @@ export class AgentToolManager {
10711071
): Promise<string> {
10721072
const fileBuffer = await fs.promises.readFile(filePath)
10731073
const metadata = this.buildImageMetadataBlock(filePath, mimeType, fileBuffer.length)
1074-
const visionTarget = await this.resolveVisionTargetForConversation(conversationId)
1074+
let visionTarget: Awaited<ReturnType<typeof this.resolveVisionTargetForConversation>>
1075+
1076+
try {
1077+
visionTarget = await this.resolveVisionTargetForConversation(conversationId)
1078+
} catch (error) {
1079+
logger.warn('[AgentToolManager] Failed to resolve vision target for image read:', {
1080+
conversationId,
1081+
filePath,
1082+
error
1083+
})
1084+
throw error
1085+
}
10751086

10761087
if (!visionTarget) {
10771088
return `${metadata}\n\nImage analysis unavailable because neither the current session model nor the agent vision model can analyze images.`
@@ -1133,11 +1144,11 @@ export class AgentToolManager {
11331144
logLabel: `read:${conversationId}`
11341145
})
11351146
} catch (error) {
1136-
logger.warn('[AgentToolManager] Failed to resolve vision target for conversation:', {
1137-
conversationId,
1138-
error
1139-
})
1140-
return null
1147+
if (this.isConversationNotFoundError(error)) {
1148+
return null
1149+
}
1150+
1151+
throw error
11411152
}
11421153
}
11431154

src/main/presenter/vision/sessionVisionResolver.ts

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,10 @@ type SessionVisionResolverParams = {
1010
providerId?: string | null
1111
modelId?: string | null
1212
agentId?: string | null
13-
configPresenter: Pick<IConfigPresenter, 'getModelConfig' | 'resolveDeepChatAgentConfig'>
13+
configPresenter: Pick<
14+
IConfigPresenter,
15+
'getModelConfig' | 'resolveDeepChatAgentConfig' | 'isKnownModel'
16+
>
1417
logLabel?: string
1518
}
1619

@@ -19,11 +22,16 @@ export async function resolveSessionVisionTarget(
1922
): Promise<SessionVisionTarget | null> {
2023
const sessionProviderId = params.providerId?.trim()
2124
const sessionModelId = params.modelId?.trim()
25+
const sessionModelConfig =
26+
sessionProviderId && sessionModelId
27+
? params.configPresenter.getModelConfig(sessionModelId, sessionProviderId)
28+
: null
2229

2330
if (
2431
sessionProviderId &&
2532
sessionModelId &&
26-
params.configPresenter.getModelConfig(sessionModelId, sessionProviderId)?.vision
33+
params.configPresenter.isKnownModel?.(sessionProviderId, sessionModelId) === true &&
34+
sessionModelConfig?.vision
2735
) {
2836
return {
2937
providerId: sessionProviderId,

src/shared/types/presenters/legacy.presenters.d.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -650,6 +650,7 @@ export interface IConfigPresenter {
650650
getAgentType(agentId: string): Promise<AgentType | null>
651651
getDeepChatAgentConfig(agentId: string): Promise<DeepChatAgentConfig | null>
652652
resolveDeepChatAgentConfig(agentId: string): Promise<DeepChatAgentConfig>
653+
agentSupportsCapability?(agentId: string, capability: 'vision'): Promise<boolean>
653654
createDeepChatAgent(input: CreateDeepChatAgentInput): Promise<Agent>
654655
updateDeepChatAgent(agentId: string, updates: UpdateDeepChatAgentInput): Promise<Agent | null>
655656
deleteDeepChatAgent(agentId: string): Promise<boolean>
@@ -666,6 +667,7 @@ export interface IConfigPresenter {
666667
addMcpToAgent(agentId: string, isBuiltin: boolean, mcpId: string): Promise<void>
667668
removeMcpFromAgent(agentId: string, isBuiltin: boolean, mcpId: string): Promise<void>
668669
getMcpConfHelper(): any // Used to get MCP configuration helper
670+
isKnownModel?(providerId: string, modelId: string): boolean
669671
getModelConfig(modelId: string, providerId?: string): ModelConfig
670672
setModelConfig(
671673
modelId: string,

test/main/presenter/configPresenter/anthropicProviderMigration.test.ts

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import { afterEach, describe, expect, it, vi } from 'vitest'
2+
import { eventBus } from '@/eventbus'
3+
import { CONFIG_EVENTS } from '../../../../src/main/events'
24

35
vi.mock('@/eventbus', () => ({
46
eventBus: {
@@ -32,6 +34,7 @@ vi.mock('electron', () => ({
3234
}))
3335

3436
import {
37+
ConfigPresenter,
3538
getAnthropicModelSelectionKeysToClear,
3639
normalizeAnthropicProviderForApiOnly
3740
} from '../../../../src/main/presenter/configPresenter'
@@ -150,3 +153,68 @@ describe('getAnthropicModelSelectionKeysToClear', () => {
150153
expect(keysToClear).toEqual([])
151154
})
152155
})
156+
157+
describe('migrateLegacyDefaultVisionModelToBuiltinAgent', () => {
158+
it('migrates a valid legacy vision model with trimmed ids', () => {
159+
const store = {
160+
get: vi.fn().mockReturnValue({ providerId: ' openai ', modelId: ' gpt-4o ' }),
161+
delete: vi.fn()
162+
}
163+
const updateBuiltinDeepChatConfig = vi.fn()
164+
const presenter = Object.assign(Object.create(ConfigPresenter.prototype), {
165+
store,
166+
getBuiltinDeepChatConfig: vi.fn().mockReturnValue({}),
167+
updateBuiltinDeepChatConfig
168+
})
169+
170+
expect(() =>
171+
(
172+
presenter as ConfigPresenter & {
173+
migrateLegacyDefaultVisionModelToBuiltinAgent: () => void
174+
}
175+
).migrateLegacyDefaultVisionModelToBuiltinAgent()
176+
).not.toThrow()
177+
178+
expect(updateBuiltinDeepChatConfig).toHaveBeenCalledWith({
179+
visionModel: {
180+
providerId: 'openai',
181+
modelId: 'gpt-4o'
182+
}
183+
})
184+
expect(store.delete).toHaveBeenCalledWith('defaultVisionModel')
185+
expect(eventBus.sendToMain).toHaveBeenCalledWith(
186+
CONFIG_EVENTS.SETTING_CHANGED,
187+
'defaultVisionModel',
188+
undefined
189+
)
190+
})
191+
192+
it('cleans up malformed legacy vision model without throwing', () => {
193+
const store = {
194+
get: vi.fn().mockReturnValue({ providerId: { bad: true }, modelId: 'gpt-4o' }),
195+
delete: vi.fn()
196+
}
197+
const updateBuiltinDeepChatConfig = vi.fn()
198+
const presenter = Object.assign(Object.create(ConfigPresenter.prototype), {
199+
store,
200+
getBuiltinDeepChatConfig: vi.fn().mockReturnValue({}),
201+
updateBuiltinDeepChatConfig
202+
})
203+
204+
expect(() =>
205+
(
206+
presenter as ConfigPresenter & {
207+
migrateLegacyDefaultVisionModelToBuiltinAgent: () => void
208+
}
209+
).migrateLegacyDefaultVisionModelToBuiltinAgent()
210+
).not.toThrow()
211+
212+
expect(updateBuiltinDeepChatConfig).not.toHaveBeenCalled()
213+
expect(store.delete).toHaveBeenCalledWith('defaultVisionModel')
214+
expect(eventBus.sendToMain).toHaveBeenCalledWith(
215+
CONFIG_EVENTS.SETTING_CHANGED,
216+
'defaultVisionModel',
217+
undefined
218+
)
219+
})
220+
})

test/main/presenter/deepchatAgentPresenter/deepchatAgentPresenter.test.ts

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,9 @@ function createMockConfigPresenter() {
226226
getAutoCompactionTriggerThreshold: vi.fn().mockReturnValue(80),
227227
getAutoCompactionRetainRecentPairs: vi.fn().mockReturnValue(2),
228228
getSetting: vi.fn().mockReturnValue(undefined),
229-
resolveDeepChatAgentConfig: vi.fn().mockResolvedValue({})
229+
isKnownModel: vi.fn().mockReturnValue(true),
230+
resolveDeepChatAgentConfig: vi.fn().mockResolvedValue({}),
231+
agentSupportsCapability: vi.fn().mockResolvedValue(true)
230232
} as any
231233
}
232234

@@ -2885,6 +2887,10 @@ describe('DeepChatAgentPresenter', () => {
28852887
})
28862888

28872889
expect(configPresenter.resolveDeepChatAgentConfig).toHaveBeenCalledWith('persisted-agent')
2890+
expect(configPresenter.agentSupportsCapability).toHaveBeenCalledWith(
2891+
'persisted-agent',
2892+
'vision'
2893+
)
28882894
expect(llmProvider.generateCompletionStandalone).toHaveBeenCalledWith(
28892895
'google',
28902896
expect.any(Array),
@@ -2895,6 +2901,47 @@ describe('DeepChatAgentPresenter', () => {
28952901
expect(normalized).toBe('English screenshot summary')
28962902
})
28972903

2904+
it('ignores fallback agent vision models when the agent does not support vision', async () => {
2905+
sqlitePresenter.newSessionsTable.get.mockReturnValue({
2906+
id: 's1',
2907+
agent_id: 'persisted-agent'
2908+
})
2909+
configPresenter.resolveDeepChatAgentConfig.mockResolvedValueOnce({
2910+
visionModel: { providerId: 'google', modelId: 'gemini-2.5-flash' }
2911+
})
2912+
configPresenter.agentSupportsCapability.mockResolvedValueOnce(false)
2913+
configPresenter.getModelConfig.mockImplementation((modelId: string, providerId?: string) => ({
2914+
temperature: 0.7,
2915+
maxTokens: 4096,
2916+
contextLength: 128000,
2917+
thinkingBudget: 512,
2918+
reasoningEffort: 'medium',
2919+
verbosity: 'medium',
2920+
vision: providerId === 'google' && modelId === 'gemini-2.5-flash'
2921+
}))
2922+
2923+
await agent.initSession('s1', { providerId: 'openai', modelId: 'gpt-4' })
2924+
2925+
const normalized = await (agent as any).normalizeToolResultContent({
2926+
sessionId: 's1',
2927+
toolCallId: 'tc1',
2928+
toolName: 'cdp_send',
2929+
toolArgs: '{"method":"Page.captureScreenshot"}',
2930+
content: '{"data":"YWJj"}',
2931+
isError: false
2932+
})
2933+
2934+
expect(configPresenter.resolveDeepChatAgentConfig).toHaveBeenCalledWith('persisted-agent')
2935+
expect(configPresenter.agentSupportsCapability).toHaveBeenCalledWith(
2936+
'persisted-agent',
2937+
'vision'
2938+
)
2939+
expect(llmProvider.generateCompletionStandalone).not.toHaveBeenCalled()
2940+
expect(normalized).toBe(
2941+
'Screenshot captured, but automatic English analysis is unavailable because neither the current session model nor the agent vision model can analyze images.'
2942+
)
2943+
})
2944+
28982945
it('returns a readable error when neither the current model nor the agent can analyze images', async () => {
28992946
configPresenter.resolveDeepChatAgentConfig.mockResolvedValueOnce({})
29002947

0 commit comments

Comments
 (0)