From 545b5d020733817e3e12214a6df7b7a9716911f0 Mon Sep 17 00:00:00 2001 From: 7Sageer <7sageer@djwcb.cn> Date: Thu, 11 Jun 2026 17:02:51 +0800 Subject: [PATCH] feat: merge media reads into Read tool --- .changeset/merge-read-media-into-read.md | 5 + .../messages/tool-renderers/chip.ts | 10 +- .../messages/tool-renderers/media.ts | 41 ++- .../messages/tool-renderers/registry.ts | 6 +- .../src/tui/utils/image-attachment-store.ts | 2 +- .../src/tui/utils/image-placeholder.ts | 2 +- .../messages/tool-renderers/media.test.ts | 45 ++- docs/en/reference/tools.md | 7 +- docs/zh/reference/tools.md | 7 +- packages/acp-adapter/src/kaos-acp.ts | 4 +- .../policies/default-tool-approve.ts | 1 - packages/agent-core/src/agent/tool/index.ts | 4 +- packages/agent-core/src/mcp/output.ts | 2 +- .../agent-core/src/profile/default/agent.yaml | 1 - .../agent-core/src/profile/default/coder.yaml | 1 - .../src/profile/default/explore.yaml | 1 - .../agent-core/src/profile/default/plan.yaml | 1 - .../src/tools/builtin/file/read-media.md | 13 - .../src/tools/builtin/file/read-media.ts | 273 ------------------ .../agent-core/src/tools/builtin/file/read.md | 17 +- .../agent-core/src/tools/builtin/file/read.ts | 248 ++++++++++++++-- .../agent-core/src/tools/builtin/index.ts | 1 - .../agent-core/test/agent/permission.test.ts | 10 - packages/agent-core/test/agent/turn.test.ts | 6 +- .../test/prompt-placeholders.test.ts | 1 - .../test/tools/builtin-current.test.ts | 3 +- .../test/tools/fixtures/fake-kaos.ts | 15 + .../agent-core/test/tools/read-file.test.ts | 3 +- .../test/tools/read-media-desc.test.ts | 46 --- .../agent-core/test/tools/read-media.test.ts | 268 +++++++++++------ packages/agent-core/test/tools/read.test.ts | 68 +++-- 31 files changed, 582 insertions(+), 530 deletions(-) create mode 100644 .changeset/merge-read-media-into-read.md delete mode 100644 packages/agent-core/src/tools/builtin/file/read-media.md delete mode 100644 packages/agent-core/src/tools/builtin/file/read-media.ts delete mode 100644 packages/agent-core/test/tools/read-media-desc.test.ts diff --git a/.changeset/merge-read-media-into-read.md b/.changeset/merge-read-media-into-read.md new file mode 100644 index 000000000..c5f021cab --- /dev/null +++ b/.changeset/merge-read-media-into-read.md @@ -0,0 +1,5 @@ +--- +"@moonshot-ai/kimi-code": minor +--- + +Merge media reading into the Read tool: image and video files are now returned as multimodal content directly, replacing the separate ReadMediaFile tool. diff --git a/apps/kimi-code/src/tui/components/messages/tool-renderers/chip.ts b/apps/kimi-code/src/tui/components/messages/tool-renderers/chip.ts index c7c8120f2..5aad37568 100644 --- a/apps/kimi-code/src/tui/components/messages/tool-renderers/chip.ts +++ b/apps/kimi-code/src/tui/components/messages/tool-renderers/chip.ts @@ -83,8 +83,13 @@ const editChip: ChipProvider = (toolCall) => { const writeChip: ChipProvider = (toolCall) => formatWriteChip(computeWriteStats(toolCall.args)); -const readChip: ChipProvider = (_toolCall, result) => - pluralize(countNonEmptyLines(result.output), 'line'); +const readChip: ChipProvider = (toolCall, result) => { + // Media reads carry a content-part envelope; readMediaChip returns '' + // for anything else, falling back to the text line count. + const media = readMediaChip(toolCall, result); + if (media !== '') return media; + return pluralize(countNonEmptyLines(result.output), 'line'); +}; const grepChip: ChipProvider = (_toolCall, result) => { const matches = countNonEmptyLines(result.output); @@ -118,6 +123,7 @@ const REGISTRY: Record = { Edit: editChip, Write: writeChip, Read: readChip, + // Pre-merge media tool — kept so recorded sessions still render. ReadMediaFile: readMediaChip, Grep: grepChip, Glob: globChip, diff --git a/apps/kimi-code/src/tui/components/messages/tool-renderers/media.ts b/apps/kimi-code/src/tui/components/messages/tool-renderers/media.ts index fd753cd27..af29ead42 100644 --- a/apps/kimi-code/src/tui/components/messages/tool-renderers/media.ts +++ b/apps/kimi-code/src/tui/components/messages/tool-renderers/media.ts @@ -1,12 +1,16 @@ /** - * ReadMediaFile renderer. + * Media result renderer. * - * The ReadMediaFile tool `output` is the JSON-serialized array of - * content parts the tool returned — which includes the full base64 of - * the image/video. Dumping that string into the transcript blasts a - * multi-screen blob of base64. This renderer parses the envelope and - * surfaces just the human-readable bits (kind, path, mime, size) via - * a header chip + a tiny expanded body. It never emits the base64. + * When Read hits an image or video, the tool `output` is the + * JSON-serialized array of content parts it returned — which includes + * the full base64 of the media. Dumping that string into the transcript + * blasts a multi-screen blob of base64. This renderer parses the + * envelope and surfaces just the human-readable bits (kind, path, mime, + * size) via a header chip + a tiny expanded body. It never emits the + * base64. Text reads fall through to the regular read summary. + * + * `ReadMediaFile` (the pre-merge media tool) keeps its registry entry so + * sessions recorded before the merge still render. * * On error, or when the output isn't the expected media envelope, we * fall back to the truncated renderer so the user still sees the raw @@ -18,6 +22,7 @@ import { Text } from '@earendil-works/pi-tui'; import chalk from 'chalk'; import type { ChipProvider } from './chip'; +import { readSummary } from './summary'; import { renderTruncated } from './truncated'; import type { ResultRenderer } from './types'; @@ -31,7 +36,8 @@ export interface ReadMediaSummary { } const PATH_TAG_RE = /^<(image|video)\s+path="([^"]+)">$/; -const ORIGINAL_SIZE_RE = /original size\s+(\d+x\d+px)/; +const ORIGINAL_SIZE_RE = + /original size\s+(\d+x\d+px)|original dimensions:\s+(\d+)x(\d+)\s+pixels?/i; const DATA_URL_RE = /^data:([^;]+);base64,(.*)$/s; function bytesFromBase64(b64: string): number { @@ -72,7 +78,13 @@ export function parseReadMediaOutput(output: string): ReadMediaSummary | null { continue; } const size = ORIGINAL_SIZE_RE.exec(text); - if (size) originalSize = size[1]; + if (size) { + if (size[1] !== undefined) { + originalSize = size[1]; + } else if (size[2] !== undefined && size[3] !== undefined) { + originalSize = `${size[2]}x${size[3]}px`; + } + } continue; } @@ -150,3 +162,14 @@ export const readMediaSummary: ResultRenderer = (toolCall, result, ctx) => { out.push(new Text(` ${dim(tail.join(' · '))}`, 0, 0)); return out; }; + +/** + * Read renders by content: a media envelope gets the media summary, + * anything else (numbered text lines, errors) the regular read summary. + */ +export const readOrMediaSummary: ResultRenderer = (toolCall, result, ctx) => { + if (!result.is_error && parseReadMediaOutput(result.output) !== null) { + return readMediaSummary(toolCall, result, ctx); + } + return readSummary(toolCall, result, ctx); +}; diff --git a/apps/kimi-code/src/tui/components/messages/tool-renderers/registry.ts b/apps/kimi-code/src/tui/components/messages/tool-renderers/registry.ts index 2a7b39539..931081c9c 100644 --- a/apps/kimi-code/src/tui/components/messages/tool-renderers/registry.ts +++ b/apps/kimi-code/src/tui/components/messages/tool-renderers/registry.ts @@ -10,7 +10,7 @@ * choose, so adding a new tool means appending one case. */ -import { readMediaSummary } from './media'; +import { readMediaSummary, readOrMediaSummary } from './media'; import { shellExecutionResultRenderer } from '../shell-execution'; import { goalSummary } from './goal'; import { @@ -18,7 +18,6 @@ import { fetchSummary, globSummary, grepSummary, - readSummary, thinkSummary, webSearchSummary, writeSummary, @@ -39,7 +38,8 @@ export function isGenericToolResult(toolName: string): boolean { export function pickResultRenderer(toolName: string): ResultRenderer { switch (toolName) { case 'Read': - return readSummary; + return readOrMediaSummary; + // Pre-merge media tool — kept so recorded sessions still render. case 'ReadMediaFile': return readMediaSummary; case 'Grep': diff --git a/apps/kimi-code/src/tui/utils/image-attachment-store.ts b/apps/kimi-code/src/tui/utils/image-attachment-store.ts index bac4ab8fb..a169b667c 100644 --- a/apps/kimi-code/src/tui/utils/image-attachment-store.ts +++ b/apps/kimi-code/src/tui/utils/image-attachment-store.ts @@ -6,7 +6,7 @@ * (640×480)]` / `[video #2 sample.mov]`). The placeholder is what the * user sees in the input field; on submit, `extractMediaAttachments` * walks the text and expands image placeholders to image content parts - * and video placeholders to file-path tags for `ReadMediaFile`. + * and video placeholders to file-path tags for `Read`. * * Scope is per-`KimiTUI` instance. Reloads (`/new`, `/clear`, * session switch) call `clear()` so ids restart from 1 and stale diff --git a/apps/kimi-code/src/tui/utils/image-placeholder.ts b/apps/kimi-code/src/tui/utils/image-placeholder.ts index 11c401f2f..977001035 100644 --- a/apps/kimi-code/src/tui/utils/image-placeholder.ts +++ b/apps/kimi-code/src/tui/utils/image-placeholder.ts @@ -9,7 +9,7 @@ * - Order is preserved for text/image/video segments. Image placeholders * expand to image content parts so the prompt reaches the provider * without relying on a model tool call. Video placeholders still expand - * to file-path tags so `ReadMediaFile` can own video upload behavior. + * to file-path tags so `Read` can own video upload behavior. * - Adjacent text segments are flattened — empty / whitespace-only * segments drop out so we never emit `{type:'text', text:' '}` * noise between two media parts. diff --git a/apps/kimi-code/test/tui/components/messages/tool-renderers/media.test.ts b/apps/kimi-code/test/tui/components/messages/tool-renderers/media.test.ts index ab4a53fd6..9da515eba 100644 --- a/apps/kimi-code/test/tui/components/messages/tool-renderers/media.test.ts +++ b/apps/kimi-code/test/tui/components/messages/tool-renderers/media.test.ts @@ -5,7 +5,9 @@ import { parseReadMediaOutput, readMediaChip, readMediaSummary, + readOrMediaSummary, } from '#/tui/components/messages/tool-renderers/media'; +import { pickChip } from '#/tui/components/messages/tool-renderers/chip'; import { darkColors } from '#/tui/theme/colors'; import type { ToolCallBlockData, ToolResultBlockData } from '#/tui/types'; @@ -35,10 +37,17 @@ const PNG_DATA_URL = `data:image/png;base64,${PNG_B64}`; function imageOutput(path: string, b64 = PNG_B64, mime = 'image/png'): string { return JSON.stringify([ + { + type: 'text', + text: + `Read image file. Mime type: ${mime}. Size: 70 bytes. ` + + 'Original dimensions: 1x1 pixels. If you need to output coordinates, ' + + 'output relative coordinates first and compute absolute coordinates using the original image size. ' + + 'If you generate or edit images or videos via commands or scripts, read the result back immediately before continuing.', + }, { type: 'text', text: `` }, { type: 'image_url', imageUrl: { url: `data:${mime};base64,${b64}` } }, { type: 'text', text: '' }, - { type: 'text', text: `Loaded image file "${path}" (${mime}, 70 bytes, original size 1x1px).` }, ]); } @@ -121,6 +130,7 @@ describe('readMediaSummary renderer', () => { ); expect(out).toContain('/tmp/a.png'); expect(out).toContain('image/png'); + expect(out).toContain('1x1px'); // Crucially: the base64 must never reach the screen. expect(out).not.toContain(PNG_B64); expect(out).not.toContain(PNG_DATA_URL); @@ -148,3 +158,36 @@ describe('readMediaSummary renderer', () => { expect(out).toContain('some plain string output'); }); }); + +describe('Read content dispatch', () => { + it('routes a media envelope to the media summary', () => { + const out = strip( + joinRender(readOrMediaSummary(call('Read'), result(imageOutput('/tmp/a.png')), expandedCtx)), + ); + expect(out).toContain('/tmp/a.png'); + expect(out).toContain('image/png'); + expect(out).not.toContain(PNG_B64); + }); + + it('routes numbered text lines to the regular read summary (empty collapsed body)', () => { + const out = joinRender(readOrMediaSummary(call('Read'), result('1\thello\n2\tworld'), ctx)); + expect(out.trim()).toBe(''); + }); + + it('Read chip shows media meta for media reads and line counts for text reads', () => { + const chip = pickChip('Read'); + expect(chip).toBeDefined(); + const mediaText = strip(chip!(call('Read'), result(imageOutput('/tmp/a.png')))); + expect(mediaText).toMatch(/image/); + expect(mediaText).toContain('image/png'); + const textText = strip(chip!(call('Read'), result('1\thello\n2\tworld'))); + expect(textText).toBe('2 lines'); + }); + + it('keeps the legacy ReadMediaFile chip entry for recorded sessions', () => { + const chip = pickChip('ReadMediaFile'); + expect(chip).toBeDefined(); + const text = strip(chip!(call('ReadMediaFile'), result(imageOutput('/tmp/a.png')))); + expect(text).toMatch(/image/); + }); +}); diff --git a/docs/en/reference/tools.md b/docs/en/reference/tools.md index 0856ff19e..33bf259e3 100644 --- a/docs/en/reference/tools.md +++ b/docs/en/reference/tools.md @@ -10,14 +10,13 @@ File tools handle reading, writing, and searching the local filesystem — the f | Tool | Default Approval | Description | | --- | --- | --- | -| `Read` | Auto-allow | Read a text file's contents | +| `Read` | Auto-allow | Read a text, image, or video file | | `Write` | Requires approval | Create or overwrite a file | | `Edit` | Requires approval | Precise string replacement | | `Grep` | Auto-allow | Full-text search powered by ripgrep | | `Glob` | Auto-allow | Find files by glob pattern | -| `ReadMediaFile` | Auto-allow | Read an image or video file | -**`Read`** accepts a file path (`path`) plus optional `line_offset` (starting line number; negative values count from the end) and `n_lines` (maximum number of lines to read). Returns at most 1000 lines or 100 KB per call; content beyond that limit is accompanied by a truncation notice. If the file is an image or video, the tool suggests using `ReadMediaFile` instead. +**`Read`** accepts a file path (`path`) plus optional `line_offset` (starting line number; negative values count from the end) and `n_lines` (maximum number of lines to read). The file kind is detected by extension and magic bytes: text files return at most 1000 lines or 100 KB per call, with a truncation notice beyond that limit; images and videos are sent to the model as multimodal content (`line_offset` / `n_lines` are ignored for them) with a 100 MB size limit, subject to the current model's vision capabilities (`image_in` / `video_in`). **`Write`** accepts `path`, `content`, and an optional `mode` (`overwrite` or `append`; defaults to overwrite). The parent directory must already exist; `append` mode appends content to the end of the file without automatically adding a newline. @@ -27,8 +26,6 @@ File tools handle reading, writing, and searching the local filesystem — the f **`Glob`** matches files in a specified directory (`path`; defaults to the working directory) by glob pattern (`pattern`). Results are sorted by modification time in descending order, with a maximum of 1000 entries. Pure wildcard patterns (e.g., `**`) and patterns containing brace expansion (`{a,b,c}`) are rejected. -**`ReadMediaFile`** sends an image or video to the model as multimodal content. Accepts only `path`; the file size limit is 100 MB. Availability depends on the current model's vision capabilities (`image_in` / `video_in`). - ## Shell | Tool | Default Approval | Description | diff --git a/docs/zh/reference/tools.md b/docs/zh/reference/tools.md index 151ee273c..181a874d6 100644 --- a/docs/zh/reference/tools.md +++ b/docs/zh/reference/tools.md @@ -10,14 +10,13 @@ | 工具 | 默认审批 | 说明 | | --- | --- | --- | -| `Read` | 自动放行 | 读取文本文件内容 | +| `Read` | 自动放行 | 读取文本、图片或视频文件 | | `Write` | 需审批 | 创建或覆盖文件 | | `Edit` | 需审批 | 精确字符串替换 | | `Grep` | 自动放行 | 基于 ripgrep 的全文搜索 | | `Glob` | 自动放行 | 按 glob 模式查找文件 | -| `ReadMediaFile` | 自动放行 | 读取图片或视频文件 | -**`Read`** 接受文件路径(`path`)以及可选的 `line_offset`(起始行号,支持负数从末尾倒数)和 `n_lines`(读取行数上限)。单次最多返回 1000 行或 100 KB,超出部分会附带截断提示。如果文件是图片或视频,工具会提示改用 `ReadMediaFile`。 +**`Read`** 接受文件路径(`path`)以及可选的 `line_offset`(起始行号,支持负数从末尾倒数)和 `n_lines`(读取行数上限)。文件类型由扩展名和魔数自动识别:文本文件单次最多返回 1000 行或 100 KB,超出部分会附带截断提示;图片和视频以多模态内容发送给模型(`line_offset` / `n_lines` 对其无效),文件大小上限 100 MB,是否支持取决于当前模型的视觉能力(`image_in` / `video_in`)。 **`Write`** 接受 `path`、`content` 和可选的 `mode`(`overwrite` 或 `append`,默认覆盖)。父目录必须已存在;`append` 模式将内容追加到文件末尾,不自动添加换行。 @@ -27,8 +26,6 @@ **`Glob`** 按 glob 模式(`pattern`)在指定目录(`path`,默认工作目录)中匹配文件,结果按修改时间倒序排列,最多返回 1000 条。纯通配符模式(如 `**`)和含花括号扩展(`{a,b,c}`)的模式会被拒绝。 -**`ReadMediaFile`** 将图片或视频以多模态内容发送给模型,仅接受 `path`,文件大小上限 100 MB。是否可用取决于当前模型的视觉能力(`image_in` / `video_in`)。 - ## Shell | 工具 | 默认审批 | 说明 | diff --git a/packages/acp-adapter/src/kaos-acp.ts b/packages/acp-adapter/src/kaos-acp.ts index 14337d64d..9de84f304 100644 --- a/packages/acp-adapter/src/kaos-acp.ts +++ b/packages/acp-adapter/src/kaos-acp.ts @@ -133,7 +133,7 @@ export class AcpKaos implements Kaos { /** * Binary reads bypass the ACP text RPC by design: `fs/readTextFile` * returns a decoded string and would corrupt or reject non-UTF-8 - * payloads (images, video, archives — anything `ReadMediaFile` may + * payloads (images, video, archives — anything the media read path may * touch). The ACP bridge only owns the *text* surface; raw bytes * stay on the local filesystem via `inner`. */ @@ -144,7 +144,7 @@ export class AcpKaos implements Kaos { /** * Return a small UTF-8 header derived from the same ACP text source as * `readText` / `readLines`, used only by text-read callers for sniffing. - * Keep `readBytes` local so binary callers such as ReadMediaFile stay safe. + * Keep `readBytes` local so binary callers such as Read's media path stay safe. */ async readTextPreview(path: string, n: number): Promise { const text = await this.readText(path); diff --git a/packages/agent-core/src/agent/permission/policies/default-tool-approve.ts b/packages/agent-core/src/agent/permission/policies/default-tool-approve.ts index 2f8355ce0..6989fd229 100644 --- a/packages/agent-core/src/agent/permission/policies/default-tool-approve.ts +++ b/packages/agent-core/src/agent/permission/policies/default-tool-approve.ts @@ -4,7 +4,6 @@ const DEFAULT_APPROVE_TOOLS = new Set([ 'Read', 'Grep', 'Glob', - 'ReadMediaFile', 'SetTodoList', 'TodoList', 'TaskList', diff --git a/packages/agent-core/src/agent/tool/index.ts b/packages/agent-core/src/agent/tool/index.ts index 33679f88c..37dd1377c 100644 --- a/packages/agent-core/src/agent/tool/index.ts +++ b/packages/agent-core/src/agent/tool/index.ts @@ -378,7 +378,7 @@ export class ToolManager { const goalToolsEnabled = this.agent.type === 'main'; this.builtinTools = new Map( [ - new b.ReadTool(kaos, workspace), + new b.ReadTool(kaos, workspace, modelCapabilities, videoUploader), new b.WriteTool(kaos, workspace), new b.EditTool(kaos, workspace), new b.GrepTool(kaos, workspace), @@ -386,8 +386,6 @@ export class ToolManager { new b.BashTool(kaos, cwd, background, { allowBackground, }), - (modelCapabilities.image_in || modelCapabilities.video_in) && - new b.ReadMediaFileTool(kaos, workspace, modelCapabilities, videoUploader), new b.EnterPlanModeTool(this.agent), new b.ExitPlanModeTool(this.agent), // Goal tools are main-agent-only. diff --git a/packages/agent-core/src/mcp/output.ts b/packages/agent-core/src/mcp/output.ts index 9832b04d2..dba169f31 100644 --- a/packages/agent-core/src/mcp/output.ts +++ b/packages/agent-core/src/mcp/output.ts @@ -7,7 +7,7 @@ * (dropping unsupported shapes). * 2. Wrap media-only outputs in `` tags so the * model can attribute binary output when several tools return media. - * Mirrors the in-tree `ReadMediaFile` convention. + * Mirrors the in-tree `Read` media convention. * 3. Apply size limits: text/think share a 100K character budget; binary * parts (image/audio/video URLs) each carry an independent 10 MB cap and * collapse to a notice when oversize, so a single screenshot cannot diff --git a/packages/agent-core/src/profile/default/agent.yaml b/packages/agent-core/src/profile/default/agent.yaml index 794698b97..ec9a744ee 100644 --- a/packages/agent-core/src/profile/default/agent.yaml +++ b/packages/agent-core/src/profile/default/agent.yaml @@ -18,7 +18,6 @@ tools: - CronCreate - CronList - CronDelete - - ReadMediaFile - TodoList - Skill - WebSearch diff --git a/packages/agent-core/src/profile/default/coder.yaml b/packages/agent-core/src/profile/default/coder.yaml index 22780c65c..cb33b4500 100644 --- a/packages/agent-core/src/profile/default/coder.yaml +++ b/packages/agent-core/src/profile/default/coder.yaml @@ -8,7 +8,6 @@ whenToUse: | tools: - Bash - Read - - ReadMediaFile - Glob - Grep - Write diff --git a/packages/agent-core/src/profile/default/explore.yaml b/packages/agent-core/src/profile/default/explore.yaml index 84d024e58..551a2937f 100644 --- a/packages/agent-core/src/profile/default/explore.yaml +++ b/packages/agent-core/src/profile/default/explore.yaml @@ -29,7 +29,6 @@ whenToUse: | tools: - Bash - Read - - ReadMediaFile - Glob - Grep - WebSearch diff --git a/packages/agent-core/src/profile/default/plan.yaml b/packages/agent-core/src/profile/default/plan.yaml index beb3e1c2d..88923bbfc 100644 --- a/packages/agent-core/src/profile/default/plan.yaml +++ b/packages/agent-core/src/profile/default/plan.yaml @@ -12,7 +12,6 @@ whenToUse: | Use this agent when the parent agent needs a step-by-step implementation plan, key file identification, and architectural trade-off analysis before code changes are made. tools: - Read - - ReadMediaFile - Glob - Grep - WebSearch diff --git a/packages/agent-core/src/tools/builtin/file/read-media.md b/packages/agent-core/src/tools/builtin/file/read-media.md deleted file mode 100644 index 0ff49fb6c..000000000 --- a/packages/agent-core/src/tools/builtin/file/read-media.md +++ /dev/null @@ -1,13 +0,0 @@ -Read media content from a file. - -**Tips:** -- Make sure you follow the description of each tool parameter. -- A `` tag is given before the file content; it summarizes the mime type, byte size and, for images, the original pixel dimensions. When outputting coordinates, give relative coordinates first and compute absolute coordinates from the original image size. After generating or editing media via commands or scripts, read the result back before continuing. -- The system will notify you when there is anything wrong when reading the file. -- This tool is a tool that you typically want to use in parallel. Always read multiple files in one response when possible. -- This tool can only read image or video files. To read text files, use the Read tool. To list directories, use `ls` via Bash for a known directory, or Glob for pattern search. -- If the file doesn't exist or path is invalid, an error will be returned. -- The maximum size that can be read is {{ MAX_MEDIA_MEGABYTES }}MB. An error will be returned if the file is larger than this limit. -- The media content will be returned in a form that you can directly view and understand. - -**Capabilities** \ No newline at end of file diff --git a/packages/agent-core/src/tools/builtin/file/read-media.ts b/packages/agent-core/src/tools/builtin/file/read-media.ts deleted file mode 100644 index 3b206d247..000000000 --- a/packages/agent-core/src/tools/builtin/file/read-media.ts +++ /dev/null @@ -1,273 +0,0 @@ -/** - * ReadMediaFileTool — read image/video files as multi-modal content. - * - * Returns a 4-part wrap: - * `[TextPart(''), TextPart(''), - * ImageContent|VideoContent, TextPart('')]` - * and gates on the model's `image_in` / `video_in` capability. - * - * The leading `` block summarizes mime type, byte size and (for - * images) original pixel dimensions, guides the model to derive absolute - * coordinates from that original size, and reminds it to re-read any media - * it generates or edits. - * - * Path safety: goes through the shared path access resolver used by - * Read/Write/Edit. - */ - -import type { Kaos } from '@moonshot-ai/kaos'; -import type { - ContentPart, - ModelCapability, - VideoURLPart, - VideoUploadInput as ProviderVideoUploadInput, -} from '@moonshot-ai/kosong'; -import { z } from 'zod'; - -import type { BuiltinTool } from '../../../agent/tool'; -import { ToolAccesses } from '../../../loop/tool-access'; -import type { ExecutableToolResult, ToolExecution } from '../../../loop/types'; -import { renderPrompt } from '../../../utils/render-prompt'; -import { resolvePathAccessPath } from '../../policies/path-access'; -import { MEDIA_SNIFF_BYTES, detectFileType, sniffImageDimensions } from '../../support/file-type'; -import { toInputJsonSchema } from '../../support/input-schema'; -import { literalRulePattern, matchesPathRuleSubject } from '../../support/rule-match'; -import type { WorkspaceConfig } from '../../support/workspace'; -import readMediaDescriptionHead from './read-media.md'; - -// ── Constants ──────────────────────────────────────────────────────── - -const MAX_MEDIA_MEGABYTES = 100; -const MAX_MEDIA_BYTES = MAX_MEDIA_MEGABYTES * 1024 * 1024; - -export type VideoUploadInput = ProviderVideoUploadInput; - -export type VideoUploader = (input: VideoUploadInput) => Promise; - -// ── Input schema ───────────────────────────────────────────────────── - -export const ReadMediaFileInputSchema = z.object({ - path: z - .string() - .describe( - 'Path to an image or video file. Relative paths resolve against the working directory; ' + - 'a path outside the working directory must be absolute. ' + - 'Directories and text files are not supported.', - ), -}); - -export type ReadMediaFileInput = z.Infer; - -// ── Tool description (capability-driven) ───────────────────────────── - -function buildDescription(capabilities: ModelCapability): string { - const head = renderPrompt(readMediaDescriptionHead, { MAX_MEDIA_MEGABYTES }); - const lines: string[] = [head]; - const hasImage = capabilities.image_in; - const hasVideo = capabilities.video_in; - if (hasImage && hasVideo) { - lines.push('- This tool supports image and video files for the current model.'); - } else if (hasImage) { - lines.push( - '- This tool supports image files for the current model.', - '- Video files are not supported by the current model.', - ); - } else if (hasVideo) { - lines.push( - '- This tool supports video files for the current model.', - '- Image files are not supported by the current model.', - ); - } else { - lines.push('- The current model does not support image or video input.'); - } - return lines.join('\n'); -} - -// ── System summary ─────────────────────────────────────────────────── - -/** - * Build the `` summary that precedes the media content. - * - * Carries mime type, byte size and (for images) the original pixel - * dimensions. When the dimensions are known it also guides the model to - * derive absolute coordinates from that original size; it always reminds - * the model to re-read any media it generates or edits. - */ -function buildSystemSummary(input: { - readonly kind: 'image' | 'video'; - readonly mimeType: string; - readonly byteSize: number; - readonly dimensions: { readonly width: number; readonly height: number } | null; -}): string { - const parts: string[] = [ - `Read ${input.kind} file.`, - `Mime type: ${input.mimeType}.`, - `Size: ${String(input.byteSize)} bytes.`, - ]; - // Coordinate guidance is only emitted when the original size is actually - // known — sniffing fails for some image formats (TIFF/ICO/HEIC/…), and - // telling the model to use a size that is not in the block would mislead it. - if (input.kind === 'image' && input.dimensions) { - parts.push( - `Original dimensions: ${String(input.dimensions.width)}x${String(input.dimensions.height)} pixels.`, - 'If you need to output coordinates, output relative coordinates first ' + - 'and compute absolute coordinates using the original image size.', - ); - } - parts.push( - 'If you generate or edit images or videos via commands or scripts, ' + - 'read the result back immediately before continuing.', - ); - return `${parts.join(' ')}`; -} - -// ── Implementation ─────────────────────────────────────────────────── - -export class ReadMediaFileTool implements BuiltinTool { - readonly name = 'ReadMediaFile' as const; - readonly description: string; - readonly parameters: Record = toInputJsonSchema(ReadMediaFileInputSchema); - constructor( - private readonly kaos: Kaos, - private readonly workspace: WorkspaceConfig, - private readonly capabilities: ModelCapability, - private readonly videoUploader?: VideoUploader | undefined, - ) { - if (!capabilities.image_in && !capabilities.video_in) { - const skip = new Error('ReadMediaFile requires image_in or video_in capability'); - skip.name = 'SkipThisTool'; - throw skip; - } - this.description = buildDescription(capabilities); - } - - resolveExecution(args: ReadMediaFileInput): ToolExecution { - const path = resolvePathAccessPath(args.path, { - kaos: this.kaos, - workspace: this.workspace, - operation: 'read', - }); - return { - accesses: ToolAccesses.readFile(path), - description: `Reading media: ${args.path}`, - display: { kind: 'file_io', operation: 'read', path }, - approvalRule: literalRulePattern(this.name, path), - matchesRule: (ruleArgs) => - matchesPathRuleSubject(ruleArgs, path, { - cwd: this.workspace.workspaceDir, - pathClass: this.kaos.pathClass(), - homeDir: this.kaos.gethome(), - }), - execute: () => this.execution(args, path), - }; - } - - private async execution( - args: ReadMediaFileInput, - safePath: string, - ): Promise { - if (!args.path) { - return { isError: true, output: 'File path cannot be empty.' }; - } - - try { - // Sniff header first — read the first 512 bytes before deciding - // anything about MIME. - const header = await this.kaos.readBytes(safePath, MEDIA_SNIFF_BYTES); - const fileType = detectFileType(safePath, header); - - if (fileType.kind === 'text') { - return { - isError: true, - output: `"${args.path}" is a text file. Use Read to read text files.`, - }; - } - if (fileType.kind === 'unknown') { - return { - isError: true, - output: - `"${args.path}" is not a supported image or video file. ` + - 'Use Read for text files, or Bash or an MCP tool for other binary formats.', - }; - } - - if (fileType.kind === 'image' && !this.capabilities.image_in) { - return { - isError: true, - output: - 'The current model does not support image input. ' + - 'Tell the user to use a model with image input capability.', - }; - } - if (fileType.kind === 'video' && !this.capabilities.video_in) { - return { - isError: true, - output: - 'The current model does not support video input. ' + - 'Tell the user to use a model with video input capability.', - }; - } - - const stat = await this.kaos.stat(safePath); - if (stat.stSize === 0) { - return { isError: true, output: `"${args.path}" is empty.` }; - } - if (stat.stSize > MAX_MEDIA_BYTES) { - return { - isError: true, - output: - `"${args.path}" is ${String(stat.stSize)} bytes, which exceeds the ` + - `maximum ${String(MAX_MEDIA_MEGABYTES)}MB for media files.`, - }; - } - - const data = await this.kaos.readBytes(safePath); - const base64 = data.toString('base64'); - let mediaPart: ContentPart; - if (fileType.kind === 'image') { - mediaPart = { - type: 'image_url', - imageUrl: { url: `data:${fileType.mimeType};base64,${base64}` }, - }; - } else if (this.videoUploader !== undefined) { - mediaPart = await this.videoUploader({ - data, - mimeType: fileType.mimeType, - filename: safePath.split(/[\\/]/).at(-1), - }); - } else { - mediaPart = { - type: 'video_url', - videoUrl: { url: `data:${fileType.mimeType};base64,${base64}` }, - }; - } - - const tag = fileType.kind === 'image' ? 'image' : 'video'; - const openText = `<${tag} path="${safePath}">`; - const closeText = ``; - - const dimensions = - fileType.kind === 'image' ? sniffImageDimensions(data) : null; - const systemText = buildSystemSummary({ - kind: fileType.kind, - mimeType: fileType.mimeType, - byteSize: stat.stSize, - dimensions, - }); - - const output: ContentPart[] = [ - { type: 'text', text: systemText }, - { type: 'text', text: openText }, - mediaPart, - { type: 'text', text: closeText }, - ]; - - return { output, isError: false }; - } catch (error) { - return { - isError: true, - output: `Failed to read ${args.path}: ${error instanceof Error ? error.message : String(error)}`, - }; - } - } -} diff --git a/packages/agent-core/src/tools/builtin/file/read.md b/packages/agent-core/src/tools/builtin/file/read.md index 79bdda810..9bf0475e3 100644 --- a/packages/agent-core/src/tools/builtin/file/read.md +++ b/packages/agent-core/src/tools/builtin/file/read.md @@ -1,17 +1,20 @@ -Read a text file from the local filesystem. +Read a file from the local filesystem. Text files return numbered lines; image and video files return media content you can view directly, subject to the model capabilities listed at the end. -If the user provides a concrete file path to a text file, call Read directly. Do not `Glob`, `ls`, or otherwise pre-check known text file paths; missing or invalid file paths return errors you can handle. Do not use Read for directories; use `ls` via Bash for a known directory, or Glob when you need files/directories matching a pattern. Use `Grep` only when the task is to search for unknown content or locations. +If the user provides a concrete file path, call Read directly. Do not `Glob`, `ls`, or otherwise pre-check known file paths; missing or invalid file paths return errors you can handle. Do not use Read for directories; use `ls` via Bash for a known directory, or Glob when you need files/directories matching a pattern. Use `Grep` only when the task is to search for unknown content or locations. When you need several files, prefer to read them in parallel: emit multiple `Read` calls in a single response instead of reading one file per turn. - Relative paths resolve against the working directory; a path outside the working directory must be absolute. -- Returns up to {{ MAX_LINES }} lines or {{ MAX_BYTES_KB }} KB per call, whichever comes first; lines longer than {{ MAX_LINE_LENGTH }} chars are truncated mid-line. -- Page larger files with `line_offset` (1-based start line) and `n_lines`. Omit `n_lines` to read up to the {{ MAX_LINES }}-line cap. +- Text files return up to {{ MAX_LINES }} lines or {{ MAX_BYTES_KB }} KB per call, whichever comes first; lines longer than {{ MAX_LINE_LENGTH }} chars are truncated mid-line. +- Page larger text files with `line_offset` (1-based start line) and `n_lines`. Omit `n_lines` to read up to the {{ MAX_LINES }}-line cap. - Sensitive files (`.env` files, credential stores, SSH keys, and similar secrets) are refused to protect secrets; do not attempt to read them. -- Only UTF-8 text files can be read. Non-UTF-8 encodings, binary files, and files containing NUL bytes are refused; use `ReadMediaFile` for images or video, and Bash or an MCP tool for other binary formats. +- Only UTF-8 text files can be read as text. Non-UTF-8 encodings and binary files that are not images or videos are refused; use Bash or an MCP tool for those formats. - Negative line_offset reads from the end of the file (for example, -100 reads the last 100 lines); the absolute value cannot exceed {{ MAX_LINES }}. -- Output format: `\t` per line. -- A `...` status block is appended after the file content; it summarizes how much was read (line and byte counts, truncation, line-ending notes) and is not part of the file itself. +- Text output format: `\t` per line. +- A `...` status block is appended after the file content for text reads; it summarizes how much was read (line and byte counts, truncation, line-ending notes) and is not part of the file itself. - Pure CRLF files are displayed with LF line endings; `Edit` matches this output and preserves CRLF when writing back. - Mixed or lone carriage-return line endings are shown as `\r` and require exact `Edit.old_string` escapes. +- Image and video files are detected by extension and magic bytes and returned as media content; `line_offset` and `n_lines` are ignored for them. The maximum media file size is {{ MAX_MEDIA_MEGABYTES }}MB. +- Media content is preceded by a `` block summarizing the mime type, byte size and, for images, the original pixel dimensions. When outputting coordinates, give relative coordinates first and compute absolute coordinates from the original image size. +- After generating or editing an image or video via commands or scripts, read the result back immediately before continuing. - After a successful `Edit`/`Write`, do not re-read solely to prove the write landed. When the task depends on an exact file, API, or output shape, inspect the final external contract before finishing. diff --git a/packages/agent-core/src/tools/builtin/file/read.ts b/packages/agent-core/src/tools/builtin/file/read.ts index e9aa6472f..b477b5554 100644 --- a/packages/agent-core/src/tools/builtin/file/read.ts +++ b/packages/agent-core/src/tools/builtin/file/read.ts @@ -1,4 +1,27 @@ +/** + * ReadTool — read a file from the local filesystem. + * + * Text files return numbered lines plus a trailing `` status + * block. Image and video files return a 4-part multi-modal wrap: + * `[TextPart(''), TextPart(''), + * ImageContent|VideoContent, TextPart('')]` + * gated on the model's `image_in` / `video_in` capability. The file kind + * is decided by extension + magic-byte sniffing, so the model never has + * to guess the kind before calling. + * + * The leading media `` block summarizes mime type, byte size and + * (for images) original pixel dimensions, guides the model to derive + * absolute coordinates from that original size, and reminds it to + * re-read any media it generates or edits. + */ + import type { Kaos, StatResult } from '@moonshot-ai/kaos'; +import type { + ContentPart, + ModelCapability, + VideoURLPart, + VideoUploadInput as ProviderVideoUploadInput, +} from '@moonshot-ai/kosong'; import { z } from 'zod'; import type { BuiltinTool } from '../../../agent/tool'; @@ -6,7 +29,7 @@ import { ToolAccesses } from '../../../loop/tool-access'; import type { ExecutableToolResult, ToolExecution } from '../../../loop/types'; import { renderPrompt } from '../../../utils/render-prompt'; import { resolvePathAccessPath } from '../../policies/path-access'; -import { MEDIA_SNIFF_BYTES, detectFileType } from '../../support/file-type'; +import { MEDIA_SNIFF_BYTES, detectFileType, sniffImageDimensions } from '../../support/file-type'; import { toInputJsonSchema } from '../../support/input-schema'; import { literalRulePattern, matchesPathRuleSubject } from '../../support/rule-match'; import type { WorkspaceConfig } from '../../support/workspace'; @@ -16,9 +39,15 @@ import readDescriptionTemplate from './read.md'; export const MAX_LINES: number = 1000; export const MAX_LINE_LENGTH: number = 2000; export const MAX_BYTES: number = 100 * 1024; +export const MAX_MEDIA_MEGABYTES: number = 100; +const MAX_MEDIA_BYTES = MAX_MEDIA_MEGABYTES * 1024 * 1024; const S_IFMT = 0o170000; const S_IFREG = 0o100000; +export type VideoUploadInput = ProviderVideoUploadInput; + +export type VideoUploader = (input: VideoUploadInput) => Promise; + const PositiveLineOffsetSchema = z.number().int().min(1); const TailLineOffsetSchema = z.number().int().min(-MAX_LINES).max(-1); @@ -26,13 +55,13 @@ export const ReadInputSchema = z.object({ path: z .string() .describe( - 'Path to a text file. Relative paths resolve against the working directory; a path outside the working directory must be absolute. Directories are not supported; use `ls` via Bash for a known directory, or Glob for pattern search.', + 'Path to a file. Relative paths resolve against the working directory; a path outside the working directory must be absolute. Directories are not supported; use `ls` via Bash for a known directory, or Glob for pattern search.', ), line_offset: z .union([PositiveLineOffsetSchema, TailLineOffsetSchema]) .optional() .describe( - `The line number to start reading from. Omit to start at line 1. Negative values read from the end of the file; the absolute value cannot exceed ${String(MAX_LINES)}.`, + `The line number to start reading from. Omit to start at line 1. Negative values read from the end of the file; the absolute value cannot exceed ${String(MAX_LINES)}. Ignored for image and video files.`, ), n_lines: z .number() @@ -40,7 +69,7 @@ export const ReadInputSchema = z.object({ .positive() .optional() .describe( - `The number of lines to read; the tool also applies its internal cap. Omit to read up to the internal cap of ${String(MAX_LINES)} lines.`, + `The number of lines to read; the tool also applies its internal cap. Omit to read up to the internal cap of ${String(MAX_LINES)} lines. Ignored for image and video files.`, ), }); @@ -80,11 +109,19 @@ interface FinishReadResultInput { } type TextPreviewKaos = Kaos & { - readTextPreview?: (path: string, n: number) => Promise; + readTextPreview: (path: string, n: number) => Promise; }; -async function readTextHeader(kaos: TextPreviewKaos, path: string, n: number): Promise { - if (kaos.readTextPreview !== undefined) { +function isMediaFileType(kind: 'text' | 'image' | 'video' | 'unknown'): kind is 'image' | 'video' { + return kind === 'image' || kind === 'video'; +} + +function hasTextPreview(kaos: Kaos): kaos is TextPreviewKaos { + return typeof (kaos as { readTextPreview?: unknown }).readTextPreview === 'function'; +} + +async function readTextHeader(kaos: Kaos, path: string, n: number): Promise { + if (hasTextPreview(kaos)) { return kaos.readTextPreview(path, n); } return kaos.readBytes(path, n); @@ -166,25 +203,88 @@ function containsNulByte(text: string): boolean { function notReadableFileOutput(path: string): string { return ( `"${path}" is not readable as UTF-8 text. ` + - 'If it is an image or video, use ReadMediaFile. ' + - 'For other binary formats, use Bash or an MCP tool if available.' + 'For binary formats, use Bash or an MCP tool if available.' ); } -const READ_DESCRIPTION = renderPrompt(readDescriptionTemplate, { - MAX_LINES, - MAX_BYTES_KB: MAX_BYTES / 1024, - MAX_LINE_LENGTH, -}); +function buildDescription(capabilities: ModelCapability): string { + const head = renderPrompt(readDescriptionTemplate, { + MAX_LINES, + MAX_BYTES_KB: MAX_BYTES / 1024, + MAX_LINE_LENGTH, + MAX_MEDIA_MEGABYTES, + }); + const lines: string[] = [head]; + const hasImage = capabilities.image_in; + const hasVideo = capabilities.video_in; + if (hasImage && hasVideo) { + lines.push('- This tool supports image and video files for the current model.'); + } else if (hasImage) { + lines.push( + '- This tool supports image files for the current model.', + '- Video files are not supported by the current model.', + ); + } else if (hasVideo) { + lines.push( + '- This tool supports video files for the current model.', + '- Image files are not supported by the current model.', + ); + } else { + lines.push( + '- The current model does not support image or video input; reading an image or video file returns an error.', + ); + } + return lines.join('\n'); +} + +/** + * Build the `` summary that precedes the media content. + * + * Carries mime type, byte size and (for images) the original pixel + * dimensions. When the dimensions are known it also guides the model to + * derive absolute coordinates from that original size; it always reminds + * the model to re-read any media it generates or edits. + */ +function buildSystemSummary(input: { + readonly kind: 'image' | 'video'; + readonly mimeType: string; + readonly byteSize: number; + readonly dimensions: { readonly width: number; readonly height: number } | null; +}): string { + const parts: string[] = [ + `Read ${input.kind} file.`, + `Mime type: ${input.mimeType}.`, + `Size: ${String(input.byteSize)} bytes.`, + ]; + // Coordinate guidance is only emitted when the original size is actually + // known — sniffing fails for some image formats (TIFF/ICO/HEIC/…), and + // telling the model to use a size that is not in the block would mislead it. + if (input.kind === 'image' && input.dimensions) { + parts.push( + `Original dimensions: ${String(input.dimensions.width)}x${String(input.dimensions.height)} pixels.`, + 'If you need to output coordinates, output relative coordinates first ' + + 'and compute absolute coordinates using the original image size.', + ); + } + parts.push( + 'If you generate or edit images or videos via commands or scripts, ' + + 'read the result back immediately before continuing.', + ); + return `${parts.join(' ')}`; +} export class ReadTool implements BuiltinTool { readonly name = 'Read' as const; - readonly description = READ_DESCRIPTION; + readonly description: string; readonly parameters: Record = toInputJsonSchema(ReadInputSchema); constructor( private readonly kaos: Kaos, private readonly workspace: WorkspaceConfig, - ) {} + private readonly capabilities: ModelCapability, + private readonly videoUploader?: VideoUploader | undefined, + ) { + this.description = buildDescription(capabilities); + } resolveExecution(args: ReadInput): ToolExecution { const path = resolvePathAccessPath(args.path, { @@ -222,13 +322,18 @@ export class ReadTool implements BuiltinTool { return { isError: true, output: `"${args.path}" is not a file.` }; } - const header = await readTextHeader(this.kaos, safePath, MEDIA_SNIFF_BYTES); - const fileType = detectFileType(safePath, header); - if (fileType.kind === 'image' || fileType.kind === 'video') { - return { - isError: true, - output: `"${args.path}" is a ${fileType.kind} file. Use ReadMediaFile to read image or video files.`, - }; + const fileType = await this.detectFileTypeForRead(safePath); + if (isMediaFileType(fileType.kind)) { + try { + return await this.readMedia(args, safePath, fileType.kind, fileType.mimeType, stat); + } catch (error) { + // Media failures surface provider errors (e.g. a failed video + // upload); a bare message loses which file and step failed. + return { + isError: true, + output: `Failed to read ${args.path}: ${error instanceof Error ? error.message : String(error)}`, + }; + } } if (fileType.kind === 'unknown') { return { @@ -268,6 +373,103 @@ export class ReadTool implements BuiltinTool { } } + private async detectFileTypeForRead( + safePath: string, + ): Promise> { + const extensionHint = detectFileType(safePath); + if (isMediaFileType(extensionHint.kind)) { + const rawHeader = await this.kaos.readBytes(safePath, MEDIA_SNIFF_BYTES); + return detectFileType(safePath, rawHeader); + } + + try { + const header = await readTextHeader(this.kaos, safePath, MEDIA_SNIFF_BYTES); + return detectFileType(safePath, header); + } catch (error) { + if (!hasTextPreview(this.kaos)) throw error; + const rawHeader = await this.kaos.readBytes(safePath, MEDIA_SNIFF_BYTES); + return detectFileType(safePath, rawHeader); + } + } + + private async readMedia( + args: ReadInput, + safePath: string, + kind: 'image' | 'video', + mimeType: string, + stat: StatResult, + ): Promise { + if (kind === 'image' && !this.capabilities.image_in) { + return { + isError: true, + output: + 'The current model does not support image input. ' + + 'Tell the user to use a model with image input capability.', + }; + } + if (kind === 'video' && !this.capabilities.video_in) { + return { + isError: true, + output: + 'The current model does not support video input. ' + + 'Tell the user to use a model with video input capability.', + }; + } + + if (stat.stSize === 0) { + return { isError: true, output: `"${args.path}" is empty.` }; + } + if (stat.stSize > MAX_MEDIA_BYTES) { + return { + isError: true, + output: + `"${args.path}" is ${String(stat.stSize)} bytes, which exceeds the ` + + `maximum ${String(MAX_MEDIA_MEGABYTES)}MB for media files.`, + }; + } + + const data = await this.kaos.readBytes(safePath); + const base64 = data.toString('base64'); + let mediaPart: ContentPart; + if (kind === 'image') { + mediaPart = { + type: 'image_url', + imageUrl: { url: `data:${mimeType};base64,${base64}` }, + }; + } else if (this.videoUploader !== undefined) { + mediaPart = await this.videoUploader({ + data, + mimeType, + filename: safePath.split(/[\\/]/).at(-1), + }); + } else { + mediaPart = { + type: 'video_url', + videoUrl: { url: `data:${mimeType};base64,${base64}` }, + }; + } + + const openText = `<${kind} path="${safePath}">`; + const closeText = ``; + + const dimensions = kind === 'image' ? sniffImageDimensions(data) : null; + const systemText = buildSystemSummary({ + kind, + mimeType, + byteSize: stat.stSize, + dimensions, + }); + + const output: ContentPart[] = [ + { type: 'text', text: systemText }, + { type: 'text', text: openText }, + mediaPart, + { type: 'text', text: closeText }, + ]; + + return { output, isError: false }; + } + private async readForward( safePath: string, displayPath: string, diff --git a/packages/agent-core/src/tools/builtin/index.ts b/packages/agent-core/src/tools/builtin/index.ts index 744f90c6f..cb47081b1 100644 --- a/packages/agent-core/src/tools/builtin/index.ts +++ b/packages/agent-core/src/tools/builtin/index.ts @@ -12,7 +12,6 @@ export * from './file/edit'; export * from './file/glob'; export * from './file/grep'; export * from './file/read'; -export * from './file/read-media'; export * from './file/write'; export * from './goal/create-goal'; export * from './goal/get-goal'; diff --git a/packages/agent-core/test/agent/permission.test.ts b/packages/agent-core/test/agent/permission.test.ts index 1246a21ff..5c3646807 100644 --- a/packages/agent-core/test/agent/permission.test.ts +++ b/packages/agent-core/test/agent/permission.test.ts @@ -472,7 +472,6 @@ describe('Permission auto mode', () => { (['manual', 'yolo'] as const).flatMap((mode) => [ [mode, 'Read', { path: '/tmp/notes.md' }], - [mode, 'ReadMediaFile', { path: '/tmp/image.png' }], [mode, 'Grep', { pattern: 'TODO', path: '/tmp' }], ] as const, ), @@ -494,7 +493,6 @@ describe('Permission auto mode', () => { it.each([ ['Read', { path: '/tmp/notes.md' }], - ['ReadMediaFile', { path: '/tmp/image.png' }], ['Write', { path: '/tmp/notes.md', content: 'x' }], ['Edit', { path: '/tmp/notes.md', old_string: 'a', new_string: 'b' }], ] as const)('approves %s outside the cwd in auto mode', async (toolName, args) => { @@ -521,7 +519,6 @@ describe('Permission auto mode', () => { it.each([ ['Read', { path: '/workspace/notes.md' }], - ['ReadMediaFile', { path: '/workspace/image.png' }], ['Write', { path: '/workspace/notes.md', content: 'x' }], ['Edit', { path: '/workspace/notes.md', old_string: 'a', new_string: 'b' }], ['Grep', { pattern: 'TODO', path: '/workspace' }], @@ -915,7 +912,6 @@ describe('Default tool approve policy', () => { ['Read', { path: '/workspace/notes.md' }], ['Grep', { pattern: 'TODO', path: '/workspace' }], ['Glob', { pattern: '**/*.ts', path: '/workspace' }], - ['ReadMediaFile', { path: '/workspace/image.png' }], ['SetTodoList', { items: [] }], ['TodoList', {}], ['TaskList', {}], @@ -3742,7 +3738,6 @@ function testRuleSubject(toolName: string, args: Record): strin case 'Bash': return stringArg(args, 'command'); case 'Read': - case 'ReadMediaFile': case 'Write': case 'Edit': return canonicalTestPath(stringArg(args, 'path', '/workspace/file.txt')); @@ -3761,7 +3756,6 @@ function testMatchesRuleSubject( ): boolean { switch (toolName) { case 'Read': - case 'ReadMediaFile': case 'Write': case 'Edit': return matchesPathRuleSubject(ruleArgs, ruleSubject); @@ -3778,8 +3772,6 @@ function testDescription(toolName: string, args: Record): strin return 'review plan'; case 'Read': return 'read file'; - case 'ReadMediaFile': - return 'read media file'; case 'Write': return 'write file'; case 'Edit': @@ -3807,7 +3799,6 @@ function testDisplay(toolName: string, args: Record): ToolInput command: typeof args['command'] === 'string' ? args['command'] : '', }; case 'Read': - case 'ReadMediaFile': return { kind: 'file_io', operation: 'read', path }; case 'Write': return { kind: 'file_io', operation: 'write', path }; @@ -3825,7 +3816,6 @@ function testDisplay(toolName: string, args: Record): ToolInput function testAccesses(toolName: string, args: Record) { const path = typeof args['path'] === 'string' ? canonicalTestPath(args['path']) : undefined; if (toolName === 'Read' && path !== undefined) return ToolAccesses.readFile(path); - if (toolName === 'ReadMediaFile' && path !== undefined) return ToolAccesses.readFile(path); if (toolName === 'Write' && path !== undefined) return ToolAccesses.writeFile(path); if (toolName === 'Edit' && path !== undefined) return ToolAccesses.readWriteFile(path); if ((toolName === 'Grep' || toolName === 'Glob') && path !== undefined) { diff --git a/packages/agent-core/test/agent/turn.test.ts b/packages/agent-core/test/agent/turn.test.ts index 8afa05ecc..a368df9d5 100644 --- a/packages/agent-core/test/agent/turn.test.ts +++ b/packages/agent-core/test/agent/turn.test.ts @@ -1376,10 +1376,10 @@ describe('Agent turn flow', () => { get: () => provider, }); ctx.agent.tools.initializeBuiltinTools(); - ctx.agent.tools.setActiveTools(['ReadMediaFile']); + ctx.agent.tools.setActiveTools(['Read']); - const tool = ctx.agent.tools.loopTools.find((candidate) => candidate.name === 'ReadMediaFile'); - if (tool === undefined) throw new Error('ReadMediaFile tool was not initialized'); + const tool = ctx.agent.tools.loopTools.find((candidate) => candidate.name === 'Read'); + if (tool === undefined) throw new Error('Read tool was not initialized'); const result = await executeTool(tool, { turnId: 't1', toolCallId: 'call_media', diff --git a/packages/agent-core/test/prompt-placeholders.test.ts b/packages/agent-core/test/prompt-placeholders.test.ts index 4068979b5..9557bfda6 100644 --- a/packages/agent-core/test/prompt-placeholders.test.ts +++ b/packages/agent-core/test/prompt-placeholders.test.ts @@ -25,7 +25,6 @@ const TEMPLATED = new Set([ 'profile/default/system.md', 'agent/compaction/compaction-instruction.md', 'tools/builtin/file/read.md', - 'tools/builtin/file/read-media.md', 'tools/builtin/shell/bash.md', 'tools/builtin/collaboration/skill-tool.md', ]); diff --git a/packages/agent-core/test/tools/builtin-current.test.ts b/packages/agent-core/test/tools/builtin-current.test.ts index 4619221da..9dfaa084c 100644 --- a/packages/agent-core/test/tools/builtin-current.test.ts +++ b/packages/agent-core/test/tools/builtin-current.test.ts @@ -36,7 +36,7 @@ import { ReadInputSchema, ReadTool } from '../../src/tools/builtin/file/read'; import { WriteInputSchema, WriteTool } from '../../src/tools/builtin/file/write'; import { BashInputSchema, BashTool } from '../../src/tools/builtin/shell/bash'; import type { WorkspaceConfig } from '../../src/tools/support/workspace'; -import { createFakeKaos } from './fixtures/fake-kaos'; +import { createFakeKaos, FULL_MEDIA_CAPABILITIES } from './fixtures/fake-kaos'; import { executeTool } from './fixtures/execute-tool'; import { createBackgroundManager } from '../agent/background/helpers'; import { @@ -111,6 +111,7 @@ describe('current builtin file and shell tools', () => { }), }), workspace, + FULL_MEDIA_CAPABILITIES, ); expect(ReadInputSchema.safeParse({ path: '/workspace/a.txt' }).success).toBe(true); diff --git a/packages/agent-core/test/tools/fixtures/fake-kaos.ts b/packages/agent-core/test/tools/fixtures/fake-kaos.ts index 5a22938b8..d49703aa9 100644 --- a/packages/agent-core/test/tools/fixtures/fake-kaos.ts +++ b/packages/agent-core/test/tools/fixtures/fake-kaos.ts @@ -12,6 +12,7 @@ */ import type { Environment, Kaos } from '@moonshot-ai/kaos'; +import type { ModelCapability } from '@moonshot-ai/kosong'; import type { ExecutableToolResult } from '#/loop'; import type { WorkspaceConfig } from '../../../src/tools/support/workspace'; @@ -64,6 +65,20 @@ export const PERMISSIVE_WORKSPACE: WorkspaceConfig = { additionalDirs: [], }; +/** + * Full-modality capability set for ReadTool construction — most Read + * tests exercise text or media behaviour without caring about the + * capability gate; tests that do care build their own. + */ +export const FULL_MEDIA_CAPABILITIES: ModelCapability = { + image_in: true, + video_in: true, + audio_in: false, + thinking: false, + tool_use: true, + max_context_tokens: 0, +}; + /** * Assert that a `ToolResult`'s `content` is a string and return it. * Keeps the lint rule `typescript-eslint(no-base-to-string)` happy by diff --git a/packages/agent-core/test/tools/read-file.test.ts b/packages/agent-core/test/tools/read-file.test.ts index 35337bb70..19a53a915 100644 --- a/packages/agent-core/test/tools/read-file.test.ts +++ b/packages/agent-core/test/tools/read-file.test.ts @@ -2,7 +2,7 @@ import type { Kaos } from '@moonshot-ai/kaos'; import { describe, expect, it, vi } from 'vitest'; import { ReadTool } from '../../src/tools/builtin/file/read'; -import { createFakeKaos, PERMISSIVE_WORKSPACE } from './fixtures/fake-kaos'; +import { createFakeKaos, FULL_MEDIA_CAPABILITIES, PERMISSIVE_WORKSPACE } from './fixtures/fake-kaos'; import { executeTool } from './fixtures/execute-tool'; const signal = new AbortController().signal; @@ -47,6 +47,7 @@ function toolWithContent(content: string): ReadTool { readLines: vi.fn().mockImplementation(readLinesFromContent(content)), }), PERMISSIVE_WORKSPACE, + FULL_MEDIA_CAPABILITIES, ); } diff --git a/packages/agent-core/test/tools/read-media-desc.test.ts b/packages/agent-core/test/tools/read-media-desc.test.ts deleted file mode 100644 index 84cc17f2a..000000000 --- a/packages/agent-core/test/tools/read-media-desc.test.ts +++ /dev/null @@ -1,46 +0,0 @@ -import type { ModelCapability } from '@moonshot-ai/kosong'; -import { describe, expect, it } from 'vitest'; - -import { ReadMediaFileTool } from '../../src/tools/builtin/file/read-media'; -import { createFakeKaos, PERMISSIVE_WORKSPACE } from './fixtures/fake-kaos'; - -function capability(input: Partial): ModelCapability { - return input as ModelCapability; -} - -function makeTool(capabilities: Partial): ReadMediaFileTool { - return new ReadMediaFileTool(createFakeKaos(), PERMISSIVE_WORKSPACE, capability(capabilities)); -} - -describe('ReadMediaFileTool description by capabilities', () => { - it('mentions image and video when both capabilities are present', () => { - const tool = makeTool({ image_in: true, video_in: true }); - expect(tool.description).toContain('supports image and video'); - }); - - it('mentions image but flags video unsupported when only image_in is present', () => { - const tool = makeTool({ image_in: true, video_in: false }); - expect(tool.description).toContain('supports image files for the current model'); - expect(tool.description).toContain('Video files are not supported'); - }); - - it('mentions video but flags image unsupported when only video_in is present', () => { - const tool = makeTool({ image_in: false, video_in: true }); - expect(tool.description).toContain('supports video files for the current model'); - expect(tool.description).toContain('Image files are not supported'); - }); - - it('throws when no image/video capability is present', () => { - expect(() => makeTool({ image_in: false, video_in: false })).toThrow(/image_in or video_in/); - }); - - it('description pins the stable contract phrases: image+video, 100MB, parallel reads, Read pointer', () => { - const tool = makeTool({ image_in: true, video_in: true }); - expect(tool.description).toContain('image and video'); - expect(tool.description).toContain('100MB'); - expect(tool.description).toContain('parallel'); - // TS renamed the sibling tool to `Read` (py was `ReadFile`); the - // description must still point readers at the text-file tool. - expect(tool.description).toContain('Read tool'); - }); -}); diff --git a/packages/agent-core/test/tools/read-media.test.ts b/packages/agent-core/test/tools/read-media.test.ts index 59913a1e4..364467fa1 100644 --- a/packages/agent-core/test/tools/read-media.test.ts +++ b/packages/agent-core/test/tools/read-media.test.ts @@ -1,5 +1,7 @@ /** - * ReadMediaFileTool tests for the current output/capability contract. + * ReadTool media-path tests — image/video output envelope, capability + * gating, size limits, the video upload hook, and the capability-driven + * tool description. */ import type { Kaos } from '@moonshot-ai/kaos'; @@ -8,10 +10,7 @@ import { describe, expect, it, vi } from 'vitest'; import { ToolAccesses } from '../../src/loop'; import type { ExecutableToolResult } from '../../src/loop'; -import { - ReadMediaFileInputSchema, - ReadMediaFileTool, -} from '../../src/tools/builtin/file/read-media'; +import { ReadInputSchema, ReadTool } from '../../src/tools/builtin/file/read'; import { MEDIA_SNIFF_BYTES } from '../../src/tools/support/file-type'; import { createFakeKaos, PERMISSIVE_WORKSPACE } from './fixtures/fake-kaos'; import { executeTool } from './fixtures/execute-tool'; @@ -56,18 +55,16 @@ function makeReadMediaTool( input: { readonly stat?: Kaos['stat'] | undefined; readonly readBytes?: Kaos['readBytes'] | undefined; + readonly readLines?: Kaos['readLines'] | undefined; readonly modelCapabilities?: ModelCapability | undefined; } = {}, -): ReadMediaFileTool { +): ReadTool { const kaos = createFakeKaos({ stat: input.stat ?? vi.fn().mockResolvedValue(DEFAULT_STAT), readBytes: input.readBytes ?? vi.fn().mockResolvedValue(PNG_HEADER), + readLines: input.readLines, }); - return new ReadMediaFileTool( - kaos, - PERMISSIVE_WORKSPACE, - input.modelCapabilities ?? capabilities(), - ); + return new ReadTool(kaos, PERMISSIVE_WORKSPACE, input.modelCapabilities ?? capabilities()); } function outputParts(result: ExecutableToolResult): ContentPart[] { @@ -76,14 +73,12 @@ function outputParts(result: ExecutableToolResult): ContentPart[] { return result.output as ContentPart[]; } -describe('ReadMediaFileTool', () => { +describe('ReadTool media path', () => { it('has name, parameters, and path-scoped resource accesses', () => { const tool = makeReadMediaTool(); - expect(tool.name).toBe('ReadMediaFile'); - expect(ReadMediaFileInputSchema.safeParse({ path: '/workspace/sample.png' }).success).toBe( - true, - ); + expect(tool.name).toBe('Read'); + expect(ReadInputSchema.safeParse({ path: '/workspace/sample.png' }).success).toBe(true); expect(tool.parameters).toMatchObject({ type: 'object', properties: { @@ -107,19 +102,8 @@ describe('ReadMediaFileTool', () => { // working directory — not the misleading "Absolute path" wording. expect(description).toMatch(/working directory/i); expect(description).not.toMatch(/^Absolute path/); - // The useful "directories and text files are not supported" note stays. - expect(description).toMatch(/text file/i); - }); - - it('throws when constructed without image or video capability', () => { - expect( - () => - new ReadMediaFileTool( - createFakeKaos(), - PERMISSIVE_WORKSPACE, - capabilities({ image_in: false, video_in: false }), - ), - ).toThrow(/image_in or video_in/); + // The useful "directories are not supported" note stays. + expect(description).toMatch(/directories are not supported/i); }); it('returns a system/text/image/text wrap for PNG files', async () => { @@ -148,6 +132,52 @@ describe('ReadMediaFileTool', () => { expect(parts[3]).toEqual({ type: 'text', text: '' }); }); + it('sniffs media from raw bytes before ACP text preview', async () => { + const data = Buffer.concat([PNG_HEADER, Buffer.from('pngdata')]); + const readTextPreview = vi.fn(async () => { + throw new Error('ACP text preview must not be used for media sniffing'); + }); + const tool = new ReadTool( + createFakeKaos({ + stat: vi.fn().mockResolvedValue({ ...DEFAULT_STAT, stSize: data.length }), + readBytes: vi.fn().mockResolvedValue(data), + readTextPreview, + } as unknown as Partial), + PERMISSIVE_WORKSPACE, + capabilities(), + ); + + const result = await executeTool(tool, { + turnId: 't1', + toolCallId: 'c_acp_media', + args: { path: '/workspace/sample.png' }, + signal, + }); + + const parts = outputParts(result); + expect(parts[2]).toMatchObject({ type: 'image_url' }); + expect(readTextPreview).not.toHaveBeenCalled(); + }); + + it('ignores pagination parameters for media files', async () => { + const data = Buffer.concat([PNG_HEADER, Buffer.from('pngdata')]); + const tool = makeReadMediaTool({ + stat: vi.fn().mockResolvedValue({ ...DEFAULT_STAT, stSize: data.length }), + readBytes: vi.fn().mockResolvedValue(data), + }); + + const result = await executeTool(tool, { + turnId: 't1', + toolCallId: 'c_page', + args: { path: '/workspace/sample.png', line_offset: 10, n_lines: 5 }, + signal, + }); + + const parts = outputParts(result); + expect(parts).toHaveLength(4); + expect(parts[2]).toMatchObject({ type: 'image_url' }); + }); + it('emits a summary with mime type and byte size for images', async () => { const data = Buffer.concat([PNG_HEADER, Buffer.from('pngdata')]); const tool = makeReadMediaTool({ @@ -323,7 +353,7 @@ describe('ReadMediaFileTool', () => { type: 'video_url', videoUrl: { url: 'ms://file-123', id: 'file-123' }, }); - const tool = new ReadMediaFileTool( + const tool = new ReadTool( createFakeKaos({ stat: vi.fn().mockResolvedValue({ ...DEFAULT_STAT, @@ -355,11 +385,44 @@ describe('ReadMediaFileTool', () => { }); }); - it('rejects text files with a Read hint', async () => { - const text = Buffer.from('hello'); + it('reports the file path when the video upload fails', async () => { + // A bare provider error ("404 route not found") tells the model and + // the user nothing about which file or which step failed; the media + // path must wrap it with read context. + const videoUploader = vi.fn().mockRejectedValue(new Error('404 route not found')); + const tool = new ReadTool( + createFakeKaos({ + stat: vi.fn().mockResolvedValue({ + ...DEFAULT_STAT, + stSize: MP4_HEADER.length, + }), + readBytes: vi.fn().mockResolvedValue(MP4_HEADER), + }), + PERMISSIVE_WORKSPACE, + capabilities(), + videoUploader, + ); + + const result = await executeTool(tool, { + turnId: 't1', + toolCallId: 'c_upload_fail', + args: { path: '/workspace/clip.mov' }, + signal, + }); + + expect(result.isError).toBe(true); + expect(result.output).toContain('Failed to read /workspace/clip.mov'); + expect(result.output).toContain('404 route not found'); + }); + + it('reads text files as numbered lines through the same tool', async () => { + const text = 'hello\n'; const tool = makeReadMediaTool({ stat: vi.fn().mockResolvedValue({ ...DEFAULT_STAT, stSize: text.length }), - readBytes: vi.fn().mockResolvedValue(text), + readBytes: vi.fn().mockResolvedValue(Buffer.from(text)), + readLines: vi.fn().mockImplementation(async function* readLines() { + yield 'hello\n'; + }), }); const result = await executeTool(tool, { @@ -369,11 +432,8 @@ describe('ReadMediaFileTool', () => { signal, }); - expect(result.isError).toBe(true); - expect(result.output).toBe( - '"/workspace/sample.txt" is a text file. Use Read to read text files.', - ); - expect(result.output).not.toContain('ReadFile'); + expect(result.isError).toBeFalsy(); + expect(result.output).toContain('1\thello'); }); it('rejects unknown binary files without legacy Python-tool wording', async () => { @@ -392,11 +452,28 @@ describe('ReadMediaFileTool', () => { expect(result.isError).toBe(true); expect(result.output).toBe( - '"/workspace/blob.bin" is not a supported image or video file. Use Read for text files, or Bash or an MCP tool for other binary formats.', + '"/workspace/blob.bin" is not readable as UTF-8 text. ' + + 'For binary formats, use Bash or an MCP tool if available.', ); expect(result.output).not.toContain('Python tools'); }); + it('errors when the current model lacks image input capability', async () => { + const tool = makeReadMediaTool({ + modelCapabilities: capabilities({ image_in: false, video_in: true }), + }); + + const result = await executeTool(tool, { + turnId: 't1', + toolCallId: 'c_noimg', + args: { path: '/workspace/sample.png' }, + signal, + }); + + expect(result.isError).toBe(true); + expect(result.output).toMatch(/image input/i); + }); + it('errors when the current model lacks video input capability', async () => { const tool = makeReadMediaTool({ stat: vi.fn().mockResolvedValue({ @@ -419,28 +496,34 @@ describe('ReadMediaFileTool', () => { }); it('rejects empty files and files exceeding the media size limit', async () => { - const empty = await executeTool(makeReadMediaTool({ - stat: vi.fn().mockResolvedValue({ ...DEFAULT_STAT, stSize: 0 }), - }), { - turnId: 't1', - toolCallId: 'c_empty', - args: { path: '/workspace/empty.png' }, - signal, - }); + const empty = await executeTool( + makeReadMediaTool({ + stat: vi.fn().mockResolvedValue({ ...DEFAULT_STAT, stSize: 0 }), + }), + { + turnId: 't1', + toolCallId: 'c_empty', + args: { path: '/workspace/empty.png' }, + signal, + }, + ); expect(empty).toMatchObject({ isError: true }); expect(empty.output).toMatch(/empty/i); - const huge = await executeTool(makeReadMediaTool({ - stat: vi.fn().mockResolvedValue({ - ...DEFAULT_STAT, - stSize: 200 * 1024 * 1024, + const huge = await executeTool( + makeReadMediaTool({ + stat: vi.fn().mockResolvedValue({ + ...DEFAULT_STAT, + stSize: 200 * 1024 * 1024, + }), }), - }), { - turnId: 't1', - toolCallId: 'c_huge', - args: { path: '/workspace/huge.png' }, - signal, - }); + { + turnId: 't1', + toolCallId: 'c_huge', + args: { path: '/workspace/huge.png' }, + signal, + }, + ); expect(huge).toMatchObject({ isError: true }); expect(huge.output).toMatch(/exceeds|100/i); }); @@ -464,7 +547,7 @@ describe('ReadMediaFileTool', () => { readBytes: vi.fn().mockResolvedValue(png), }); - const result = await executeTool(tool,{ + const result = await executeTool(tool, { turnId: 't1', toolCallId: 'c_size', args: { path: '/workspace/valid.png' }, @@ -488,7 +571,7 @@ describe('ReadMediaFileTool', () => { readBytes: vi.fn().mockResolvedValue(data), }); - const result = await executeTool(tool,{ + const result = await executeTool(tool, { turnId: 't1', toolCallId: 'c_extless_msg', args: { path: '/workspace/sample' }, @@ -502,39 +585,9 @@ describe('ReadMediaFileTool', () => { expect(systemText).toContain(`${String(data.length)} bytes`); }); - it('description by capabilities lockdown — image + video points at Read for text fallback', () => { - const tool = new ReadMediaFileTool(createFakeKaos(), PERMISSIVE_WORKSPACE, capabilities()); - // Long-form description contract from sibling docs: 100MB ceiling and - // pointer to the text-file tool for non-media content. TS renames the - // sibling tool to `Read` (py was `ReadFile`). - expect(tool.description).toContain('100MB'); - expect(tool.description).toContain('Read tool'); - expect(tool.description).toContain('supports image and video files for the current model'); - }); - - it('omits the tool from the toolset when the model has neither image_in nor video_in', () => { - // Strict skip semantics: construction returns a sentinel the loader can - // use to drop the tool entirely, instead of registering a tool that - // always errors. Currently TS throws a regular Error — fail-unimplemented - // surfaces the gap. - let caught: unknown = null; - const construct = (): ReadMediaFileTool => - new ReadMediaFileTool( - createFakeKaos(), - PERMISSIVE_WORKSPACE, - capabilities({ image_in: false, video_in: false }), - ); - try { - construct(); - } catch (error) { - caught = error; - } - expect((caught as { name?: string } | null)?.name).toBe('SkipThisTool'); - }); - it('allows absolute media paths outside workspace but rejects relative escapes', async () => { const readBytes = vi.fn().mockResolvedValue(PNG_HEADER); - const tool = new ReadMediaFileTool( + const tool = new ReadTool( createFakeKaos({ stat: vi.fn().mockResolvedValue(DEFAULT_STAT), readBytes, @@ -562,3 +615,38 @@ describe('ReadMediaFileTool', () => { expect(readBytes).not.toHaveBeenCalled(); }); }); + +describe('ReadTool description by capabilities', () => { + function makeTool(caps: Partial): ReadTool { + return new ReadTool(createFakeKaos(), PERMISSIVE_WORKSPACE, capabilities(caps)); + } + + it('mentions image and video when both capabilities are present', () => { + const tool = makeTool({ image_in: true, video_in: true }); + expect(tool.description).toContain('supports image and video'); + }); + + it('mentions image but flags video unsupported when only image_in is present', () => { + const tool = makeTool({ image_in: true, video_in: false }); + expect(tool.description).toContain('supports image files for the current model'); + expect(tool.description).toContain('Video files are not supported'); + }); + + it('mentions video but flags image unsupported when only video_in is present', () => { + const tool = makeTool({ image_in: false, video_in: true }); + expect(tool.description).toContain('supports video files for the current model'); + expect(tool.description).toContain('Image files are not supported'); + }); + + it('declares media unsupported when neither capability is present', () => { + const tool = makeTool({ image_in: false, video_in: false }); + expect(tool.description).toContain('does not support image or video input'); + }); + + it('description pins the stable contract phrases: image+video, 100MB, parallel reads', () => { + const tool = makeTool({ image_in: true, video_in: true }); + expect(tool.description).toContain('image and video'); + expect(tool.description).toContain('100MB'); + expect(tool.description).toContain('parallel'); + }); +}); diff --git a/packages/agent-core/test/tools/read.test.ts b/packages/agent-core/test/tools/read.test.ts index 4ab97c1bb..c316a80af 100644 --- a/packages/agent-core/test/tools/read.test.ts +++ b/packages/agent-core/test/tools/read.test.ts @@ -11,7 +11,7 @@ import { } from '../../src/tools/builtin/file/read'; import { MEDIA_SNIFF_BYTES } from '../../src/tools/support/file-type'; import type { WorkspaceConfig } from '../../src/tools/support/workspace'; -import { createFakeKaos, PERMISSIVE_WORKSPACE, toolContentString } from './fixtures/fake-kaos'; +import { createFakeKaos, FULL_MEDIA_CAPABILITIES, PERMISSIVE_WORKSPACE, toolContentString } from './fixtures/fake-kaos'; import { executeTool } from './fixtures/execute-tool'; const signal = new AbortController().signal; @@ -76,6 +76,7 @@ function toolWithContent(content: string, workspace: WorkspaceConfig = PERMISSIV readText: vi.fn().mockResolvedValue(content), }), workspace, + FULL_MEDIA_CAPABILITIES, ); } @@ -220,7 +221,7 @@ describe('ReadTool', () => { const tool = new ReadTool(createFakeKaos({ readText }), { workspaceDir: '/workspace/project', additionalDirs: [], - }); + }, FULL_MEDIA_CAPABILITIES); const result = await executeTool(tool, context({ path: '../../outside.txt' })); @@ -246,6 +247,7 @@ describe('ReadTool', () => { workspaceDir: '/workspace', additionalDirs: [], }, + FULL_MEDIA_CAPABILITIES, ); const result = await executeTool(tool, context({ path: '/tmp/external.txt' })); @@ -276,6 +278,7 @@ describe('ReadTool', () => { workspaceDir: '/workspace', additionalDirs: [], }, + FULL_MEDIA_CAPABILITIES, ); const result = await executeTool(tool, context({ path: '/workspace/missing.txt' })); @@ -303,6 +306,7 @@ describe('ReadTool', () => { workspaceDir: '/workspace', additionalDirs: [], }, + FULL_MEDIA_CAPABILITIES, ); const result = await executeTool(tool, context({ path: '/workspace/src' })); @@ -333,6 +337,7 @@ describe('ReadTool', () => { workspaceDir: '/workspace', additionalDirs: [], }, + FULL_MEDIA_CAPABILITIES, ); const result = await executeTool(tool, context({ path: '~/notes/today.txt' })); @@ -352,7 +357,7 @@ describe('ReadTool', () => { const tool = new ReadTool(createFakeKaos({ readText }), { workspaceDir: '/workspace', additionalDirs: [], - }); + }, FULL_MEDIA_CAPABILITIES); const result = await executeTool(tool, context({ path: '/workspace/.env' })); @@ -361,52 +366,59 @@ describe('ReadTool', () => { expect(readText).not.toHaveBeenCalled(); }); - it('rejects image files before text decoding and points to ReadMediaFile', async () => { + it('reads image files as media content without text decoding', async () => { const pngHeader = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]); const readText = vi .fn() .mockRejectedValue(new Error('readText should not be called for images')); const tool = new ReadTool( createFakeKaos({ - stat: vi.fn().mockResolvedValue(REGULAR_FILE_STAT), + stat: vi + .fn() + .mockResolvedValue({ ...REGULAR_FILE_STAT, stSize: pngHeader.length }), readBytes: vi.fn().mockResolvedValue(pngHeader), readText, }), PERMISSIVE_WORKSPACE, + FULL_MEDIA_CAPABILITIES, ); const result = await executeTool(tool, context({ path: '/tmp/sample.png' })); - const output = toolContentString(result); - expect(result.isError).toBe(true); - expect(output).toMatch(/image file/i); - expect(output).toMatch(/ReadMediaFile|media/i); + expect(result.isError).toBeFalsy(); + expect(Array.isArray(result.output)).toBe(true); + const parts = result.output as { type: string }[]; + expect(parts.some((part) => part.type === 'image_url')).toBe(true); expect(readText).not.toHaveBeenCalled(); }); - it('rejects extensionless image files using magic-byte sniffing', async () => { + it('reads extensionless image files using magic-byte sniffing', async () => { const pngHeader = Buffer.from([0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a]); const readText = vi .fn() .mockRejectedValue(new Error('readText should not be called for extensionless images')); const tool = new ReadTool( createFakeKaos({ - stat: vi.fn().mockResolvedValue(REGULAR_FILE_STAT), + stat: vi + .fn() + .mockResolvedValue({ ...REGULAR_FILE_STAT, stSize: pngHeader.length }), readBytes: vi.fn().mockResolvedValue(pngHeader), readText, }), PERMISSIVE_WORKSPACE, + FULL_MEDIA_CAPABILITIES, ); const result = await executeTool(tool, context({ path: '/tmp/sample' })); - const output = toolContentString(result); - expect(result.isError).toBe(true); - expect(output).toMatch(/image file/i); + expect(result.isError).toBeFalsy(); + expect(Array.isArray(result.output)).toBe(true); + const parts = result.output as { type: string }[]; + expect(parts.some((part) => part.type === 'image_url')).toBe(true); expect(readText).not.toHaveBeenCalled(); }); - it('rejects video files before text decoding', async () => { + it('reads video files as media content without text decoding', async () => { const mp4Header = Buffer.concat([ Buffer.from([0x00, 0x00, 0x00, 0x18]), Buffer.from('ftyp'), @@ -419,19 +431,22 @@ describe('ReadTool', () => { .mockRejectedValue(new Error('readText should not be called for videos')); const tool = new ReadTool( createFakeKaos({ - stat: vi.fn().mockResolvedValue(REGULAR_FILE_STAT), + stat: vi + .fn() + .mockResolvedValue({ ...REGULAR_FILE_STAT, stSize: mp4Header.length }), readBytes: vi.fn().mockResolvedValue(mp4Header), readText, }), PERMISSIVE_WORKSPACE, + FULL_MEDIA_CAPABILITIES, ); const result = await executeTool(tool, context({ path: '/tmp/sample.mp4' })); - const output = toolContentString(result); - expect(result.isError).toBe(true); - expect(output).toMatch(/video file/i); - expect(output).toMatch(/ReadMediaFile|media/i); + expect(result.isError).toBeFalsy(); + expect(Array.isArray(result.output)).toBe(true); + const parts = result.output as { type: string }[]; + expect(parts.some((part) => part.type === 'video_url')).toBe(true); expect(readText).not.toHaveBeenCalled(); }); @@ -447,6 +462,7 @@ describe('ReadTool', () => { readText, }), PERMISSIVE_WORKSPACE, + FULL_MEDIA_CAPABILITIES, ); const result = await executeTool(tool, context({ path: '/tmp/blob.bin' })); @@ -454,7 +470,7 @@ describe('ReadTool', () => { expect(result.isError).toBe(true); expect(output).toBe( - '"/tmp/blob.bin" is not readable as UTF-8 text. If it is an image or video, use ReadMediaFile. For other binary formats, use Bash or an MCP tool if available.', + '"/tmp/blob.bin" is not readable as UTF-8 text. For binary formats, use Bash or an MCP tool if available.', ); expect(output).not.toContain('Python tools'); expect(readText).not.toHaveBeenCalled(); @@ -473,6 +489,7 @@ describe('ReadTool', () => { readLines, }), PERMISSIVE_WORKSPACE, + FULL_MEDIA_CAPABILITIES, ); const result = await executeTool(tool, context({ path: '/tmp/blob-with-late-nul' })); @@ -480,7 +497,7 @@ describe('ReadTool', () => { expect(result.isError).toBe(true); expect(output).toBe( - '"/tmp/blob-with-late-nul" is not readable as UTF-8 text. If it is an image or video, use ReadMediaFile. For other binary formats, use Bash or an MCP tool if available.', + '"/tmp/blob-with-late-nul" is not readable as UTF-8 text. For binary formats, use Bash or an MCP tool if available.', ); expect(output).not.toContain('Python tools'); }); @@ -503,6 +520,7 @@ describe('ReadTool', () => { readLines, }), PERMISSIVE_WORKSPACE, + FULL_MEDIA_CAPABILITIES, ); const result = await executeTool(tool, context({ path: '/tmp/not-utf8.txt' })); @@ -510,7 +528,7 @@ describe('ReadTool', () => { expect(result.isError).toBe(true); expect(output).toBe( - '"/tmp/not-utf8.txt" is not readable as UTF-8 text. If it is an image or video, use ReadMediaFile. For other binary formats, use Bash or an MCP tool if available.', + '"/tmp/not-utf8.txt" is not readable as UTF-8 text. For binary formats, use Bash or an MCP tool if available.', ); expect(output).not.toContain('Python tools'); expect(output).not.toContain(replacement); @@ -556,6 +574,7 @@ describe('ReadTool', () => { readText: vi.fn().mockRejectedValue(new Error('full readText should not be called')), } as unknown as Partial), PERMISSIVE_WORKSPACE, + FULL_MEDIA_CAPABILITIES, ); const result = await executeTool(tool, context({ path: '/tmp/acp.txt' })); @@ -594,6 +613,7 @@ describe('ReadTool', () => { readText, }), PERMISSIVE_WORKSPACE, + FULL_MEDIA_CAPABILITIES, ); const result = await executeTool(tool, context({ path: '/tmp/large.txt' })); @@ -679,6 +699,7 @@ describe('ReadTool', () => { readLines: vi.fn().mockImplementation(readLinesFromContent(content)), }), { workspaceDir: '/workspace', additionalDirs: ['/extra'] }, + FULL_MEDIA_CAPABILITIES, ); const result = await executeTool(tool,context({ path: '/extra/notes.txt' })); @@ -694,6 +715,7 @@ describe('ReadTool', () => { stat: vi.fn().mockRejectedValue(statError), }), { workspaceDir: '/workspace', additionalDirs: [] }, + FULL_MEDIA_CAPABILITIES, ); const result = await executeTool(tool,context({ path: '/workspace/ghost.txt' }));