Skip to content

Commit 709a5e1

Browse files
committed
fix: address PR #1889 review feedback for vision support
- Guard against decompression bombs in ResizeImage by rejecting decoded images exceeding 20000x20000 pixels before processing - Fix image stripping for models with unknown capabilities: only strip images when modalities are explicitly known and exclude image input - Check file existence before calling IsImageFile in read_file handler to provide clearer errors and avoid type detection on missing files Assisted-By: cagent
1 parent 186fa91 commit 709a5e1

5 files changed

Lines changed: 101 additions & 29 deletions

File tree

pkg/app/app.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -408,8 +408,10 @@ func (a *App) processFileAttachment(ctx context.Context, att messages.Attachment
408408
}
409409
resized, resizeErr := chat.ResizeImage(imgData, mimeType)
410410
if resizeErr != nil {
411-
slog.Warn("image resize failed, sending original", "path", absPath, "error", resizeErr)
412-
resized = &chat.ImageResizeResult{Data: imgData, MimeType: mimeType}
411+
// Don't bypass security checks - reject the file if resize failed
412+
slog.Warn("skipping attachment: image resize failed", "path", absPath, "error", resizeErr)
413+
a.sendEvent(ctx, runtime.Warning(fmt.Sprintf("Skipped attachment %s: %s", att.Name, resizeErr), ""))
414+
return
413415
}
414416
dataURL := fmt.Sprintf("data:%s;base64,%s", resized.MimeType, base64.StdEncoding.EncodeToString(resized.Data))
415417
*binaryParts = append(*binaryParts, chat.MessagePart{

pkg/chat/image.go

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ const (
2222
// MaxImageBytes is the maximum file size for images sent to LLM providers (4.5MB,
2323
// below Anthropic's 5MB limit).
2424
MaxImageBytes = 4_500_000
25+
// maxDecodedDimension is the absolute upper bound on decoded image dimensions.
26+
// Images exceeding this are rejected before processing to guard against
27+
// decompression bombs (small files that expand to huge pixel buffers).
28+
maxDecodedDimension = 20_000
2529
// jpegQuality is the default JPEG encoding quality.
2630
jpegQuality = 80
2731
)
@@ -75,6 +79,13 @@ func ResizeImage(data []byte, mimeType string) (*ImageResizeResult, error) {
7579
bounds := img.Bounds()
7680
origW, origH := bounds.Dx(), bounds.Dy()
7781

82+
// Guard against decompression bombs: reject images whose decoded
83+
// dimensions are absurdly large. A small compressed file can expand
84+
// to hundreds of megabytes in memory (e.g. 20000×20000×4 ≈ 1.6 GB).
85+
if origW > maxDecodedDimension || origH > maxDecodedDimension {
86+
return nil, fmt.Errorf("image dimensions too large: %dx%d (max %d)", origW, origH, maxDecodedDimension)
87+
}
88+
7889
// If the image already fits within all limits, return unchanged.
7990
if origW <= MaxImageDimension && origH <= MaxImageDimension && len(data) <= MaxImageBytes {
8091
return &ImageResizeResult{
@@ -103,6 +114,7 @@ func ResizeImage(data []byte, mimeType string) (*ImageResizeResult, error) {
103114
for _, q := range []int{70, 55, 40} {
104115
encoded, err := encodeJPEG(resized, q)
105116
if err != nil {
117+
slog.Debug("JPEG encoding failed", "quality", q, "error", err)
106118
continue
107119
}
108120

@@ -131,6 +143,7 @@ func ResizeImage(data []byte, mimeType string) (*ImageResizeResult, error) {
131143
for _, q := range []int{80, 55, 40} {
132144
encoded, err := encodeJPEG(smaller, q)
133145
if err != nil {
146+
slog.Debug("JPEG encoding failed", "quality", q, "scale", scale, "error", err)
134147
continue
135148
}
136149

@@ -150,8 +163,7 @@ func ResizeImage(data []byte, mimeType string) (*ImageResizeResult, error) {
150163
}
151164

152165
if len(best) > MaxImageBytes {
153-
slog.Warn("Image still exceeds size limit after all resize attempts",
154-
"original_size", len(data), "final_size", len(best), "limit", MaxImageBytes)
166+
return nil, fmt.Errorf("image exceeds size limit after all resize attempts: %d bytes (limit %d)", len(best), MaxImageBytes)
155167
}
156168

157169
return &ImageResizeResult{
@@ -166,34 +178,51 @@ func ResizeImage(data []byte, mimeType string) (*ImageResizeResult, error) {
166178
}
167179

168180
// ResizeImageBase64 is a convenience wrapper around ResizeImage that accepts
169-
// and returns base64-encoded image data.
170-
func ResizeImageBase64(b64Data, mimeType string) (*ImageResizeResult, error) {
181+
// and returns base64-encoded image data. The base64-encoded result is returned
182+
// separately to avoid mutating the ImageResizeResult.Data field.
183+
func ResizeImageBase64(b64Data, mimeType string) (b64Result string, metadata *ImageResizeResult, err error) {
171184
raw, err := base64.StdEncoding.DecodeString(b64Data)
172185
if err != nil {
173-
return nil, fmt.Errorf("decode base64: %w", err)
186+
return "", nil, fmt.Errorf("decode base64: %w", err)
174187
}
175188
result, err := ResizeImage(raw, mimeType)
176189
if err != nil {
177-
return nil, err
190+
return "", nil, err
178191
}
179-
result.Data = []byte(base64.StdEncoding.EncodeToString(result.Data))
180-
return result, nil
192+
return base64.StdEncoding.EncodeToString(result.Data), result, nil
181193
}
182194

183195
// FormatDimensionNote produces a human-readable note describing the resize mapping.
184196
// This helps the model translate coordinates from the resized image back to the original.
197+
//
198+
// Because ResizeImage uses fitDimensions (which preserves aspect ratio), the X and
199+
// Y scale factors are always equal in practice. If they ever differ (e.g. because
200+
// the function is called with a manually constructed ImageResizeResult), we emit
201+
// separate per-axis factors so that coordinate mapping remains correct.
185202
func FormatDimensionNote(r *ImageResizeResult) string {
186203
if !r.Resized {
187204
return ""
188205
}
189206
scaleX := float64(r.OriginalWidth) / float64(r.Width)
190207
scaleY := float64(r.OriginalHeight) / float64(r.Height)
191-
scale := scaleX
192-
if scaleY > scaleX {
193-
scale = scaleY
208+
209+
// Uniform scaling (the normal path): a single factor suffices.
210+
const epsilon = 0.01
211+
if abs(scaleX-scaleY) < epsilon {
212+
return fmt.Sprintf("[Image: original %dx%d, displayed at %dx%d. Multiply coordinates by %.2f to map to original image.]",
213+
r.OriginalWidth, r.OriginalHeight, r.Width, r.Height, scaleX)
214+
}
215+
216+
// Non-uniform scaling: provide separate X and Y factors.
217+
return fmt.Sprintf("[Image: original %dx%d, displayed at %dx%d. Multiply X coordinates by %.2f and Y coordinates by %.2f to map to original image.]",
218+
r.OriginalWidth, r.OriginalHeight, r.Width, r.Height, scaleX, scaleY)
219+
}
220+
221+
func abs(x float64) float64 {
222+
if x < 0 {
223+
return -x
194224
}
195-
return fmt.Sprintf("[Image: original %dx%d, displayed at %dx%d. Multiply coordinates by %.2f to map to original image.]",
196-
r.OriginalWidth, r.OriginalHeight, r.Width, r.Height, scale)
225+
return x
197226
}
198227

199228
// fitDimensions returns new dimensions that fit within maxW×maxH while

pkg/chat/image_test.go

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ func TestFormatDimensionNote(t *testing.T) {
146146
assert.Empty(t, FormatDimensionNote(result))
147147
})
148148

149-
t.Run("resized", func(t *testing.T) {
149+
t.Run("uniform scaling", func(t *testing.T) {
150150
t.Parallel()
151151
result := &ImageResizeResult{
152152
Resized: true,
@@ -158,7 +158,26 @@ func TestFormatDimensionNote(t *testing.T) {
158158
note := FormatDimensionNote(result)
159159
assert.Contains(t, note, "original 4000x3000")
160160
assert.Contains(t, note, "displayed at 2000x1500")
161-
assert.Contains(t, note, "2.00")
161+
assert.Contains(t, note, "Multiply coordinates by 2.00")
162+
// Should NOT contain separate X/Y factors.
163+
assert.NotContains(t, note, "X coordinates")
164+
})
165+
166+
t.Run("non-uniform scaling", func(t *testing.T) {
167+
t.Parallel()
168+
// Manually constructed result with different X and Y scale factors.
169+
result := &ImageResizeResult{
170+
Resized: true,
171+
OriginalWidth: 4000,
172+
OriginalHeight: 2000,
173+
Width: 2000,
174+
Height: 2000,
175+
}
176+
note := FormatDimensionNote(result)
177+
assert.Contains(t, note, "original 4000x2000")
178+
assert.Contains(t, note, "displayed at 2000x2000")
179+
assert.Contains(t, note, "Multiply X coordinates by 2.00")
180+
assert.Contains(t, note, "Y coordinates by 1.00")
162181
})
163182
}
164183

@@ -168,9 +187,11 @@ func TestResizeImageBase64(t *testing.T) {
168187
data := createTestPNG(t, 100, 100)
169188
b64 := base64.StdEncoding.EncodeToString(data)
170189

171-
result, err := ResizeImageBase64(b64, "image/png")
190+
b64Result, result, err := ResizeImageBase64(b64, "image/png")
172191
require.NoError(t, err)
173192
assert.False(t, result.Resized)
174-
// Result.Data is base64-encoded
175-
assert.Equal(t, b64, string(result.Data))
193+
// Result data stays as raw bytes
194+
assert.Equal(t, data, result.Data)
195+
// Base64 result is returned separately
196+
assert.Equal(t, b64, b64Result)
176197
}

pkg/runtime/runtime.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1121,7 +1121,7 @@ func (r *LocalRuntime) RunStream(ctx context.Context, sess *session.Session) <-c
11211121
// Strip image content from messages if the model doesn't support image input.
11221122
// This prevents API errors when conversation history contains images (e.g. from
11231123
// tool results or user attachments) but the current model is text-only.
1124-
if m != nil && !slices.Contains(m.Modalities.Input, "image") {
1124+
if m != nil && len(m.Modalities.Input) > 0 && !slices.Contains(m.Modalities.Input, "image") {
11251125
messages = stripImageContent(messages)
11261126
}
11271127

pkg/tools/builtin/filesystem.go

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -520,12 +520,8 @@ func (t *FilesystemTool) handleListDirectory(_ context.Context, args ListDirecto
520520
func (t *FilesystemTool) handleReadFile(_ context.Context, args ReadFileArgs) (*tools.ToolCallResult, error) {
521521
resolvedPath := t.resolvePath(args.Path)
522522

523-
// Check if the file is an image
524-
if chat.IsImageFile(resolvedPath) {
525-
return t.readImageFile(resolvedPath, args.Path)
526-
}
527-
528-
content, err := os.ReadFile(resolvedPath)
523+
// Check if the file exists before any type detection.
524+
info, err := os.Stat(resolvedPath)
529525
if err != nil {
530526
var errMsg string
531527
if os.IsNotExist(err) {
@@ -543,6 +539,22 @@ func (t *FilesystemTool) handleReadFile(_ context.Context, args ReadFileArgs) (*
543539
}, nil
544540
}
545541

542+
// Only check for image files on regular files (not directories, etc.)
543+
if info.Mode().IsRegular() && chat.IsImageFile(resolvedPath) {
544+
return t.readImageFile(resolvedPath, args.Path)
545+
}
546+
547+
content, err := os.ReadFile(resolvedPath)
548+
if err != nil {
549+
return &tools.ToolCallResult{
550+
Output: err.Error(),
551+
IsError: true,
552+
Meta: ReadFileMeta{
553+
Error: err.Error(),
554+
},
555+
}, nil
556+
}
557+
546558
return &tools.ToolCallResult{
547559
Output: string(content),
548560
Meta: ReadFileMeta{
@@ -575,8 +587,16 @@ func (t *FilesystemTool) readImageFile(resolvedPath, originalPath string) (*tool
575587
// Resize the image if it exceeds provider limits (max 2000×2000, max 4.5MB).
576588
resized, err := chat.ResizeImage(data, mimeType)
577589
if err != nil {
578-
// If resize fails, fall back to sending the original.
579-
slog.Warn("Image resize failed, sending original", "path", originalPath, "error", err)
590+
// Check if the original exceeds limits before falling back
591+
if len(data) > chat.MaxImageBytes {
592+
return &tools.ToolCallResult{
593+
Output: fmt.Sprintf("Error: Image file too large (%d bytes, max %d bytes)", len(data), chat.MaxImageBytes),
594+
IsError: true,
595+
Meta: ReadFileMeta{Path: originalPath, Error: "image too large"},
596+
}, nil
597+
}
598+
// Original is within limits, proceed with fallback
599+
slog.Warn("Image resize failed, sending original (within limits)", "path", originalPath, "error", err)
580600
encoded := base64.StdEncoding.EncodeToString(data)
581601
return &tools.ToolCallResult{
582602
Output: fmt.Sprintf("Read image file %s [%s] (%d bytes)", originalPath, mimeType, len(data)),

0 commit comments

Comments
 (0)