docker
diff --git a/‎Dockerfile‎
Lines changed: 36 additions & 7 deletions b/‎Dockerfile‎
Lines changed: 36 additions & 7 deletions
diff --git a/‎cagent-schema.json‎
Lines changed: 5 additions & 1 deletion b/‎cagent-schema.json‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎docs/USAGE.md‎
Lines changed: 24 additions & 0 deletions b/‎docs/USAGE.md‎
Lines changed: 24 additions & 0 deletions
diff --git a/‎go.mod‎
Lines changed: 1 addition & 0 deletions b/‎go.mod‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎go.sum‎
Lines changed: 2 additions & 0 deletions b/‎go.sum‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎pkg/config/latest/types.go‎
Lines changed: 12 additions & 0 deletions b/‎pkg/config/latest/types.go‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎pkg/rag/chunk/chunk.go‎
Lines changed: 52 additions & 33 deletions b/‎pkg/rag/chunk/chunk.go‎
Lines changed: 52 additions & 33 deletions
@@ -6,31 +6,60 @@ ARG ALPINE_VERSION="3.22"
 # xx is a helper for cross-compilation
 FROM --platform=$BUILDPLATFORM tonistiigi/xx:1.7.0 AS xx
 
+# osxcross contains the MacOSX cross toolchain for xx
+FROM crazymax/osxcross:15.5-debian AS osxcross
+
 FROM --platform=$BUILDPLATFORM golang:${GO_VERSION}-alpine${ALPINE_VERSION} AS builder-base
 COPY --from=xx / /
 WORKDIR /src
 RUN --mount=type=cache,target=/go/pkg/mod \
     --mount=type=bind,source=go.mod,target=go.mod \
     --mount=type=bind,source=go.sum,target=go.sum \
     go mod download
+ENV CGO_ENABLED=1
+
 
-FROM builder-base AS builder
-COPY . ./
 ARG GIT_TAG
 ARG GIT_COMMIT
 ARG TARGETPLATFORM
 ARG TARGETOS
 ARG TARGETARCH
-RUN --mount=type=cache,target=/root/.cache,id=docker-ai-$TARGETPLATFORM \
+
+FROM builder-base AS builder-darwin
+RUN apk add clang
+COPY . ./
+RUN --mount=type=bind,from=osxcross,src=/osxsdk,target=/xx-sdk \
+    --mount=type=cache,target=/root/.cache,id=docker-ai-$TARGETPLATFORM \
     --mount=type=cache,target=/go/pkg/mod <<EOT
     set -ex
     xx-go build -trimpath -ldflags "-s -w -X 'github.com/docker/cagent/pkg/version.Version=$GIT_TAG' -X 'github.com/docker/cagent/pkg/version.Commit=$GIT_COMMIT'" -o /binaries/cagent-$TARGETOS-$TARGETARCH .
-    xx-verify --static /binaries/cagent-$TARGETOS-$TARGETARCH
-    if [ "$TARGETOS" = "windows" ]; then
-      mv /binaries/cagent-$TARGETOS-$TARGETARCH /binaries/cagent-$TARGETOS-$TARGETARCH.exe
-    fi
+    xx-verify --static /binaries/cagent-darwin-$TARGETARCH
 EOT
 
+FROM builder-base AS builder-linux
+RUN apk add clang
+RUN xx-apk add musl-dev gcc
+COPY . ./
+RUN --mount=type=cache,target=/root/.cache,id=docker-ai-$TARGETPLATFORM \
+    --mount=type=cache,target=/go/pkg/mod <<EOT
+    set -ex
+    xx-go build -trimpath -ldflags "-s -w -linkmode=external -extldflags '-static' -X 'github.com/docker/cagent/pkg/version.Version=$GIT_TAG' -X 'github.com/docker/cagent/pkg/version.Commit=$GIT_COMMIT'" -o /binaries/cagent-$TARGETOS-$TARGETARCH .
+    xx-verify --static /binaries/cagent-linux-$TARGETARCH
+EOT
+
+# builder-windows cross-compiles the cgo-enabled Windows binary, using zig as
+# the C/C++ cross compiler. The zig target is derived from TARGETARCH so that
+# the C objects always match the architecture xx-go builds the Go code for;
+# unsupported architectures fail the build instead of silently producing
+# x86_64 objects.
+FROM builder-base AS builder-windows
+RUN apk add zig build-base
+COPY . ./
+RUN --mount=type=cache,target=/root/.cache,id=docker-ai-$TARGETPLATFORM \
+    --mount=type=cache,target=/go/pkg/mod <<EOT
+    set -ex
+    case "$TARGETARCH" in
+      amd64) zig_target=x86_64-windows-gnu ;;
+      arm64) zig_target=aarch64-windows-gnu ;;
+      *) echo "unsupported windows architecture: $TARGETARCH" >&2; exit 1 ;;
+    esac
+    CC="zig cc -target $zig_target" CXX="zig c++ -target $zig_target" xx-go build -trimpath -ldflags "-s -w -X 'github.com/docker/cagent/pkg/version.Version=$GIT_TAG' -X 'github.com/docker/cagent/pkg/version.Commit=$GIT_COMMIT'" -o /binaries/cagent-$TARGETOS-$TARGETARCH .
+    mv /binaries/cagent-$TARGETOS-$TARGETARCH /binaries/cagent-$TARGETOS-$TARGETARCH.exe
+    xx-verify --static /binaries/cagent-windows-$TARGETARCH.exe
+EOT
+
+FROM builder-$TARGETOS AS builder
+
 FROM scratch AS local
 ARG TARGETOS TARGETARCH
 COPY --from=builder /binaries/cagent-$TARGETOS-$TARGETARCH cagent
 
@@ -728,7 +728,7 @@
                   "chunked-embeddings"
                 ]
               },
-              "model": {
+              "embedding_model": {
                 "type": "string",
                 "description": "Embedding model reference for chunked-embeddings strategies (looked up in models map, or 'auto' for automatic selection)",
                 "examples": [
@@ -804,6 +804,10 @@
                   "respect_word_boundaries": {
                     "type": "boolean",
                     "description": "When true, chunks will split on the nearest whitespace boundary instead of at the exact character limit, preventing words from being truncated."
+                  },
+                  "code_aware": {
+                    "type": "boolean",
+                    "description": "Enable code-aware chunking for source files. When true, the chunking strategy will prefer AST-based or language-aware processors when available (tree-sitter based), and fall back to plain text chunking for unsupported languages."
                   }
                 },
                 "additionalProperties": false
 
@@ -972,6 +972,30 @@ models:
 - `chunking.size`: Chunk size in characters (default: `1000`)
 - `chunking.overlap`: Overlap between chunks (default: `75`)
 
+**Code-Aware Chunking:**
+
+When indexing source code, you can enable code-aware chunking to produce semantically aligned chunks based on the code's AST (Abstract Syntax Tree). This keeps functions and methods intact rather than splitting them arbitrarily:
+
+```yaml
+rag:
+  codebase:
+    docs: [./src]
+    strategies:
+      - type: bm25
+        database: ./code.db
+        chunking:
+          size: 2000
+          code_aware: true  # Enable AST-based chunking
+```
+
+- `chunking.code_aware`: When `true`, uses tree-sitter for AST-based chunking (default: `false`). In this mode `size` is only indicative: chunks follow code boundaries, so their length may differ from it.
+
+**Notes:**
+- Currently supports **Go** source files (`.go`). More languages will be added incrementally.
+- Falls back to plain text chunking for unsupported file types.
+- Produces chunks that align with code structure (functions, methods, type declarations).
+- Particularly useful for code search and retrieval tasks.
+
 **Results:**
 - `limit`: Final number of results (default: `15`)
 - `deduplicate`: Remove duplicates (default: `true`)
 
@@ -33,6 +33,7 @@ require (
 	github.com/mattn/go-runewidth v0.0.19
 	github.com/modelcontextprotocol/go-sdk v1.1.0
 	github.com/openai/openai-go/v3 v3.8.1
+	github.com/smacker/go-tree-sitter v0.0.0-20240827094217-dd81d9e9be82
 	github.com/spf13/cobra v1.10.1
 	github.com/stretchr/testify v1.11.1
 	github.com/temoto/robotstxt v1.1.2
 
@@ -240,6 +240,8 @@ github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ
 github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ=
 github.com/skeema/knownhosts v1.3.1 h1:X2osQ+RAjK76shCbvhHHHVl3ZlgDm8apHEHFqRjnBY8=
 github.com/skeema/knownhosts v1.3.1/go.mod h1:r7KTdC8l4uxWRyK2TpQZ/1o5HaSzh06ePQNxPwTcfiY=
+github.com/smacker/go-tree-sitter v0.0.0-20240827094217-dd81d9e9be82 h1:6C8qej6f1bStuePVkLSFxoU22XBS165D3klxlzRg8F4=
+github.com/smacker/go-tree-sitter v0.0.0-20240827094217-dd81d9e9be82/go.mod h1:xe4pgH49k4SsmkQq5OT8abwhWmnzkhpgnXeekbx2efw=
 github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d h1:zE9ykElWQ6/NYmHa3jpm/yHnI4xSofP+UP6SpjHcSeM=
 github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
 github.com/smartystreets/goconvey v1.6.4 h1:fv0U8FUIMPNf1L9lnHLvLhgicrIVChEkdzIKYqbNC9s=
 
@@ -518,6 +518,13 @@ func unmarshalChunkingConfig(src any, dst *RAGChunkingConfig) {
 			dst.RespectWordBoundaries = val
 		}
 	}
+
+	// Handle code_aware - YAML should give us a bool; any other type is ignored and dst.CodeAware is left unchanged
+	if ca, ok := m["code_aware"]; ok {
+		if val, ok := ca.(bool); ok {
+			dst.CodeAware = val
+		}
+	}
 }
 
 // coerceToInt converts various numeric types to int
@@ -579,6 +586,11 @@ type RAGChunkingConfig struct {
 	Size                  int  `json:"size,omitempty"`
 	Overlap               int  `json:"overlap,omitempty"`
 	RespectWordBoundaries bool `json:"respect_word_boundaries,omitempty"`
+	// CodeAware enables code-aware chunking for source files. When true, the
+	// chunking strategy uses tree-sitter for AST-based chunking, producing
+	// semantically aligned chunks (e.g., whole functions). Falls back to
+	// plain text chunking for unsupported languages.
+	CodeAware bool `json:"code_aware,omitempty"`
 }
 
 // UnmarshalYAML implements custom unmarshaling to apply sensible defaults for chunking
 
@@ -19,26 +19,21 @@ type Chunk struct {
 	Metadata map[string]string
 }
 
-// Processor handles document processing
-type Processor struct{}
-
-// New creates a new document processor
-func New() *Processor {
-	return &Processor{}
+// DocumentProcessor takes file content and returns chunks.
+// Config (size, overlap, etc.) is set at construction time.
+type DocumentProcessor interface {
+	Process(path string, content []byte) ([]Chunk, error)
 }
 
-// ProcessFile reads a file and splits it into chunks
-func (p *Processor) ProcessFile(path string, chunkSize, overlap int, respectWordBoundaries bool) ([]Chunk, error) {
-	content, err := os.ReadFile(path)
-	if err != nil {
-		return nil, fmt.Errorf("failed to read file: %w", err)
-	}
-
-	return p.ChunkText(string(content), chunkSize, overlap, respectWordBoundaries), nil
+// TextDocumentProcessor is the default text-based chunker
+type TextDocumentProcessor struct {
+	size                  int
+	overlap               int
+	respectWordBoundaries bool
 }
 
-// ChunkText splits text into overlapping chunks
-func (p *Processor) ChunkText(text string, size, overlap int, respectWordBoundaries bool) []Chunk {
+// NewTextDocumentProcessor creates a text-based document processor
+func NewTextDocumentProcessor(size, overlap int, respectWordBoundaries bool) *TextDocumentProcessor {
 	if size <= 0 {
 		size = 1000
 	}
@@ -48,7 +43,20 @@ func (p *Processor) ChunkText(text string, size, overlap int, respectWordBoundar
 	if overlap >= size {
 		overlap = size / 2
 	}
+	return &TextDocumentProcessor{
+		size:                  size,
+		overlap:               overlap,
+		respectWordBoundaries: respectWordBoundaries,
+	}
+}
 
+// Process implements DocumentProcessor: it ignores the path argument and splits content into plain-text chunks. The returned error is always nil.
+func (t *TextDocumentProcessor) Process(_ string, content []byte) ([]Chunk, error) {
+	return t.chunkText(string(content)), nil
+}
+
+// chunkText splits text into overlapping chunks
+func (t *TextDocumentProcessor) chunkText(text string) []Chunk {
 	var chunks []Chunk
 	runes := []rune(text)
 	totalLen := len(runes)
@@ -62,7 +70,7 @@ func (p *Processor) ChunkText(text string, size, overlap int, respectWordBoundar
 
 	for start < totalLen {
 		// Calculate end position (start + size, but not beyond document end)
-		end := start + size
+		end := start + t.size
 		if end > totalLen {
 			end = totalLen
 		}
@@ -72,14 +80,14 @@ func (p *Processor) ChunkText(text string, size, overlap int, respectWordBoundar
 		// For the final chunk (end == totalLen) we always take the remainder
 		// of the document as-is to avoid generating progressively smaller
 		// tail chunks.
-		if respectWordBoundaries && end > start && end < totalLen {
+		if t.respectWordBoundaries && end > start && end < totalLen {
 			// Limit search to the current chunk window.
 			target := end
 
 			// Backtrack from target to find whitespace; if none is found
 			// in a reasonable range, keep the original end so that we
 			// still make progress even for very long "words".
-			searchEnd := p.findNearestWhitespace(runes[start:target+1], target-start) + start
+			searchEnd := t.findNearestWhitespace(runes[start:target+1], target-start) + start
 			if searchEnd > start && searchEnd < end {
 				end = searchEnd
 			}
@@ -100,7 +108,7 @@ func (p *Processor) ChunkText(text string, size, overlap int, respectWordBoundar
 		}
 
 		// Next chunk starts at the end of the previous chunk minus overlap
-		nextStart := end - overlap
+		nextStart := end - t.overlap
 
 		// CRITICAL: Ensure we always make forward progress
 		// If nextStart would move us backward or keep us in place, advance by at least 1
@@ -111,14 +119,14 @@ func (p *Processor) ChunkText(text string, size, overlap int, respectWordBoundar
 		// When respecting word boundaries, make sure the next chunk
 		// does not start in the middle of a word. Move the start
 		// forward to the next whitespace, then to the next non-whitespace.
-		if respectWordBoundaries {
+		if t.respectWordBoundaries {
 			// Move forward until we hit whitespace or end-of-text
-			for nextStart < totalLen && !p.isWhitespace(runes[nextStart]) {
+			for nextStart < totalLen && !t.isWhitespace(runes[nextStart]) {
 				nextStart++
 			}
 			// Skip the whitespace itself so we start at the first character
 			// of the next word (if any).
-			for nextStart < totalLen && p.isWhitespace(runes[nextStart]) {
+			for nextStart < totalLen && t.isWhitespace(runes[nextStart]) {
 				nextStart++
 			}
 		}
@@ -131,7 +139,7 @@ func (p *Processor) ChunkText(text string, size, overlap int, respectWordBoundar
 
 // findNearestWhitespace finds the nearest whitespace boundary to the target position
 // It searches backward first (within a reasonable distance), then forward if needed
-func (p *Processor) findNearestWhitespace(runes []rune, target int) int {
+func (t *TextDocumentProcessor) findNearestWhitespace(runes []rune, target int) int {
 	// Don't search beyond 20% of the total length in either direction
 	maxSearchDistance := len(runes) / 5
 	if maxSearchDistance < 50 {
@@ -144,9 +152,9 @@ func (p *Processor) findNearestWhitespace(runes []rune, target int) int {
 	// Search backward first (prefer to keep chunks slightly smaller)
 	for i := 0; i < maxSearchDistance && target-i > 0; i++ {
 		pos := target - i
-		if p.isWhitespace(runes[pos]) {
+		if t.isWhitespace(runes[pos]) {
 			// Skip consecutive whitespace
-			for pos > 0 && p.isWhitespace(runes[pos-1]) {
+			for pos > 0 && t.isWhitespace(runes[pos-1]) {
 				pos--
 			}
 			return pos
@@ -156,7 +164,7 @@ func (p *Processor) findNearestWhitespace(runes []rune, target int) int {
 	// Search forward if no whitespace found backward
 	for i := 1; i < maxSearchDistance && target+i < len(runes); i++ {
 		pos := target + i
-		if p.isWhitespace(runes[pos]) {
+		if t.isWhitespace(runes[pos]) {
 			return pos
 		}
 	}
@@ -166,12 +174,23 @@ func (p *Processor) findNearestWhitespace(runes []rune, target int) int {
 }
 
 // isWhitespace checks if a rune is whitespace
-func (p *Processor) isWhitespace(r rune) bool {
+func (t *TextDocumentProcessor) isWhitespace(r rune) bool {
 	return r == ' ' || r == '\t' || r == '\n' || r == '\r'
 }
 
+// --- File utility functions (standalone, not tied to any processor) ---
+
+// ProcessFile reads a file and processes it using the given document processor
+func ProcessFile(dp DocumentProcessor, path string) ([]Chunk, error) {
+	content, err := os.ReadFile(path)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read file: %w", err)
+	}
+	return dp.Process(path, content)
+}
+
 // FileHash calculates SHA256 hash of a file
-func (p *Processor) FileHash(path string) (string, error) {
+func FileHash(path string) (string, error) {
 	f, err := os.Open(path)
 	if err != nil {
 		return "", fmt.Errorf("failed to open file: %w", err)
@@ -188,12 +207,12 @@ func (p *Processor) FileHash(path string) (string, error) {
 
 // CollectFiles recursively collects all files from given paths
 // Skips paths that don't exist instead of returning an error
-func (p *Processor) CollectFiles(paths []string) ([]string, error) {
+func CollectFiles(paths []string) ([]string, error) {
 	var files []string
 	seen := make(map[string]bool)
 
 	for _, pattern := range paths {
-		expanded, err := p.expandPattern(pattern)
+		expanded, err := expandPattern(pattern)
 		if err != nil {
 			return nil, err
 		}
@@ -245,7 +264,7 @@ func (p *Processor) CollectFiles(paths []string) ([]string, error) {
 
 // Matches reports whether the given path matches any configured document path or glob pattern.
 // To be used in file watchers to determine if a new/changed file matches the glob patterns or not.
-func (p *Processor) Matches(path string, patterns []string) (bool, error) {
+func Matches(path string, patterns []string) (bool, error) {
 	if len(patterns) == 0 {
 		return false, nil
 	}
@@ -293,7 +312,7 @@ func (p *Processor) Matches(path string, patterns []string) (bool, error) {
 	return false, nil
 }
 
-func (p *Processor) expandPattern(pattern string) ([]string, error) {
+func expandPattern(pattern string) ([]string, error) {
 	if !hasGlob(pattern) {
 		return []string{normalizePath(pattern)}, nil
 	}
Original file line number	Diff line number	Diff line change
`@@ -518,6 +518,13 @@ func unmarshalChunkingConfig(src any, dst *RAGChunkingConfig) {`
`518`	`518`	`dst.RespectWordBoundaries = val`
`519`	`519`	`}`
`520`	`520`	`}`
	`521`	`+`
	`522`	`+ // Handle code_aware - YAML should give us a bool`
	`523`	`+ if ca, ok := m["code_aware"]; ok {`
	`524`	`+ if val, ok := ca.(bool); ok {`
	`525`	`+ dst.CodeAware = val`
	`526`	`+ }`
	`527`	`+ }`
`521`	`528`	`}`
`522`	`529`
`523`	`530`	`// coerceToInt converts various numeric types to int`
`@@ -579,6 +586,11 @@ type RAGChunkingConfig struct {`
`579`	`586`	``Size int `json:"size,omitempty"` ``
`580`	`587`	``Overlap int `json:"overlap,omitempty"` ``
`581`	`588`	``RespectWordBoundaries bool `json:"respect_word_boundaries,omitempty"` ``
	`589`	`+ // CodeAware enables code-aware chunking for source files. When true, the`
	`590`	`+ // chunking strategy uses tree-sitter for AST-based chunking, producing`
	`591`	`+ // semantically aligned chunks (e.g., whole functions). Falls back to`
	`592`	`+ // plain text chunking for unsupported languages.`
	`593`	``+ CodeAware bool `json:"code_aware,omitempty"` ``
`582`	`594`	`}`
`583`	`595`
`584`	`596`	`// UnmarshalYAML implements custom unmarshaling to apply sensible defaults for chunking`