Skip to content

Commit c3ff3e7

Browse files
authored
Merge pull request #982 from krissetto/make-rag-strats-respect-gitignore
RAG: Allow `respect_vcs` config
2 parents 7ad7368 + 43744b7 commit c3ff3e7

15 files changed

Lines changed: 652 additions & 263 deletions

cagent-schema.json

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -709,6 +709,11 @@
709709
"type": "string"
710710
}
711711
},
712+
"respect_vcs": {
713+
"type": "boolean",
714+
"description": "Whether to respect VCS ignore files (e.g., .gitignore) when collecting documents for indexing. When true (default), files matching ignore patterns will be excluded. Can be overridden per-strategy.",
715+
"default": true
716+
},
712717
"strategies": {
713718
"type": "array",
714719
"description": "Array of retrieval strategy configurations. Each strategy can have different parameters based on its type.",
@@ -831,6 +836,10 @@
831836
"minimum": 1,
832837
"default": 3
833838
},
839+
"respect_vcs": {
840+
"type": "boolean",
841+
"description": "Override the RAG-level respect_vcs setting for this strategy only."
842+
},
834843
"chat_model": {
835844
"type": "string",
836845
"description": "Chat model used to generate semantic representations for each chunk (semantic-embeddings only, required)",

docs/USAGE.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -974,6 +974,7 @@ models:
974974
|-------|------|-------------|
975975
| `docs` | []string | Document paths/directories (shared across strategies) |
976976
| `description` | string | Human-readable description |
977+
| `respect_vcs` | boolean | Whether to respect VCS ignore files like .gitignore (default: `true`) |
977978
| `strategies` | []object | Array of strategy configurations |
978979
| `results` | object | Post-processing configuration |
979980

@@ -983,6 +984,7 @@ models:
983984
- `database`: Database configuration (path to local sqlite db)
984985
- `chunking`: Chunking configuration
985986
- `limit`: Max results from this strategy (for fusion)
987+
- `respect_vcs`: Override RAG-level VCS ignore setting for this strategy only (optional)
986988

987989
**Chunked-Embeddings Strategy Parameters:**
988990
- `model` (required): Embedding model reference

pkg/config/latest/types.go

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -309,12 +309,21 @@ type RAGToolConfig struct {
309309
// RAGConfig represents a RAG (Retrieval-Augmented Generation) configuration
310310
// Uses a unified strategies array for flexible, extensible configuration
311311
type RAGConfig struct {
312-
Tool RAGToolConfig `json:"tool,omitempty"` // Tool configuration
313-
Docs []string `json:"docs,omitempty"` // Shared documents across all strategies
314-
Strategies []RAGStrategyConfig `json:"strategies,omitempty"` // Array of strategy configurations
312+
Tool RAGToolConfig `json:"tool,omitempty"` // Tool configuration
313+
Docs []string `json:"docs,omitempty"` // Shared documents across all strategies
314+
RespectVCS *bool `json:"respect_vcs,omitempty"` // Whether to respect VCS ignore files like .gitignore (default: true)
315+
Strategies []RAGStrategyConfig `json:"strategies,omitempty"` // Array of strategy configurations
315316
Results RAGResultsConfig `json:"results,omitempty"`
316317
}
317318

319+
// GetRespectVCS returns whether VCS ignore files should be respected, defaulting to true
320+
func (c *RAGConfig) GetRespectVCS() bool {
321+
if c.RespectVCS == nil {
322+
return true
323+
}
324+
return *c.RespectVCS
325+
}
326+
318327
// RAGStrategyConfig represents a single retrieval strategy configuration
319328
// Strategy-specific fields are stored in Params (validated by strategy implementation)
320329
type RAGStrategyConfig struct {

pkg/fsx/collect.go

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
package fsx
2+
3+
import (
4+
"fmt"
5+
"os"
6+
"path/filepath"
7+
"strings"
8+
9+
"github.com/bmatcuk/doublestar/v4"
10+
)
11+
12+
// CollectFiles recursively collects all files from given paths.
13+
// Supports glob patterns (via doublestar), directories, and individual files.
14+
// Skips paths that don't exist instead of returning an error.
15+
// Optional shouldIgnore filter can exclude files/directories (return true to skip).
16+
func CollectFiles(paths []string, shouldIgnore func(path string) bool) ([]string, error) {
17+
var files []string
18+
seen := make(map[string]bool)
19+
20+
for _, pattern := range paths {
21+
expanded, err := expandPattern(pattern)
22+
if err != nil {
23+
return nil, err
24+
}
25+
if len(expanded) == 0 {
26+
expanded = []string{pattern}
27+
}
28+
29+
for _, entry := range expanded {
30+
normalized := normalizePath(entry)
31+
32+
// Check if this path should be ignored
33+
if shouldIgnore != nil && shouldIgnore(normalized) {
34+
continue
35+
}
36+
37+
info, err := os.Stat(normalized)
38+
if err != nil {
39+
if os.IsNotExist(err) {
40+
continue
41+
}
42+
return nil, fmt.Errorf("failed to stat %s: %w", entry, err)
43+
}
44+
45+
if info.IsDir() {
46+
// Use DirectoryTree to collect files from directory
47+
tree, err := DirectoryTree(normalized, func(string) error { return nil }, shouldIgnore, 0, 0)
48+
if err != nil {
49+
return nil, fmt.Errorf("failed to read directory %s: %w", normalized, err)
50+
}
51+
// Traverse tree and collect absolute file paths
52+
var dirFiles []string
53+
CollectFilesFromTree(tree, filepath.Dir(normalized), &dirFiles)
54+
for _, f := range dirFiles {
55+
absPath := normalizePath(f)
56+
if !seen[absPath] {
57+
files = append(files, absPath)
58+
seen[absPath] = true
59+
}
60+
}
61+
continue
62+
}
63+
64+
if !seen[normalized] {
65+
files = append(files, normalized)
66+
seen[normalized] = true
67+
}
68+
}
69+
}
70+
71+
return files, nil
72+
}
73+
74+
// Matches reports whether the given path matches any configured path or glob pattern.
75+
// Useful for file watchers to determine if a changed file matches configured patterns.
76+
func Matches(path string, patterns []string) (bool, error) {
77+
if len(patterns) == 0 {
78+
return false, nil
79+
}
80+
81+
cleanPath := normalizePath(path)
82+
83+
for _, pattern := range patterns {
84+
if pattern == "" {
85+
continue
86+
}
87+
88+
normalizedPattern := normalizePath(pattern)
89+
90+
if hasGlob(pattern) {
91+
match, err := doublestar.PathMatch(normalizedPattern, cleanPath)
92+
if err != nil {
93+
return false, fmt.Errorf("invalid glob pattern %q: %w", pattern, err)
94+
}
95+
if match {
96+
return true, nil
97+
}
98+
continue
99+
}
100+
101+
info, err := os.Stat(normalizedPattern)
102+
if err != nil {
103+
if os.IsNotExist(err) {
104+
continue
105+
}
106+
return false, fmt.Errorf("failed to stat %s: %w", normalizedPattern, err)
107+
}
108+
109+
if info.IsDir() {
110+
if cleanPath == normalizedPattern || strings.HasPrefix(cleanPath, normalizedPattern+string(os.PathSeparator)) {
111+
return true, nil
112+
}
113+
continue
114+
}
115+
116+
if cleanPath == normalizedPattern {
117+
return true, nil
118+
}
119+
}
120+
121+
return false, nil
122+
}
123+
124+
func expandPattern(pattern string) ([]string, error) {
125+
if !hasGlob(pattern) {
126+
return []string{normalizePath(pattern)}, nil
127+
}
128+
129+
matches, err := doublestar.FilepathGlob(pattern)
130+
if err != nil {
131+
return nil, fmt.Errorf("invalid glob pattern %q: %w", pattern, err)
132+
}
133+
134+
results := make([]string, 0, len(matches))
135+
for _, match := range matches {
136+
results = append(results, normalizePath(match))
137+
}
138+
139+
return results, nil
140+
}
141+
142+
// hasGlob reports whether pattern contains a glob metacharacter and therefore
// needs pattern expansion rather than being treated as a literal path.
//
// The recognized characters are '*', '?', '[' (character classes) and '{'
// (brace alternation such as "docs/{a,b}.md"); the glob library used by this
// package treats all of them as pattern syntax.
func hasGlob(pattern string) bool {
	return strings.ContainsAny(pattern, "*?[{")
}
145+
146+
// normalizePath returns p as a cleaned absolute path so that paths can be
// compared as plain strings. If the absolute form cannot be determined, the
// cleaned form of p itself is returned instead.
func normalizePath(p string) string {
	abs, err := filepath.Abs(p)
	if err != nil {
		// Could not resolve against the working directory; fall back to a
		// purely lexical cleanup of the input.
		return filepath.Clean(p)
	}
	return filepath.Clean(abs)
}

0 commit comments

Comments
 (0)