
Commit 5a610fa

shantanu patil authored and committed
Merge branch 'worktree-agent-a0599a3b'
2 parents 73a0e4b + 21e6762

2 files changed: 174 additions & 73 deletions

api/config/embedder.json

Lines changed: 2 additions & 2 deletions
@@ -35,7 +35,7 @@
   },
   "text_splitter": {
     "split_by": "word",
-    "chunk_size": 350,
-    "chunk_overlap": 100
+    "chunk_size": 800,
+    "chunk_overlap": 200
   }
 }
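For context, the splitter change trades retrieval granularity for coverage: at 800 words per chunk with a 200-word overlap, consecutive chunks share a quarter of their content and each chunk starts 600 words after the previous one, so a repository produces fewer, larger chunks. A minimal sketch of what a word-based splitter with these settings does (illustrative only; the actual text_splitter is provided by the underlying embedding library, not this function):

def split_by_word(text: str, chunk_size: int = 800, chunk_overlap: int = 200) -> list[str]:
    """Split text into overlapping word-based chunks (sketch of the configured behavior)."""
    words = text.split()
    step = chunk_size - chunk_overlap  # each chunk starts 600 words after the previous one
    return [
        " ".join(words[i:i + chunk_size])
        for i in range(0, max(len(words) - chunk_overlap, 1), step)
    ]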

api/data_pipeline.py

Lines changed: 172 additions & 71 deletions
@@ -24,6 +24,22 @@
 # Maximum token limit for OpenAI embedding models
 MAX_EMBEDDING_TOKENS = 8192
 
+# Cached tiktoken encoders to avoid re-creating on every count_tokens call
+_tiktoken_encoder_cache = {}
+
+def _get_encoder(model_name=None):
+    """Get or create a cached tiktoken encoder."""
+    key = model_name or "default"
+    if key not in _tiktoken_encoder_cache:
+        if model_name:
+            try:
+                _tiktoken_encoder_cache[key] = tiktoken.encoding_for_model(model_name)
+            except KeyError:
+                _tiktoken_encoder_cache[key] = tiktoken.get_encoding("cl100k_base")
+        else:
+            _tiktoken_encoder_cache[key] = tiktoken.get_encoding("cl100k_base")
+    return _tiktoken_encoder_cache[key]
+
 def count_tokens(text: str, embedder_type: str = None, is_ollama_embedder: bool = None) -> int:
     """
     Count the number of tokens in a text string using tiktoken.
@@ -42,25 +58,25 @@ def count_tokens(text: str, embedder_type: str = None, is_ollama_embedder: bool
         # Handle backward compatibility
         if embedder_type is None and is_ollama_embedder is not None:
            embedder_type = 'ollama' if is_ollama_embedder else None
-
+
         # Determine embedder type if not specified
         if embedder_type is None:
             from api.config import get_embedder_type
             embedder_type = get_embedder_type()
 
-        # Choose encoding based on embedder type
+        # Choose encoding based on embedder type (using cached encoders)
         if embedder_type == 'ollama':
             # Ollama typically uses cl100k_base encoding
-            encoding = tiktoken.get_encoding("cl100k_base")
+            encoding = _get_encoder()
         elif embedder_type == 'google':
             # Google uses similar tokenization to GPT models for rough estimation
-            encoding = tiktoken.get_encoding("cl100k_base")
+            encoding = _get_encoder()
         elif embedder_type == 'bedrock':
             # Bedrock embedding models vary; use a common GPT-like encoding for rough estimation
-            encoding = tiktoken.get_encoding("cl100k_base")
+            encoding = _get_encoder()
         else:  # OpenAI or default
             # Use OpenAI embedding model encoding
-            encoding = tiktoken.encoding_for_model("text-embedding-3-small")
+            encoding = _get_encoder("text-embedding-3-small")
 
         return len(encoding.encode(text))
     except Exception as e:
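A quick illustration of the cache behavior (hypothetical snippet; _get_encoder and the model name come from the diff above):

enc = _get_encoder("text-embedding-3-small")    # first call builds the encoder and caches it
same = _get_encoder("text-embedding-3-small")   # subsequent calls return the cached object
assert enc is same
_get_encoder("some-future-model")               # unrecognized model names fall back to cl100k_base
print(len(enc.encode("def hello_world(): return 42")))  # token count for a small snippet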
@@ -69,6 +85,67 @@ def count_tokens(text: str, embedder_type: str = None, is_ollama_embedder: bool
         # Rough approximation: 4 characters per token
         return len(text) // 4
 
+# --- Large repo optimization constants ---
+
+# Additional directories to exclude for large repos (merged with DEFAULT_EXCLUDED_DIRS)
+ADDITIONAL_EXCLUDED_DIRS = {
+    'vendor', 'third_party', 'external', 'deps',
+    '.next', '.nuxt', '.svelte-kit', '.output',
+    'generated', 'auto_generated', 'codegen',
+    'fixtures', 'testdata', '__snapshots__',
+    'migrations', 'dist', 'build', 'out',
+    '.cache', '.tmp', '.temp',
+    'coverage', '.nyc_output',
+    'bower_components', 'jspm_packages',
+}
+
+# File size limits — skip oversized files that are unlikely to be useful
+MAX_CODE_FILE_SIZE = 100_000  # 100KB - skip very large source files
+MAX_DOC_FILE_SIZE = 50_000  # 50KB - skip very large docs
+
+# Total ingestion token budget — stop reading files once this is exhausted
+MAX_TOTAL_INGESTION_TOKENS = 2_000_000  # 2M token budget
+
+
+def _is_binary(file_path: str) -> bool:
+    """Check if file appears to be binary by looking for null bytes."""
+    try:
+        with open(file_path, 'rb') as f:
+            chunk = f.read(512)
+        return b'\x00' in chunk
+    except (IOError, OSError):
+        return True
+
+
+def _file_priority(file_path: str) -> int:
+    """
+    Assign a priority score to a file path for processing order.
+    Lower number = higher priority.
+    """
+    name = os.path.basename(file_path).lower()
+    rel_path = file_path.lower()
+
+    # Root docs are highest priority
+    if name in ('readme.md', 'readme.rst', 'readme.txt', 'readme'):
+        return 0
+    if name in ('contributing.md', 'architecture.md', 'changelog.md'):
+        return 1
+    # Config files
+    if name in ('package.json', 'cargo.toml', 'pyproject.toml', 'go.mod',
+                'pom.xml', 'build.gradle'):
+        return 2
+    # Source directories
+    if any(d in rel_path for d in ('/src/', '/lib/', '/app/', '/api/', '/pkg/', '/internal/')):
+        return 3
+    # Documentation
+    if any(d in rel_path for d in ('/docs/', '/doc/', '/documentation/')):
+        return 4
+    # Test files
+    if any(d in rel_path for d in ('/test/', '/tests/', '/spec/', '/__tests__/')):
+        return 6
+    return 5
+
+
 def download_repo(repo_url: str, local_path: str, repo_type: str = None, access_token: str = None) -> str:
     """
     Downloads a Git repository (GitHub, GitLab, or Bitbucket) to a specified local path.
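The ordering only matters because ingestion can now stop early: sorting puts READMEs, manifests, and source files ahead of tests, so the token budget introduced further down cuts the low-priority tail first. One caveat: the directory checks look for substrings like '/src/', so a top-level path such as 'src/main.py' (relpath yields no leading slash) lands in the default bucket 5, while nested paths match as intended. A hypothetical sanity check with invented paths:

sample = [
    "backend/src/main.py",         # source directory  -> 3
    "backend/tests/test_main.py",  # test directory    -> 6
    "README.md",                   # root doc          -> 0
    "site/docs/usage.md",          # documentation     -> 4
    "pyproject.toml",              # config manifest   -> 2
]
sample.sort(key=_file_priority)
# Expected order: README.md, pyproject.toml, backend/src/main.py,
#                 site/docs/usage.md, backend/tests/test_main.py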
@@ -125,7 +202,7 @@ def download_repo(repo_url: str, local_path: str, repo_type: str = None, access_
         logger.info(f"Cloning repository from {repo_url} to {local_path}")
         # We use repo_url in the log to avoid exposing the token in logs
         result = subprocess.run(
-            ["git", "clone", "--depth=1", "--single-branch", clone_url, local_path],
+            ["git", "clone", "--depth=1", "--single-branch", "--filter=blob:limit=1m", clone_url, local_path],
             check=True,
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
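The added --filter=blob:limit=1m turns the shallow clone into a partial clone: the initial pack omits blobs larger than 1 MiB, and git fetches missing blobs on demand. Two caveats worth noting: the default checkout of HEAD still faults in any over-limit blobs it actually needs, so combined with --depth=1 the filter mostly defers rather than eliminates those downloads; and a server without partial-clone support ignores the filter and sends a full pack, so the flag degrades gracefully.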
@@ -205,6 +282,9 @@ def read_all_documents(path: str, embedder_type: str = None, is_ollama_embedder:
     final_excluded_dirs = set(DEFAULT_EXCLUDED_DIRS)
     final_excluded_files = set(DEFAULT_EXCLUDED_FILES)
 
+    # Merge in the additional excluded dirs for large-repo optimization
+    final_excluded_dirs.update(ADDITIONAL_EXCLUDED_DIRS)
+
     # Add any additional excluded directories from config
     if "file_filters" in configs and "excluded_dirs" in configs["file_filters"]:
         final_excluded_dirs.update(configs["file_filters"]["excluded_dirs"])
@@ -301,82 +381,103 @@ def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List
 
         return not is_excluded
 
-    # Process code files first
-    for ext in code_extensions:
+    # Collect all candidate files first, then sort by priority before processing
+    all_extensions = set(code_extensions + doc_extensions)
+    code_ext_set = set(code_extensions)
+
+    candidate_files = []
+    for ext in all_extensions:
         files = glob.glob(f"{path}/**/*{ext}", recursive=True)
         for file_path in files:
             # Check if file should be processed based on inclusion/exclusion rules
             if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
                 continue
+            candidate_files.append(file_path)
+
+    # Sort by priority so the most important files are processed first
+    candidate_files.sort(key=lambda fp: _file_priority(os.path.relpath(fp, path)))
+    logger.info(f"Found {len(candidate_files)} candidate files after filtering (sorted by priority)")
 
+    # Process files with cumulative token budget enforcement
+    cumulative_tokens = 0
+
+    for file_path in candidate_files:
+        ext = os.path.splitext(file_path)[1].lower()
+        is_code = ext in code_ext_set
+        relative_path = os.path.relpath(file_path, path)
+
+        try:
+            # --- File size filter ---
             try:
-                with open(file_path, "r", encoding="utf-8") as f:
-                    content = f.read()
-                relative_path = os.path.relpath(file_path, path)
-
-                # Determine if this is an implementation file
-                is_implementation = (
-                    not relative_path.startswith("test_")
-                    and not relative_path.startswith("app_")
-                    and "test" not in relative_path.lower()
-                )
+                file_size = os.path.getsize(file_path)
+            except OSError:
+                continue
+            size_limit = MAX_CODE_FILE_SIZE if is_code else MAX_DOC_FILE_SIZE
+            if file_size > size_limit:
+                logger.debug(f"Skipping oversized file {relative_path}: {file_size} bytes > {size_limit}")
+                continue
 
-                # Check token count
-                token_count = count_tokens(content, embedder_type)
-                if token_count > MAX_EMBEDDING_TOKENS * 10:
-                    logger.warning(f"Skipping large file {relative_path}: Token count ({token_count}) exceeds limit")
-                    continue
-
-                doc = Document(
-                    text=content,
-                    meta_data={
-                        "file_path": relative_path,
-                        "type": ext[1:],
-                        "is_code": True,
-                        "is_implementation": is_implementation,
-                        "title": relative_path,
-                        "token_count": token_count,
-                    },
-                )
-                documents.append(doc)
-            except Exception as e:
-                logger.error(f"Error reading {file_path}: {e}")
+            # --- Binary file detection ---
+            if _is_binary(file_path):
+                logger.debug(f"Skipping binary file {relative_path}")
+                continue
 
-    # Then process documentation files
-    for ext in doc_extensions:
-        files = glob.glob(f"{path}/**/*{ext}", recursive=True)
-        for file_path in files:
-            # Check if file should be processed based on inclusion/exclusion rules
-            if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
+            with open(file_path, "r", encoding="utf-8") as f:
+                content = f.read()
+
+            # --- Per-file token limit (existing logic preserved) ---
+            token_count = count_tokens(content, embedder_type)
+            max_tokens_for_file = MAX_EMBEDDING_TOKENS * 10 if is_code else MAX_EMBEDDING_TOKENS
+            if token_count > max_tokens_for_file:
+                logger.warning(f"Skipping large file {relative_path}: Token count ({token_count}) exceeds limit")
                 continue
 
-            try:
-                with open(file_path, "r", encoding="utf-8") as f:
-                    content = f.read()
-                relative_path = os.path.relpath(file_path, path)
-
-                # Check token count
-                token_count = count_tokens(content, embedder_type)
-                if token_count > MAX_EMBEDDING_TOKENS:
-                    logger.warning(f"Skipping large file {relative_path}: Token count ({token_count}) exceeds limit")
-                    continue
-
-                doc = Document(
-                    text=content,
-                    meta_data={
-                        "file_path": relative_path,
-                        "type": ext[1:],
-                        "is_code": False,
-                        "is_implementation": False,
-                        "title": relative_path,
-                        "token_count": token_count,
-                    },
-                )
-                documents.append(doc)
-            except Exception as e:
-                logger.error(f"Error reading {file_path}: {e}")
+            # --- Cumulative token budget ---
+            cumulative_tokens += token_count
+            if cumulative_tokens > MAX_TOTAL_INGESTION_TOKENS:
+                logger.warning(
+                    f"Token budget exhausted ({cumulative_tokens} > {MAX_TOTAL_INGESTION_TOKENS}). "
+                    f"Stopping file ingestion at {relative_path}. {len(documents)} documents collected so far."
+                )
+                break
+
+            if is_code:
+                # Determine if this is an implementation file
+                is_implementation = (
+                    not relative_path.startswith("test_")
+                    and not relative_path.startswith("app_")
+                    and "test" not in relative_path.lower()
+                )
+
+                doc = Document(
+                    text=content,
+                    meta_data={
+                        "file_path": relative_path,
+                        "type": ext[1:],
+                        "is_code": True,
+                        "is_implementation": is_implementation,
+                        "title": relative_path,
+                        "token_count": token_count,
+                    },
+                )
+            else:
+                doc = Document(
+                    text=content,
+                    meta_data={
+                        "file_path": relative_path,
+                        "type": ext[1:],
+                        "is_code": False,
+                        "is_implementation": False,
+                        "title": relative_path,
+                        "token_count": token_count,
+                    },
+                )
+
+            documents.append(doc)
+        except Exception as e:
+            logger.error(f"Error reading {file_path}: {e}")
 
-    logger.info(f"Found {len(documents)} documents")
+    logger.info(f"Found {len(documents)} documents (cumulative tokens: {cumulative_tokens})")
     return documents
 
 def prepare_data_pipeline(embedder_type: str = None, is_ollama_embedder: bool = None):
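For scale: with the fallback estimate of 4 characters per token, the 2,000,000-token budget corresponds to roughly 8 MB of text. A single code file is capped at MAX_EMBEDDING_TOKENS * 10 = 81,920 tokens, so even pathological repositories admit at least 2,000,000 // 81,920 = 24 maximal files before ingestion stops; typical files are far smaller, so thousands fit. And because candidates were priority-sorted above, whatever the budget cuts off is the low-priority tail, not the READMEs or core sources.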
