Skip to content

Commit b836664

Browse files
shantanu patil and claude
authored and committed
Add Phase 9: robust large-repo ingestion with retry, backoff, and filtering
- ingest.py: add --delay, --retry-failed, --max-retries for reliable page generation
- ingest.py: add --max-files, --include-dirs, --exclude-dirs for large repo filtering
- google_embedder_client: exponential backoff (1-16s + jitter) on 429/503 errors
- google_embedder_client: 200ms inter-batch cooldown to respect rate limits
- prompts.py: improve diagram quality for large repos (3-5 node simplified diagrams)
- data_pipeline/rag/websocket_wiki: propagate max_files filter through full pipeline
- useWikiGeneration.ts: matching frontend prompt improvements
- repos.json: trim to claude-code, gemini-cli, vigilant-sanderson

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent b86b94b commit b836664

8 files changed

Lines changed: 350 additions & 73 deletions

File tree

api/data_pipeline.py

Lines changed: 20 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -227,9 +227,10 @@ def download_repo(repo_url: str, local_path: str, repo_type: str = None, access_
227227
# Alias for backward compatibility
228228
download_github_repo = download_repo
229229

230-
def read_all_documents(path: str, embedder_type: str = None, is_ollama_embedder: bool = None,
230+
def read_all_documents(path: str, embedder_type: str = None, is_ollama_embedder: bool = None,
231231
excluded_dirs: List[str] = None, excluded_files: List[str] = None,
232-
included_dirs: List[str] = None, included_files: List[str] = None):
232+
included_dirs: List[str] = None, included_files: List[str] = None,
233+
max_files: int = 0):
233234
"""
234235
Recursively reads all documents in a directory and its subdirectories.
235236
@@ -247,6 +248,8 @@ def read_all_documents(path: str, embedder_type: str = None, is_ollama_embedder:
247248
When provided, only files in these directories will be processed.
248249
included_files (List[str], optional): List of file patterns to include exclusively.
249250
When provided, only files matching these patterns will be processed.
251+
max_files (int, optional): Maximum number of files to process (0 = unlimited).
252+
When set, only the top N files by priority are included after sorting.
250253
251254
Returns:
252255
list: A list of Document objects with metadata.
@@ -398,6 +401,11 @@ def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List
398401
candidate_files.sort(key=lambda fp: _file_priority(os.path.relpath(fp, path)))
399402
logger.info(f"Found {len(candidate_files)} candidate files after filtering (sorted by priority)")
400403

404+
# Apply max_files limit if set (keep only top N by priority)
405+
if max_files > 0 and len(candidate_files) > max_files:
406+
logger.info(f"Applying max_files limit: {max_files} (from {len(candidate_files)} candidates)")
407+
candidate_files = candidate_files[:max_files]
408+
401409
# Process files with cumulative token budget enforcement
402410
cumulative_tokens = 0
403411

@@ -823,7 +831,8 @@ def __init__(self):
823831
def prepare_database(self, repo_url_or_path: str, repo_type: str = None, access_token: str = None,
824832
embedder_type: str = None, is_ollama_embedder: bool = None,
825833
excluded_dirs: List[str] = None, excluded_files: List[str] = None,
826-
included_dirs: List[str] = None, included_files: List[str] = None) -> List[Document]:
834+
included_dirs: List[str] = None, included_files: List[str] = None,
835+
max_files: int = 0) -> List[Document]:
827836
"""
828837
Create a new database from the repository.
829838
@@ -839,6 +848,7 @@ def prepare_database(self, repo_url_or_path: str, repo_type: str = None, access_
839848
excluded_files (List[str], optional): List of file patterns to exclude from processing
840849
included_dirs (List[str], optional): List of directories to include exclusively
841850
included_files (List[str], optional): List of file patterns to include exclusively
851+
max_files (int, optional): Maximum number of files to process (0 = unlimited)
842852
843853
Returns:
844854
List[Document]: List of Document objects
@@ -850,7 +860,7 @@ def prepare_database(self, repo_url_or_path: str, repo_type: str = None, access_
850860
self.reset_database()
851861
self._create_repo(repo_url_or_path, repo_type, access_token)
852862
return self.prepare_db_index(embedder_type=embedder_type, excluded_dirs=excluded_dirs, excluded_files=excluded_files,
853-
included_dirs=included_dirs, included_files=included_files)
863+
included_dirs=included_dirs, included_files=included_files, max_files=max_files)
854864

855865
def reset_database(self):
856866
"""
@@ -929,9 +939,10 @@ def _create_repo(self, repo_url_or_path: str, repo_type: str = None, access_toke
929939
logger.error(f"Failed to create repository structure: {e}")
930940
raise
931941

932-
def prepare_db_index(self, embedder_type: str = None, is_ollama_embedder: bool = None,
942+
def prepare_db_index(self, embedder_type: str = None, is_ollama_embedder: bool = None,
933943
excluded_dirs: List[str] = None, excluded_files: List[str] = None,
934-
included_dirs: List[str] = None, included_files: List[str] = None) -> List[Document]:
944+
included_dirs: List[str] = None, included_files: List[str] = None,
945+
max_files: int = 0) -> List[Document]:
935946
"""
936947
Prepare the indexed database for the repository.
937948
@@ -944,6 +955,7 @@ def prepare_db_index(self, embedder_type: str = None, is_ollama_embedder: bool =
944955
excluded_files (List[str], optional): List of file patterns to exclude from processing
945956
included_dirs (List[str], optional): List of directories to include exclusively
946957
included_files (List[str], optional): List of file patterns to include exclusively
958+
max_files (int, optional): Maximum number of files to process (0 = unlimited)
947959
948960
Returns:
949961
List[Document]: List of Document objects
@@ -1003,7 +1015,8 @@ def _embedding_vector_length(doc: Document) -> int:
10031015
excluded_dirs=excluded_dirs,
10041016
excluded_files=excluded_files,
10051017
included_dirs=included_dirs,
1006-
included_files=included_files
1018+
included_files=included_files,
1019+
max_files=max_files,
10071020
)
10081021
self.db = transform_documents_and_save_to_db(
10091022
documents, self.repo_paths["save_db_file"], embedder_type=embedder_type

api/google_embedder_client.py

Lines changed: 101 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,8 @@
22

33
import os
44
import logging
5-
import backoff
5+
import random
6+
import time
67
from typing import Dict, Any, Optional, List, Sequence
78

89
from adalflow.core.model_client import ModelClient
@@ -54,16 +55,21 @@ def __init__(
5455
self,
5556
api_key: Optional[str] = None,
5657
env_api_key_name: str = "GOOGLE_API_KEY",
58+
inter_batch_delay: float = 0.2,
5759
):
5860
"""Initialize Google AI Embeddings client.
59-
61+
6062
Args:
6163
api_key: Google AI API key. If not provided, uses environment variable.
6264
env_api_key_name: Name of environment variable containing API key.
65+
inter_batch_delay: Seconds to sleep after each successful embedding
66+
API call to avoid burst-hitting rate limits (default: 0.2s).
67+
Set to 0 to disable.
6368
"""
6469
super().__init__()
6570
self._api_key = api_key
6671
self._env_api_key_name = env_api_key_name
72+
self._inter_batch_delay = inter_batch_delay
6773
self._initialize_client()
6874

6975
def _initialize_client(self):
@@ -205,27 +211,57 @@ def convert_inputs_to_api_kwargs(
205211

206212
return final_model_kwargs
207213

208-
@backoff.on_exception(
209-
backoff.expo,
210-
(Exception,), # Google AI may raise various exceptions
211-
max_time=5,
212-
)
214+
# Retry configuration for rate-limit and transient errors
215+
_MAX_RETRIES = 5
216+
_BASE_DELAY = 1.0 # seconds
217+
_MAX_DELAY = 16.0 # seconds (cap for exponential backoff)
218+
_JITTER_MAX = 1.0 # max random jitter in seconds
219+
220+
@staticmethod
221+
def _is_retryable(exc: Exception) -> bool:
222+
"""Return True if the exception indicates a retryable error (429 / 503)."""
223+
exc_str = str(exc).lower()
224+
# google.api_core.exceptions.ResourceExhausted (429)
225+
if "resourceexhausted" in type(exc).__name__.lower():
226+
return True
227+
# google.api_core.exceptions.ServiceUnavailable (503)
228+
if "serviceunavailable" in type(exc).__name__.lower():
229+
return True
230+
# Catch by HTTP status code mentions in the message
231+
if "429" in exc_str or "resource exhausted" in exc_str:
232+
return True
233+
if "503" in exc_str or "service unavailable" in exc_str:
234+
return True
235+
# google.generativeai may raise a generic exception wrapping these
236+
if hasattr(exc, "code"):
237+
code = getattr(exc, "code", None)
238+
if code in (429, 503):
239+
return True
240+
if hasattr(exc, "status_code"):
241+
status = getattr(exc, "status_code", None)
242+
if status in (429, 503):
243+
return True
244+
return False
245+
213246
def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINED):
214-
"""Call Google AI embedding API.
215-
247+
"""Call Google AI embedding API with retry + exponential backoff.
248+
249+
Retries on 429 (ResourceExhausted) and 503 (ServiceUnavailable) errors
250+
with exponential backoff: 1s, 2s, 4s, 8s, 16s plus random jitter.
251+
216252
Args:
217253
api_kwargs: API parameters
218254
model_type: Should be ModelType.EMBEDDER
219-
255+
220256
Returns:
221257
Google AI embedding response
222258
"""
223259
if model_type != ModelType.EMBEDDER:
224260
raise ValueError(f"GoogleEmbedderClient only supports EMBEDDER model type")
225-
261+
226262
# DEBUG LOGGING (Simplified)
227263
log.info(f"DEBUG: GoogleEmbedderClient.call received api_kwargs keys: {list(api_kwargs.keys())}")
228-
264+
229265
safe_log_kwargs = {k: v for k, v in api_kwargs.items() if k not in {"content", "contents"}}
230266
if "content" in api_kwargs:
231267
safe_log_kwargs["content_chars"] = len(str(api_kwargs.get("content", "")))
@@ -236,28 +272,59 @@ def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINE
236272
except Exception:
237273
safe_log_kwargs["contents_count"] = None
238274
log.info("Google AI Embeddings call kwargs (sanitized): %s", safe_log_kwargs)
239-
240-
try:
241-
# Use embed_content for single text or batch embedding
242-
# CRITICAL FIX: Do not modify api_kwargs in place as it breaks backoff retries!
243-
call_kwargs = api_kwargs.copy()
244-
245-
if "content" in call_kwargs:
246-
# Single embedding
247-
response = genai.embed_content(**call_kwargs)
248-
elif "contents" in call_kwargs:
249-
# Batch embedding - Google AI supports batch natively
250-
contents = call_kwargs.pop("contents")
251-
# pass as 'content' argument which handles both single and batch in newer SDKs
252-
response = genai.embed_content(content=contents, **call_kwargs)
253-
else:
254-
raise ValueError(f"Either 'content' or 'contents' must be provided. Got kwargs: {list(api_kwargs.keys())}")
255-
256-
return response
257-
258-
except Exception as e:
259-
log.error(f"Error calling Google AI Embeddings API: {e}")
260-
raise
275+
276+
last_exception: Optional[Exception] = None
277+
278+
for attempt in range(self._MAX_RETRIES + 1):
279+
try:
280+
# CRITICAL FIX: Do not modify api_kwargs in place as it breaks retries!
281+
call_kwargs = api_kwargs.copy()
282+
283+
if "content" in call_kwargs:
284+
# Single embedding
285+
response = genai.embed_content(**call_kwargs)
286+
elif "contents" in call_kwargs:
287+
# Batch embedding - Google AI supports batch natively
288+
contents = call_kwargs.pop("contents")
289+
# pass as 'content' argument which handles both single and batch
290+
response = genai.embed_content(content=contents, **call_kwargs)
291+
else:
292+
raise ValueError(
293+
f"Either 'content' or 'contents' must be provided. "
294+
f"Got kwargs: {list(api_kwargs.keys())}"
295+
)
296+
297+
# Inter-batch cooldown to avoid burst-hitting rate limits
298+
if self._inter_batch_delay > 0:
299+
time.sleep(self._inter_batch_delay)
300+
301+
return response
302+
303+
except Exception as e:
304+
last_exception = e
305+
306+
if not self._is_retryable(e) or attempt >= self._MAX_RETRIES:
307+
log.error(
308+
"Google AI Embeddings API call failed (attempt %d/%d, non-retryable or max retries): %s",
309+
attempt + 1, self._MAX_RETRIES + 1, e,
310+
)
311+
raise
312+
313+
# Exponential backoff with jitter
314+
delay = min(self._BASE_DELAY * (2 ** attempt), self._MAX_DELAY)
315+
jitter = random.uniform(0, self._JITTER_MAX)
316+
sleep_time = delay + jitter
317+
318+
log.warning(
319+
"Google AI Embeddings API returned retryable error (attempt %d/%d): %s. "
320+
"Retrying in %.1fs ...",
321+
attempt + 1, self._MAX_RETRIES + 1, e, sleep_time,
322+
)
323+
time.sleep(sleep_time)
324+
325+
# Should not be reached, but just in case
326+
if last_exception:
327+
raise last_exception
261328

262329
async def acall(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINED):
263330
"""Async call to Google AI embedding API.

api/prompts.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -248,14 +248,21 @@
248248
- The JSON must be valid — if you are unsure, omit it rather than produce invalid JSON
249249
250250
Rules for "simplifiedMermaidSource" (the simplified overview diagram):
251-
- MUST contain a valid Mermaid diagram with MAXIMUM 5-8 nodes
251+
- MUST contain a valid Mermaid diagram with MAXIMUM 3-5 nodes showing major subsystems
252252
- Show ONLY the highest-level architectural components (think "executive summary")
253253
- Use clear, short labels (2-4 words each, e.g., "User Interface", "API Layer", "Database")
254254
- Use simple relationships WITHOUT detailed edge labels (just arrows, no labels)
255255
- Collapse related sub-components into a single node (e.g., merge "Auth Service" + "User Service" into "Backend Services")
256256
- The simplified diagram must be immediately understandable at a glance by a non-technical person
257257
- If the full diagram already has 8 or fewer nodes, simplifiedMermaidSource can match mermaidSource
258258
- Do NOT include implementation details, file names, or technical jargon in the simplified version
259+
260+
Diagram complexity guidelines:
261+
- For repositories with many files (100+), focus diagrams on architectural subsystems and module boundaries rather than individual files
262+
- Group related components into logical clusters. A good diagram has 5-8 nodes at the top level
263+
- Each node should represent a subsystem or module, not an individual file
264+
- Show data flow and dependencies between modules, not class hierarchies within a single module
265+
- Prefer fewer, clearer nodes over many detailed ones — users can click nodes for details
259266
</structured_diagram_data>
260267
"""
261268

api/rag.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -345,7 +345,8 @@ def _validate_and_filter_embeddings(self, documents: List) -> List:
345345

346346
def prepare_retriever(self, repo_url_or_path: str, type: str = "github", access_token: str = None,
347347
excluded_dirs: List[str] = None, excluded_files: List[str] = None,
348-
included_dirs: List[str] = None, included_files: List[str] = None):
348+
included_dirs: List[str] = None, included_files: List[str] = None,
349+
max_files: int = 0):
349350
"""
350351
Prepare the retriever for a repository.
351352
Will load database from local storage if available.
@@ -357,6 +358,7 @@ def prepare_retriever(self, repo_url_or_path: str, type: str = "github", access_
357358
excluded_files: Optional list of file patterns to exclude from processing
358359
included_dirs: Optional list of directories to include exclusively
359360
included_files: Optional list of file patterns to include exclusively
361+
max_files: Maximum number of files to process (0 = unlimited)
360362
"""
361363
self.initialize_db_manager()
362364
self.repo_url_or_path = repo_url_or_path
@@ -368,7 +370,8 @@ def prepare_retriever(self, repo_url_or_path: str, type: str = "github", access_
368370
excluded_dirs=excluded_dirs,
369371
excluded_files=excluded_files,
370372
included_dirs=included_dirs,
371-
included_files=included_files
373+
included_files=included_files,
374+
max_files=max_files,
372375
)
373376
logger.info(f"Loaded {len(self.transformed_docs)} documents for retrieval")
374377

api/websocket_wiki.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,7 @@ class ChatCompletionRequest(BaseModel):
6161
excluded_files: Optional[str] = Field(None, description="Comma-separated list of file patterns to exclude from processing")
6262
included_dirs: Optional[str] = Field(None, description="Comma-separated list of directories to include exclusively")
6363
included_files: Optional[str] = Field(None, description="Comma-separated list of file patterns to include exclusively")
64+
max_files: Optional[int] = Field(0, description="Maximum number of files to process for embedding (0 = unlimited)")
6465

6566
async def generate_with_retry(rag, query, context_docs, provider, model, language="en", max_retries=3):
6667
"""Generate content with retry and context reduction on failure.
@@ -178,8 +179,12 @@ async def handle_websocket_chat(websocket: WebSocket):
178179
included_files = [unquote(file_pattern) for file_pattern in request.included_files.split('\n') if file_pattern.strip()]
179180
logger.info(f"Using custom included files: {included_files}")
180181

182+
max_files = request.max_files or 0
183+
if max_files > 0:
184+
logger.info(f"Using max_files limit: {max_files}")
185+
181186
# Check for a cached RAG session (only when no custom file filters)
182-
has_custom_filters = any([excluded_dirs, excluded_files, included_dirs, included_files])
187+
has_custom_filters = any([excluded_dirs, excluded_files, included_dirs, included_files, max_files > 0])
183188
from api.config import get_embedder_type
184189
embedder_type = get_embedder_type()
185190
session_key = rag_session_manager.get_session_key(request.repo_url, embedder_type) if not has_custom_filters else None
@@ -195,7 +200,7 @@ async def handle_websocket_chat(websocket: WebSocket):
195200
else:
196201
# Create a new RAG instance
197202
request_rag = RAG(provider=request.provider, model=request.model)
198-
request_rag.prepare_retriever(request.repo_url, request.type, request.token, excluded_dirs, excluded_files, included_dirs, included_files)
203+
request_rag.prepare_retriever(request.repo_url, request.type, request.token, excluded_dirs, excluded_files, included_dirs, included_files, max_files=max_files)
199204
# Cache the session if no custom filters were used
200205
if session_key:
201206
rag_session_manager.put(session_key, request_rag)

0 commit comments

Comments (0)