 # Maximum token limit for OpenAI embedding models
 MAX_EMBEDDING_TOKENS = 8192
 
+# Cached tiktoken encoders to avoid re-creating on every count_tokens call
+_tiktoken_encoder_cache = {}
+
+def _get_encoder(model_name=None):
+    """Get or create a cached tiktoken encoder."""
+    key = model_name or "default"
+    if key not in _tiktoken_encoder_cache:
+        if model_name:
+            try:
+                _tiktoken_encoder_cache[key] = tiktoken.encoding_for_model(model_name)
+            except KeyError:
+                _tiktoken_encoder_cache[key] = tiktoken.get_encoding("cl100k_base")
+        else:
+            _tiktoken_encoder_cache[key] = tiktoken.get_encoding("cl100k_base")
+    return _tiktoken_encoder_cache[key]
+
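The cache turns every call after the first into a plain dict lookup instead of re-parsing the BPE vocabulary on each `count_tokens` invocation. A minimal sketch of the effect, assuming `tiktoken` is installed; the text and iteration count are illustrative only:

```python
import time

text = "def hello():\n    return 'world'\n" * 100

# The first call builds and caches the encoder; the remaining 999 reuse it.
start = time.perf_counter()
for _ in range(1000):
    _get_encoder("text-embedding-3-small").encode(text)
print(f"1000 cached encodes took {time.perf_counter() - start:.2f}s")
```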
 def count_tokens(text: str, embedder_type: str = None, is_ollama_embedder: bool = None) -> int:
     """
     Count the number of tokens in a text string using tiktoken.
@@ -42,25 +58,25 @@ def count_tokens(text: str, embedder_type: str = None, is_ollama_embedder: bool
         # Handle backward compatibility
         if embedder_type is None and is_ollama_embedder is not None:
             embedder_type = 'ollama' if is_ollama_embedder else None
-
+
         # Determine embedder type if not specified
         if embedder_type is None:
             from api.config import get_embedder_type
             embedder_type = get_embedder_type()
 
-        # Choose encoding based on embedder type
+        # Choose encoding based on embedder type (using cached encoders)
         if embedder_type == 'ollama':
             # Ollama typically uses cl100k_base encoding
-            encoding = tiktoken.get_encoding("cl100k_base")
+            encoding = _get_encoder()
         elif embedder_type == 'google':
             # Google uses similar tokenization to GPT models for rough estimation
-            encoding = tiktoken.get_encoding("cl100k_base")
+            encoding = _get_encoder()
         elif embedder_type == 'bedrock':
             # Bedrock embedding models vary; use a common GPT-like encoding for rough estimation
-            encoding = tiktoken.get_encoding("cl100k_base")
+            encoding = _get_encoder()
         else:  # OpenAI or default
             # Use OpenAI embedding model encoding
-            encoding = tiktoken.encoding_for_model("text-embedding-3-small")
+            encoding = _get_encoder("text-embedding-3-small")
 
         return len(encoding.encode(text))
     except Exception as e:
@@ -69,6 +85,67 @@ def count_tokens(text: str, embedder_type: str = None, is_ollama_embedder: bool
         # Rough approximation: 4 characters per token
         return len(text) // 4
 
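The `len(text) // 4` fallback is the usual rule of thumb of roughly four characters per token for English text. A quick, purely illustrative sanity check against a real tiktoken count:

```python
sample = "The quick brown fox jumps over the lazy dog. " * 50
approx = len(sample) // 4
exact = len(_get_encoder().encode(sample))
print(f"approx={approx}, exact={exact}")  # the two typically land in the same ballpark
```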
+# --- Large repo optimization constants ---
+
+# Additional directories to exclude for large repos (merged with DEFAULT_EXCLUDED_DIRS)
+ADDITIONAL_EXCLUDED_DIRS = {
+    'vendor', 'third_party', 'external', 'deps',
+    '.next', '.nuxt', '.svelte-kit', '.output',
+    'generated', 'auto_generated', 'codegen',
+    'fixtures', 'testdata', '__snapshots__',
+    'migrations', 'dist', 'build', 'out',
+    '.cache', '.tmp', '.temp',
+    'coverage', '.nyc_output',
+    'bower_components', 'jspm_packages',
+}
+
+# File size limits: skip oversized files that are unlikely to be useful
+MAX_CODE_FILE_SIZE = 100_000  # 100KB - skip very large source files
+MAX_DOC_FILE_SIZE = 50_000  # 50KB - skip very large docs
+
+# Total ingestion token budget: stop reading files once this is exhausted
+MAX_TOTAL_INGESTION_TOKENS = 2_000_000  # 2M token budget
+
+
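For scale, some back-of-envelope arithmetic on what the 2M budget holds; the per-file figures below are assumptions, not measurements:

```python
# Worst case: every file sits at the per-file code ceiling (10x MAX_EMBEDDING_TOKENS).
worst_case = MAX_TOTAL_INGESTION_TOKENS // (MAX_EMBEDDING_TOKENS * 10)  # 24 files
# More typical: a few hundred tokens per source file.
typical = MAX_TOTAL_INGESTION_TOKENS // 500                             # 4000 files
print(worst_case, typical)
```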
+def _is_binary(file_path: str) -> bool:
+    """Check if file appears to be binary by looking for null bytes."""
+    try:
+        with open(file_path, 'rb') as f:
+            chunk = f.read(512)
+            return b'\x00' in chunk
+    except (IOError, OSError):
+        return True
+
+
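Sniffing the first 512 bytes for a null byte is the same heuristic `grep` uses to classify files as binary; note it will also flag UTF-16 text, which encodes ASCII with interleaved null bytes. A throwaway demonstration with invented file contents:

```python
import os
import tempfile

with tempfile.NamedTemporaryFile(suffix=".py", delete=False) as f:
    f.write(b"print('hello')\n")         # plain UTF-8 source
    text_path = f.name
with tempfile.NamedTemporaryFile(suffix=".bin", delete=False) as f:
    f.write(b"\x7fELF\x00\x00\x01\x00")  # binary-looking header
    bin_path = f.name

print(_is_binary(text_path))  # False
print(_is_binary(bin_path))   # True
os.unlink(text_path)
os.unlink(bin_path)
```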
+def _file_priority(file_path: str) -> int:
+    """
+    Assign a priority score to a file path for processing order.
+    Lower number = higher priority.
+    """
+    name = os.path.basename(file_path).lower()
+    # Prepend a slash so top-level dirs like "src/..." also match the
+    # "/src/"-style patterns below (relpath() yields no leading slash).
+    rel_path = '/' + file_path.lower().lstrip('/')
+
+    # Root docs are highest priority
+    if name in ('readme.md', 'readme.rst', 'readme.txt', 'readme'):
+        return 0
+    if name in ('contributing.md', 'architecture.md', 'changelog.md'):
+        return 1
+    # Config files
+    if name in ('package.json', 'cargo.toml', 'pyproject.toml', 'go.mod',
+                'pom.xml', 'build.gradle'):
+        return 2
+    # Source directories
+    if any(d in rel_path for d in ('/src/', '/lib/', '/app/', '/api/', '/pkg/', '/internal/')):
+        return 3
+    # Documentation
+    if any(d in rel_path for d in ('/docs/', '/doc/', '/documentation/')):
+        return 4
+    # Test files
+    if any(d in rel_path for d in ('/test/', '/tests/', '/spec/', '/__tests__/')):
+        return 6
+    return 5
+
+
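How the scores play out on a handful of invented repo-relative paths, lowest score first:

```python
paths = [
    "tests/test_api.py",  # test directory -> 6
    "docs/guide.md",      # documentation  -> 4
    "src/main.py",        # source tree    -> 3
    "pyproject.toml",     # config file    -> 2
    "README.md",          # root doc       -> 0
]
print(sorted(paths, key=_file_priority))
# ['README.md', 'pyproject.toml', 'src/main.py', 'docs/guide.md', 'tests/test_api.py']
```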
 def download_repo(repo_url: str, local_path: str, repo_type: str = None, access_token: str = None) -> str:
     """
     Downloads a Git repository (GitHub, GitLab, or Bitbucket) to a specified local path.
@@ -125,7 +202,7 @@ def download_repo(repo_url: str, local_path: str, repo_type: str = None, access_
     logger.info(f"Cloning repository from {repo_url} to {local_path}")
     # We use repo_url in the log to avoid exposing the token in logs
     result = subprocess.run(
-        ["git", "clone", "--depth=1", "--single-branch", clone_url, local_path],
+        ["git", "clone", "--depth=1", "--single-branch", "--filter=blob:limit=1m", clone_url, local_path],
         check=True,
         stdout=subprocess.PIPE,
         stderr=subprocess.PIPE,
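`--filter=blob:limit=1m` makes this a partial clone that skips blobs over 1 MiB, which pairs well with the file-size caps above since those files would be discarded anyway. Partial clone needs server-side support; the major hosts provide it, but older self-hosted remotes may not. A hypothetical hardening sketch, assuming a retry policy that is not part of this change:

```python
import subprocess

def _clone_shallow(clone_url: str, local_path: str) -> None:
    base = ["git", "clone", "--depth=1", "--single-branch"]
    try:
        subprocess.run(base + ["--filter=blob:limit=1m", clone_url, local_path],
                       check=True, capture_output=True)
    except subprocess.CalledProcessError:
        # Remotes without partial-clone support usually just warn and ignore
        # the filter, but if the clone does fail, retry without it.
        subprocess.run(base + [clone_url, local_path],
                       check=True, capture_output=True)
```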
@@ -205,6 +282,9 @@ def read_all_documents(path: str, embedder_type: str = None, is_ollama_embedder:
     final_excluded_dirs = set(DEFAULT_EXCLUDED_DIRS)
     final_excluded_files = set(DEFAULT_EXCLUDED_FILES)
 
+    # Merge in the additional excluded dirs for large-repo optimization
+    final_excluded_dirs.update(ADDITIONAL_EXCLUDED_DIRS)
+
     # Add any additional excluded directories from config
     if "file_filters" in configs and "excluded_dirs" in configs["file_filters"]:
         final_excluded_dirs.update(configs["file_filters"]["excluded_dirs"])
@@ -301,82 +381,103 @@ def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List
 
         return not is_excluded
 
-    # Process code files first
-    for ext in code_extensions:
+    # Collect all candidate files first, then sort by priority before processing
+    all_extensions = set(code_extensions + doc_extensions)
+    code_ext_set = set(code_extensions)
+
+    candidate_files = []
+    for ext in all_extensions:
         files = glob.glob(f"{path}/**/*{ext}", recursive=True)
         for file_path in files:
             # Check if file should be processed based on inclusion/exclusion rules
             if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
                 continue
+            candidate_files.append(file_path)
+
+    # Sort by priority so the most important files are processed first
+    candidate_files.sort(key=lambda fp: _file_priority(os.path.relpath(fp, path)))
+    logger.info(f"Found {len(candidate_files)} candidate files after filtering (sorted by priority)")
 
+    # Process files with cumulative token budget enforcement
+    cumulative_tokens = 0
+
+    for file_path in candidate_files:
+        ext = os.path.splitext(file_path)[1].lower()
+        is_code = ext in code_ext_set
+        relative_path = os.path.relpath(file_path, path)
+
+        try:
+            # --- File size filter ---
             try:
-                with open(file_path, "r", encoding="utf-8") as f:
-                    content = f.read()
-                relative_path = os.path.relpath(file_path, path)
-
-                # Determine if this is an implementation file
-                is_implementation = (
-                    not relative_path.startswith("test_")
-                    and not relative_path.startswith("app_")
-                    and "test" not in relative_path.lower()
-                )
+                file_size = os.path.getsize(file_path)
+            except OSError:
+                continue
+            size_limit = MAX_CODE_FILE_SIZE if is_code else MAX_DOC_FILE_SIZE
+            if file_size > size_limit:
+                logger.debug(f"Skipping oversized file {relative_path}: {file_size} bytes > {size_limit}")
+                continue
 
-                # Check token count
-                token_count = count_tokens(content, embedder_type)
-                if token_count > MAX_EMBEDDING_TOKENS * 10:
-                    logger.warning(f"Skipping large file {relative_path}: Token count ({token_count}) exceeds limit")
-                    continue
-
-                doc = Document(
-                    text=content,
-                    meta_data={
-                        "file_path": relative_path,
-                        "type": ext[1:],
-                        "is_code": True,
-                        "is_implementation": is_implementation,
-                        "title": relative_path,
-                        "token_count": token_count,
-                    },
-                )
-                documents.append(doc)
-            except Exception as e:
-                logger.error(f"Error reading {file_path}: {e}")
+            # --- Binary file detection ---
+            if _is_binary(file_path):
+                logger.debug(f"Skipping binary file {relative_path}")
+                continue
 
-    # Then process documentation files
-    for ext in doc_extensions:
-        files = glob.glob(f"{path}/**/*{ext}", recursive=True)
-        for file_path in files:
-            # Check if file should be processed based on inclusion/exclusion rules
-            if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
+            with open(file_path, "r", encoding="utf-8") as f:
+                content = f.read()
+
+            # --- Per-file token limit (existing logic preserved) ---
+            token_count = count_tokens(content, embedder_type)
+            max_tokens_for_file = MAX_EMBEDDING_TOKENS * 10 if is_code else MAX_EMBEDDING_TOKENS
+            if token_count > max_tokens_for_file:
+                logger.warning(f"Skipping large file {relative_path}: Token count ({token_count}) exceeds limit")
                 continue
 
-            try:
-                with open(file_path, "r", encoding="utf-8") as f:
-                    content = f.read()
-                relative_path = os.path.relpath(file_path, path)
-
-                # Check token count
-                token_count = count_tokens(content, embedder_type)
-                if token_count > MAX_EMBEDDING_TOKENS:
-                    logger.warning(f"Skipping large file {relative_path}: Token count ({token_count}) exceeds limit")
-                    continue
-
-                doc = Document(
-                    text=content,
-                    meta_data={
-                        "file_path": relative_path,
-                        "type": ext[1:],
-                        "is_code": False,
-                        "is_implementation": False,
-                        "title": relative_path,
-                        "token_count": token_count,
-                    },
-                )
-                documents.append(doc)
-            except Exception as e:
-                logger.error(f"Error reading {file_path}: {e}")
+            # --- Cumulative token budget ---
+            cumulative_tokens += token_count
+            if cumulative_tokens > MAX_TOTAL_INGESTION_TOKENS:
+                logger.warning(
+                    f"Token budget exhausted ({cumulative_tokens} > {MAX_TOTAL_INGESTION_TOKENS}). "
+                    f"Stopping file ingestion at {relative_path}. {len(documents)} documents collected so far."
+                )
+                break
+
+            if is_code:
+                # Determine if this is an implementation file
+                is_implementation = (
+                    not relative_path.startswith("test_")
+                    and not relative_path.startswith("app_")
+                    and "test" not in relative_path.lower()
+                )
+
+                doc = Document(
+                    text=content,
+                    meta_data={
+                        "file_path": relative_path,
+                        "type": ext[1:],
+                        "is_code": True,
+                        "is_implementation": is_implementation,
+                        "title": relative_path,
+                        "token_count": token_count,
+                    },
+                )
+            else:
+                doc = Document(
+                    text=content,
+                    meta_data={
+                        "file_path": relative_path,
+                        "type": ext[1:],
+                        "is_code": False,
+                        "is_implementation": False,
+                        "title": relative_path,
+                        "token_count": token_count,
+                    },
+                )
+
+            documents.append(doc)
+        except Exception as e:
+            logger.error(f"Error reading {file_path}: {e}")
 
-    logger.info(f"Found {len(documents)} documents")
+    logger.info(f"Found {len(documents)} documents (cumulative tokens: {cumulative_tokens})")
     return documents
 
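End to end, the pieces above compose roughly as follows; the URL, paths, and embedder value are placeholders:

```python
download_repo("https://github.com/example/repo", "/tmp/example-repo")
docs = read_all_documents("/tmp/example-repo", embedder_type="openai")
for d in docs[:3]:
    print(d.meta_data["file_path"], d.meta_data["token_count"])
```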
 def prepare_data_pipeline(embedder_type: str = None, is_ollama_embedder: bool = None):