@@ -209,18 +209,29 @@ class ASTAnalyzer:
209209 - Dependency tracking
210210 - Semantic chunking (preserve boundaries)
211211 - Cross-reference analysis
212+ - Tree cache for parsed ASTs (avoids re-parsing unchanged files)
212213 """
213214
214- def __init__ (self , use_tree_sitter : bool = True ):
215+ def __init__ (self , use_tree_sitter : bool = True , use_tree_cache : bool = True ):
215216 """
216217 Initialize AST analyzer.
217218
218219 Args:
219220 use_tree_sitter: Use tree-sitter when available (fallback to ast module)
221+ use_tree_cache: Cache parsed trees for unchanged files (mtime-based invalidation)
220222 """
221223 self .use_tree_sitter = use_tree_sitter and _TS_AVAILABLE
222224 self ._parsers : Dict [str , Any ] = {}
223225
226+ # Tree cache for avoiding re-parsing unchanged files
227+ self ._tree_cache = None
228+ if use_tree_cache :
229+ try :
230+ from scripts .ingest .tree_cache import get_default_cache
231+ self ._tree_cache = get_default_cache ()
232+ except ImportError :
233+ logger .debug ("TreeCache not available, parsing will not be cached" )
234+
224235 # Language support matrix
225236 self .supported_languages = {
226237 "python" : {"ast" : True , "tree_sitter" : True },
@@ -234,7 +245,49 @@ def __init__(self, use_tree_sitter: bool = True):
234245 "ruby" : {"ast" : False , "tree_sitter" : True },
235246 }
236247
237- logger .info (f"ASTAnalyzer initialized: tree_sitter={ self .use_tree_sitter } " )
248+ logger .info (f"ASTAnalyzer initialized: tree_sitter={ self .use_tree_sitter } , tree_cache={ 'enabled' if self ._tree_cache else 'disabled' } " )
249+
def _parse_with_cache(
    self,
    parser: Any,
    content: str,
    file_path: str,
    language: str,
    content_provided: bool = False,
) -> Optional[Any]:
    """Parse content with tree-sitter, using the tree cache only when it is safe.

    The cache is keyed by file path, so an entry is only meaningful when it
    was produced from the file's on-disk content. When the caller supplied
    ``content`` explicitly, it may differ from what is on disk, so the cache
    is neither read from nor written to. Writing in that case would store a
    tree under the file's path that a later disk-based call could get back.

    Args:
        parser: Tree-sitter parser instance (anything exposing ``parse(bytes)``).
        content: Source code content to parse.
        file_path: Path to the file (used as cache key); may be empty.
        language: Programming language (used only in the failure log message).
        content_provided: If True, content was explicitly provided (not read
            from disk), so the cache is bypassed for both lookup and storage.

    Returns:
        Parsed tree, or None if the parser raised.
    """
    path = Path(file_path) if file_path else None

    # One eligibility check governs both lookup and storage so the two can
    # never disagree. The cache is compared against None explicitly so that
    # an empty cache object defining __len__ is not mistaken for "disabled".
    cacheable = (
        self._tree_cache is not None
        and path is not None
        and not content_provided
        and path.exists()
    )

    if cacheable:
        cached_tree = self._tree_cache.get(path)
        if cached_tree is not None:
            return cached_tree

    # Cache miss or cache bypassed: parse the content directly.
    try:
        tree = parser.parse(content.encode("utf-8"))
    except Exception as e:
        # Best-effort: a parse failure is reported to the caller as None.
        logger.debug(f"Tree-sitter parse failed for {language}: {e}")
        return None

    # Only trees parsed from on-disk content are stored under the file's path.
    if cacheable and tree is not None:
        self._tree_cache.put(path, tree)

    return tree
285+
def get_tree_cache_stats(self) -> Dict[str, Any]:
    """Return monitoring statistics for the tree cache.

    Returns:
        Whatever the cache's ``get_stats()`` reports, or
        ``{"enabled": False}`` when no usable cache is attached.
    """
    cache = self._tree_cache
    if not cache:
        return {"enabled": False}
    return cache.get_stats()
238291
239292 def analyze_file (
240293 self , file_path : str , language : str , content : Optional [str ] = None
@@ -250,6 +303,10 @@ def analyze_file(
250303 Returns:
251304 Dict with symbols, imports, calls, and dependencies
252305 """
306+ # Track if content was explicitly provided (vs read from disk)
307+ # This affects caching - explicit content may differ from on-disk state
308+ content_provided = content is not None
309+
253310 if content is None :
254311 try :
255312 content = Path (file_path ).read_text (encoding = "utf-8" , errors = "ignore" )
@@ -259,7 +316,7 @@ def analyze_file(
259316
260317 # Use language mappings (32 languages, declarative queries)
261318 if _LANGUAGE_MAPPINGS_AVAILABLE and self .use_tree_sitter :
262- result = self ._analyze_with_mapping (content , file_path , language )
319+ result = self ._analyze_with_mapping (content , file_path , language , content_provided )
263320 if result and (result .get ("symbols" ) or result .get ("imports" ) or result .get ("calls" )):
264321 return result
265322
@@ -438,11 +495,17 @@ def extract_dependencies(
438495
439496 # ---- Language Mappings Analysis (unified, concept-based) ----
440497
441- def _analyze_with_mapping (self , content : str , file_path : str , language : str ) -> Dict [str , Any ]:
498+ def _analyze_with_mapping (self , content : str , file_path : str , language : str , content_provided : bool = False ) -> Dict [str , Any ]:
442499 """Analyze code using language mappings (concept-based extraction).
443500
444501 This uses the declarative tree-sitter queries from language_mappings
445502 to extract symbols, imports, and calls. Supports 34 languages.
503+
504+ Args:
505+ content: Source code content
506+ file_path: Path to the file
507+ language: Programming language
508+ content_provided: If True, content was explicitly provided (not read from disk)
446509 """
447510 if not _LANGUAGE_MAPPINGS_AVAILABLE :
448511 return self ._empty_analysis ()
@@ -461,12 +524,12 @@ def _analyze_with_mapping(self, content: str, file_path: str, language: str) ->
461524 if not parser :
462525 return self ._empty_analysis ()
463526
464- try :
465- tree = parser .parse (content .encode ("utf-8" ))
466- root = tree .root_node
467- except Exception as e :
468- logger .debug (f"Tree-sitter parse failed for { language } : { e } " )
527+ # Parse with caching (avoids re-parsing unchanged files)
528+ # Skip cache if content was explicitly provided to avoid stale results
529+ tree = self ._parse_with_cache (parser , content , file_path , language , content_provided )
530+ if tree is None :
469531 return self ._empty_analysis ()
532+ root = tree .root_node
470533
471534 content_bytes = content .encode ("utf-8" )
472535 symbols : List [CodeSymbol ] = []
0 commit comments