Skip to content

Commit 97a8a87

Browse files
committed
Improve ASTAnalyzer cache logic and add ingest tests
Refines ASTAnalyzer to avoid returning stale trees when content is provided in-memory, by tracking content provenance and skipping cache in such cases. Enhances flush_upserts in Qdrant ingest to clarify consistency semantics and add a minimal scroll for better write visibility. Adds comprehensive tests for chunk deduplication, ingest infrastructure, and tree cache integration to ensure correctness and robustness.
1 parent e08a6f3 commit 97a8a87

5 files changed

Lines changed: 1840 additions & 10 deletions

File tree

scripts/ast_analyzer.py

Lines changed: 20 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -247,22 +247,25 @@ def __init__(self, use_tree_sitter: bool = True, use_tree_cache: bool = True):
247247

248248
logger.info(f"ASTAnalyzer initialized: tree_sitter={self.use_tree_sitter}, tree_cache={'enabled' if self._tree_cache else 'disabled'}")
249249

250-
def _parse_with_cache(self, parser: Any, content: str, file_path: str, language: str) -> Optional[Any]:
250+
def _parse_with_cache(self, parser: Any, content: str, file_path: str, language: str, content_provided: bool = False) -> Optional[Any]:
251251
"""Parse content with tree-sitter, using cache when available.
252252
253253
Args:
254254
parser: Tree-sitter parser instance
255255
content: Source code content
256256
file_path: Path to the file (used as cache key)
257257
language: Programming language
258+
content_provided: If True, content was explicitly provided (not read from disk),
259+
so skip cache to avoid returning stale tree
258260
259261
Returns:
260262
Parsed tree or None on failure
261263
"""
262264
path = Path(file_path) if file_path else None
263265

264-
# Try to get cached tree (only for real files, not in-memory content)
265-
if self._tree_cache and path and path.exists():
266+
# Try to get cached tree (only for real files when content was NOT explicitly provided)
267+
# If content_provided=True, the caller passed in-memory content that may differ from disk
268+
if self._tree_cache and path and path.exists() and not content_provided:
266269
cached_tree = self._tree_cache.get(path)
267270
if cached_tree is not None:
268271
return cached_tree
@@ -300,6 +303,10 @@ def analyze_file(
300303
Returns:
301304
Dict with symbols, imports, calls, and dependencies
302305
"""
306+
# Track if content was explicitly provided (vs read from disk)
307+
# This affects caching - explicit content may differ from on-disk state
308+
content_provided = content is not None
309+
303310
if content is None:
304311
try:
305312
content = Path(file_path).read_text(encoding="utf-8", errors="ignore")
@@ -309,7 +316,7 @@ def analyze_file(
309316

310317
# Use language mappings (32 languages, declarative queries)
311318
if _LANGUAGE_MAPPINGS_AVAILABLE and self.use_tree_sitter:
312-
result = self._analyze_with_mapping(content, file_path, language)
319+
result = self._analyze_with_mapping(content, file_path, language, content_provided)
313320
if result and (result.get("symbols") or result.get("imports") or result.get("calls")):
314321
return result
315322

@@ -488,11 +495,17 @@ def extract_dependencies(
488495

489496
# ---- Language Mappings Analysis (unified, concept-based) ----
490497

491-
def _analyze_with_mapping(self, content: str, file_path: str, language: str) -> Dict[str, Any]:
498+
def _analyze_with_mapping(self, content: str, file_path: str, language: str, content_provided: bool = False) -> Dict[str, Any]:
492499
"""Analyze code using language mappings (concept-based extraction).
493500
494501
This uses the declarative tree-sitter queries from language_mappings
495502
to extract symbols, imports, and calls. Supports 34 languages.
503+
504+
Args:
505+
content: Source code content
506+
file_path: Path to the file
507+
language: Programming language
508+
content_provided: If True, content was explicitly provided (not read from disk)
496509
"""
497510
if not _LANGUAGE_MAPPINGS_AVAILABLE:
498511
return self._empty_analysis()
@@ -512,7 +525,8 @@ def _analyze_with_mapping(self, content: str, file_path: str, language: str) ->
512525
return self._empty_analysis()
513526

514527
# Parse with caching (avoids re-parsing unchanged files)
515-
tree = self._parse_with_cache(parser, content, file_path, language)
528+
# Skip cache if content was explicitly provided to avoid stale results
529+
tree = self._parse_with_cache(parser, content, file_path, language, content_provided)
516530
if tree is None:
517531
return self._empty_analysis()
518532
root = tree.root_node

scripts/ingest/qdrant.py

Lines changed: 28 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -931,16 +931,40 @@ def upsert_points(
931931

932932

933933
def flush_upserts(client: QdrantClient, collection: str) -> None:
934-
"""Ensure all pending async upserts are committed.
934+
"""Best-effort sync for pending async upserts.
935935
936-
Call this after a batch of async upserts to ensure data is persisted
937-
before reading or querying.
936+
Call this after a batch of async upserts (INDEX_UPSERT_ASYNC=1) to improve
937+
likelihood that data is visible for subsequent reads.
938+
939+
IMPORTANT: Qdrant's wait=False semantics mean upserts are "confirmed received"
940+
but not necessarily "applied". This function performs operations that encourage
941+
the server to process pending writes, but cannot guarantee immediate consistency.
942+
943+
For strict consistency requirements:
944+
- Use wait=True (INDEX_UPSERT_ASYNC=0) during upserts, or
945+
- Add application-level retry logic for read-after-write scenarios
946+
947+
For remote deployments, network latency may increase the window between
948+
upsert confirmation and data visibility.
949+
950+
Args:
951+
client: Qdrant client instance
952+
collection: Collection name
938953
"""
939954
if not collection:
940955
return
941956
try:
942-
# Force a sync operation to ensure all pending writes are flushed
957+
# 1. Get collection info (lightweight metadata read)
943958
client.get_collection(collection)
959+
960+
# 2. Perform a minimal scroll to encourage segment processing
961+
# This touches actual data, which helps flush pending writes
962+
client.scroll(
963+
collection_name=collection,
964+
limit=1,
965+
with_payload=False,
966+
with_vectors=False,
967+
)
944968
except Exception as e:
945969
logger.debug(f"flush_upserts: {e}")
946970

0 commit comments

Comments (0)