39 commits
f75b46b
mit-salvage: reintroduce _graph edges accelerator + admin lifecycle p…
voarsh Feb 9, 2026
d4be80a
Admin: fix option to clear indexing caches
voarsh Feb 9, 2026
2e59455
fix(upload-client): stop dev-workspace recursion in dev-remote watch …
voarsh Feb 9, 2026
881f1f4
fix(indexer): harden graph edge backfill + align upload client ignore…
voarsh Feb 9, 2026
7961a72
chore: harden graph-edge ops, cache uploader excludes, and sync helpers
voarsh Feb 9, 2026
2476d6c
collection_admin: Fix missing logging
voarsh Feb 13, 2026
b43054b
Adds debug mode to repo search
voarsh Feb 13, 2026
843d79f
bridge: make MCP list timeouts configurable and gate OAuth metadata f…
voarsh Feb 13, 2026
2f14eff
fix(mcp): correct template dedupe uri source and clean debug field ha…
voarsh Feb 13, 2026
a8656de
vscode-ext: Adds bundled MCP bridge mode
voarsh Feb 13, 2026
6cecc26
Add back Claude Code workflow for GH
voarsh Feb 14, 2026
ec69b2b
Improves upload client and code search handling
voarsh Feb 14, 2026
ba3d336
Prompts for venv creation when auto-detection fails
voarsh Feb 14, 2026
a620125
Updates session defaults on ID change
voarsh Mar 2, 2026
7ed96d9
refactor(bridge): consolidate session defaults sync
voarsh Mar 6, 2026
036c677
fix(search): change `under` filter to recursive subtree scope
voarsh Mar 7, 2026
ba1e9c2
refactor(ingest): add async git history processing and structured log…
voarsh Mar 7, 2026
d30e1c4
fix(vscode-uploader): restore watch startup after successful auto for…
voarsh Mar 7, 2026
24b7c3f
refactor(ingest): improve logging practices and thread safety
voarsh Mar 7, 2026
fb560c1
fix(uploader): restore incremental sync cache and reduce Windows Pyth…
voarsh Mar 7, 2026
8c05f45
feat(upload): add hash-based deduplication and processing status trac…
voarsh Mar 7, 2026
366b6f4
feat(upload): cleanup ignored cached paths and prune empty directorie…
voarsh Mar 7, 2026
0a380b9
feat(upload): add interval-based empty dir sweep and fix force sync i…
voarsh Mar 7, 2026
168f22f
feat(upload): add plan/apply workflow for delta uploads
voarsh Mar 7, 2026
ca32c5a
fix(upload,watch): align cache state with confirmed uploads and trim …
voarsh Mar 8, 2026
673ad7e
fix(ingest,watch): tolerate line shifts and reduce redundant reproces…
voarsh Mar 8, 2026
6f243ec
feat(vscode): extend MCP bridge auto-start to support sse-remote mode
voarsh Mar 8, 2026
ca0c8b3
feat(watch,upload): add index journal for durable change tracking and…
voarsh Mar 9, 2026
c6fcf50
fix(core): improve pagination, upload reliability, and watch consistency
voarsh Mar 9, 2026
ecaf1c1
fix(code review): address critical and major issues from CodeRabbit
github-actions[bot] Mar 9, 2026
86f2212
fix(watch,consistency): improve error handling and retry logic
github-actions[bot] Mar 9, 2026
6bd58ea
fix(ingest,watch,upload): address CodeRabbit critical and major issues
github-actions[bot] Mar 9, 2026
99e9433
refactor(upload_service): extract duplicated collection resolution logic
github-actions[bot] Mar 9, 2026
37890c2
fix(upload,consistency): address CodeRabbit critical and major issues
github-actions[bot] Mar 9, 2026
37349d4
fix(watch,upload): restore internal path checks and async queued uplo…
voarsh Mar 9, 2026
984838d
fix(ingest,watch,upload): improve error handling and smart reindex fa…
voarsh Mar 9, 2026
b349a74
fix(ingest,upload,search): improve error handling and fix edge cases
voarsh Mar 9, 2026
ed627c7
fix(bridge): add retry logic for transient errors in MCP list operations
voarsh Mar 9, 2026
e161c14
ci(cosqa): add benchmark workflow and search matrix runner
voarsh Mar 9, 2026
38 changes: 35 additions & 3 deletions scripts/collection_admin.py
@@ -193,6 +193,7 @@ def delete_collection_everywhere(
     out: Dict[str, Any] = {
         "collection": name,
         "qdrant_deleted": False,
+        "qdrant_graph_deleted": False,
         "registry_marked_deleted": False,
         "deleted_state_files": 0,
         "deleted_managed_workspaces": 0,
@@ -209,6 +210,14 @@ def delete_collection_everywhere(
             out["qdrant_deleted"] = True
         except Exception:
             out["qdrant_deleted"] = False
+        # Best-effort: also delete companion graph edges collection when present.
+        # This branch stores file-level edges in `<collection>_graph`.
+        if not name.endswith("_graph"):
+            try:
+                cli.delete_collection(collection_name=f"{name}_graph")
+                out["qdrant_graph_deleted"] = True
+            except Exception:
+                out["qdrant_graph_deleted"] = False
     except Exception:
         out["qdrant_deleted"] = False

@@ -359,8 +368,10 @@ def _manual_copy_points() -> None:
         vectors_config = None
         sparse_vectors_config = None

+    # Support vector-less collections (e.g. payload-only graph edge collections).
     if vectors_config is None:
-        raise RuntimeError(f"Cannot determine vectors config for source collection {src}")
+        vectors_config = {}
+    vectorless = isinstance(vectors_config, dict) and not vectors_config

     try:
         cli.create_collection(
@@ -401,7 +412,7 @@ def _manual_copy_points() -> None:
                 limit=batch_limit,
                 offset=offset,
                 with_payload=True,
-                with_vectors=True,
+                with_vectors=(not vectorless),
             )
         except Exception as exc:
             raise RuntimeError(f"Failed to scroll points from {src}: {exc}") from exc
@@ -414,7 +425,9 @@ def _manual_copy_points() -> None:
             point_id = getattr(record, "id", None)
             payload = getattr(record, "payload", None)
             vector = None
-            if hasattr(record, "vector") and getattr(record, "vector") is not None:
+            if vectorless:
+                vector = {}
+            elif hasattr(record, "vector") and getattr(record, "vector") is not None:
                 vector = getattr(record, "vector")
             elif hasattr(record, "vectors") and getattr(record, "vectors") is not None:
                 vector = getattr(record, "vectors")
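
The vectorless handling above matters because a payload-only collection has no vectors to scroll, and upserting into one takes an empty vector mapping (exactly the `vector = {}` the diff assigns). As a standalone illustration only, not the repo's `_manual_copy_points`, a minimal sketch of such a copy loop with qdrant-client and hypothetical collection names:

from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct

def copy_payload_only(client: QdrantClient, src: str, dest: str, batch: int = 256) -> None:
    offset = None
    while True:
        records, offset = client.scroll(
            collection_name=src,
            limit=batch,
            offset=offset,
            with_payload=True,
            with_vectors=False,  # payload-only: nothing to fetch
        )
        if not records:
            break
        points = [
            # Empty named-vector mapping, as in the diff's vectorless branch.
            PointStruct(id=r.id, vector={}, payload=r.payload or {})
            for r in records
        ]
        client.upsert(collection_name=dest, points=points)
        if offset is None:
            break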
@@ -477,4 +490,23 @@ def _count_points(name: str) -> Optional[int]:
     # The manual path guarantees the destination gets the exact same points/payloads/vectors.
     _manual_copy_points()

+    # Best-effort: copy the companion graph collection when copying a base collection.
+    # Graph edges are derived data and can be rebuilt, but copying avoids a cold-start window
+    # during staging cutovers where the clone has no graph.
+    if not src.endswith("_graph") and not dest.endswith("_graph"):
+        try:
+            copy_collection_qdrant(
+                source=f"{src}_graph",
+                target=f"{dest}_graph",
+                qdrant_url=base_url,
+                overwrite=overwrite,
+            )
+        except Exception as exc:
+            logger.debug(
+                "Best-effort graph collection copy %s_graph -> %s_graph failed: %s",
+                src,
+                dest,
+                exc,
+            )
+
     return dest
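
Taken together, this file now treats `<name>_graph` as a companion whose lifecycle follows the base collection. A minimal standalone sketch of the delete half, assuming a reachable Qdrant instance and the qdrant-client package (the collection name is hypothetical):

from qdrant_client import QdrantClient

def delete_with_graph(client: QdrantClient, name: str) -> dict:
    # Mirrors the best-effort pattern above: the companion delete never
    # overwrites the primary delete's outcome.
    out = {"qdrant_deleted": False, "qdrant_graph_deleted": False}
    try:
        client.delete_collection(collection_name=name)
        out["qdrant_deleted"] = True
    except Exception:
        pass
    if not name.endswith("_graph"):
        try:
            client.delete_collection(collection_name=f"{name}_graph")
            out["qdrant_graph_deleted"] = True
        except Exception:
            pass
    return out

print(delete_with_graph(QdrantClient(url="http://localhost:6333"), "codebase"))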
31 changes: 25 additions & 6 deletions scripts/indexing_admin.py
@@ -927,6 +927,17 @@ def delete_collection_qdrant(*, qdrant_url: str, api_key: Optional[str], collect
         return
     try:
         cli.delete_collection(collection_name=name)
+        # Best-effort: also delete companion graph edges collection when present.
+        if not name.endswith("_graph"):
+            try:
+                cli.delete_collection(collection_name=f"{name}_graph")
+            except Exception as exc:
+                try:
+                    print(
+                        f"[indexing_admin] best-effort graph collection delete failed for {name}_graph: {exc}"
+                    )
+                except Exception:
+                    pass
     except Exception:
         pass
     finally:
@@ -951,6 +962,17 @@ def recreate_collection_qdrant(*, qdrant_url: str, api_key: Optional[str], colle
             cli.delete_collection(collection_name=name)
         except Exception as delete_error:
             raise RuntimeError(f"Failed to delete existing collection '{name}' in Qdrant: {delete_error}") from delete_error
+        # Best-effort: also delete companion graph edges collection when present.
+        if not name.endswith("_graph"):
+            try:
+                cli.delete_collection(collection_name=f"{name}_graph")
+            except Exception as exc:
+                try:
+                    print(
+                        f"[indexing_admin] best-effort graph collection delete failed for {name}_graph: {exc}"
+                    )
+                except Exception:
+                    pass
     finally:
         try:
             cli.close()
@@ -984,12 +1006,9 @@ def spawn_ingest_code(
             env.pop(k, None)
         else:
             env[str(k)] = str(v)
-    # When we provide env overrides for a run (e.g. staging rebuild), we also want to
-    # force ingest_code to honor the explicit COLLECTION_NAME instead of routing based
-    # on per-repo state/serving_collection in multi-repo mode.
-    # CTXCE_FORCE_COLLECTION_NAME is only used for these subprocess runs; normal watcher
-    # and indexer flows do not set it.
-    env["CTXCE_FORCE_COLLECTION_NAME"] = "1"  # Force ingest_code to use COLLECTION_NAME for staging/pending env overrides
+    # For admin-triggered subprocess runs (recreate/reindex/staging), force ingest_code to
+    # honor explicit COLLECTION_NAME and avoid multi-repo enumeration.
+    env["CTXCE_FORCE_COLLECTION_NAME"] = "1"
     env["COLLECTION_NAME"] = collection
     env["WATCH_ROOT"] = work_dir
     env["WORKSPACE_PATH"] = work_dir
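
The full body of spawn_ingest_code is not shown here; as a rough sketch of the env-pinning pattern it implements, with a hypothetical entry-point path:

import os
import subprocess

def spawn_reindex(collection: str, work_dir: str) -> subprocess.Popen:
    env = os.environ.copy()
    # Pin the child to one collection; CTXCE_FORCE_COLLECTION_NAME disables
    # multi-repo routing in ingest (see the cli.py change below).
    env["CTXCE_FORCE_COLLECTION_NAME"] = "1"
    env["COLLECTION_NAME"] = collection
    env["WATCH_ROOT"] = work_dir
    env["WORKSPACE_PATH"] = work_dir
    # "scripts/ingest_code.py" is an assumed entry point for illustration.
    return subprocess.Popen(["python", "scripts/ingest_code.py"], env=env, cwd=work_dir)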
29 changes: 27 additions & 2 deletions scripts/ingest/cli.py
@@ -15,6 +15,8 @@
     is_multi_repo_mode,
     get_collection_name,
 )
+from scripts import workspace_state as _ws
+from scripts.collection_health import clear_indexing_caches as _clear_indexing_caches_impl
 from scripts.ingest.pipeline import index_repo
 from scripts.ingest.pseudo import generate_pseudo_tags

@@ -40,6 +42,11 @@ def parse_args():
         action="store_true",
         help="Do not skip files whose content hash matches existing index",
     )
+    parser.add_argument(
+        "--clear-indexing-caches",
+        action="store_true",
+        help="Clear local indexing caches (file hash/symbol caches) before indexing",
+    )
     parser.add_argument(
         "--schema-mode",
         type=str,
@@ -186,13 +193,25 @@ def main():
         )
         return

+    def _clear_indexing_caches(workspace_root: Path, repo_name: str | None) -> None:
+        try:
+            _clear_indexing_caches_impl(str(workspace_root), repo_name=repo_name)
+        except Exception:
+            pass
+
     qdrant_url = os.environ.get("QDRANT_URL", "http://localhost:6333")
     api_key = os.environ.get("QDRANT_API_KEY")
     collection = os.environ.get("COLLECTION_NAME") or os.environ.get("DEFAULT_COLLECTION") or "codebase"
     model_name = os.environ.get("EMBEDDING_MODEL", "BAAI/bge-base-en-v1.5")

     # Resolve collection name based on multi-repo mode
-    multi_repo = bool(is_multi_repo_mode and is_multi_repo_mode())
+    force_collection = (os.environ.get("CTXCE_FORCE_COLLECTION_NAME") or "").strip().lower() in {
+        "1",
+        "true",
+        "yes",
+        "on",
+    }
+    multi_repo = bool(is_multi_repo_mode and is_multi_repo_mode()) and not force_collection
     if multi_repo:
         print("[multi_repo] Multi-repo mode enabled - will create separate collections per repository")

@@ -231,6 +250,9 @@ def main():
         if not repo_collection:
             repo_collection = "codebase"

+        if args.clear_indexing_caches:
+            _clear_indexing_caches(root_path, repo_name)
+
         index_repo(
             repo_root,
             qdrant_url,
@@ -249,7 +271,7 @@ def main():
         try:
             resolved = get_collection_name(str(Path(args.root).resolve()))
             placeholders = {"", "default-collection", "my-collection", "codebase"}
-            if resolved and collection in placeholders:
+            if resolved and collection in placeholders and not force_collection:
                 collection = resolved
         except Exception:
             pass
@@ -260,6 +282,9 @@ def main():
         flag = (os.environ.get("PSEUDO_DEFER_TO_WORKER") or "").strip().lower()
         pseudo_mode = "off" if flag in {"1", "true", "yes", "on"} else "full"

+        if args.clear_indexing_caches:
+            _clear_indexing_caches(Path(args.root).resolve(), None)
+
         index_repo(
             Path(args.root).resolve(),
             qdrant_url,
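
The truthy-flag parsing this file adds ("1", "true", "yes", "on") is the same idiom it already uses for PSEUDO_DEFER_TO_WORKER; factored out as a small sketch:

import os

def env_flag(name: str) -> bool:
    # Same truthy spellings the diff accepts for CTXCE_FORCE_COLLECTION_NAME
    # and PSEUDO_DEFER_TO_WORKER.
    return (os.environ.get(name) or "").strip().lower() in {"1", "true", "yes", "on"}

force_collection = env_flag("CTXCE_FORCE_COLLECTION_NAME")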