Skip to content

Commit 8f8d017

Browse files
committed
perf(sync): batch file indexing in core
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent 8c81d3c commit 8f8d017

14 files changed

Lines changed: 1850 additions & 66 deletions

src/basic_memory/cli/commands/db.py

Lines changed: 71 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Database management commands."""
22

3+
from dataclasses import dataclass
34
from pathlib import Path
45

56
import typer
@@ -12,13 +13,47 @@
1213
from basic_memory.cli.app import app
1314
from basic_memory.cli.commands.command_utils import run_with_cleanup
1415
from basic_memory.config import ConfigManager, ProjectMode
16+
from basic_memory.indexing import IndexProgress
1517
from basic_memory.repository import ProjectRepository
1618
from basic_memory.services.initialization import reconcile_projects_with_config
1719
from basic_memory.sync.sync_service import get_sync_service
1820

1921
console = Console()
2022

2123

24+
@dataclass(slots=True)
25+
class EmbeddingProgress:
26+
"""Typed CLI progress payload for embedding backfills."""
27+
28+
entity_id: int
29+
index: int
30+
total: int
31+
32+
33+
def _format_eta(seconds: float | None) -> str:
34+
"""Render a compact ETA string for CLI progress descriptions."""
35+
if seconds is None:
36+
return "--:--"
37+
38+
whole_seconds = max(int(seconds), 0)
39+
minutes, remaining_seconds = divmod(whole_seconds, 60)
40+
hours, remaining_minutes = divmod(minutes, 60)
41+
if hours:
42+
return f"{hours:d}:{remaining_minutes:02d}:{remaining_seconds:02d}"
43+
return f"{remaining_minutes:02d}:{remaining_seconds:02d}"
44+
45+
46+
def _format_index_progress(progress: IndexProgress) -> str:
47+
"""Render typed index progress as a compact Rich task description."""
48+
files_per_minute = int(progress.files_per_minute) if progress.files_per_minute else 0
49+
return (
50+
" Indexing files... "
51+
f"{progress.files_processed}/{progress.files_total} files | "
52+
f"{progress.batches_completed}/{progress.batches_total} batches | "
53+
f"{files_per_minute}/min | ETA {_format_eta(progress.eta_seconds)}"
54+
)
55+
56+
2257
async def _reindex_projects(app_config):
2358
"""Reindex all projects in a single async context.
2459
@@ -185,10 +220,34 @@ async def _reindex(app_config, search: bool, embeddings: bool, project: str | No
185220
console.print(f"\n[bold]Project: [cyan]{proj.name}[/cyan][/bold]")
186221

187222
if search:
188-
console.print(" Rebuilding full-text search index...")
189223
sync_service = await get_sync_service(proj)
190224
sync_dir = Path(proj.path)
191-
await sync_service.sync(sync_dir, project_name=proj.name)
225+
with Progress(
226+
SpinnerColumn(),
227+
TextColumn("[progress.description]{task.description}"),
228+
BarColumn(),
229+
TaskProgressColumn(),
230+
console=console,
231+
) as progress:
232+
task = progress.add_task(" Indexing files... scanning changes", total=1)
233+
234+
async def on_index_progress(update: IndexProgress) -> None:
235+
total = update.files_total or 1
236+
completed = update.files_processed if update.files_total else 1
237+
progress.update(
238+
task,
239+
description=_format_index_progress(update),
240+
total=total,
241+
completed=min(completed, total),
242+
)
243+
244+
await sync_service.sync(
245+
sync_dir,
246+
project_name=proj.name,
247+
progress_callback=on_index_progress,
248+
)
249+
progress.update(task, completed=progress.tasks[task].total or 1)
250+
192251
console.print(" [green]✓[/green] Full-text search index rebuilt")
193252

194253
if embeddings:
@@ -213,7 +272,16 @@ async def _reindex(app_config, search: bool, embeddings: bool, project: str | No
213272
task = progress.add_task(" Embedding entities...", total=None)
214273

215274
def on_progress(entity_id, index, total):
216-
progress.update(task, total=total, completed=index)
275+
embedding_progress = EmbeddingProgress(
276+
entity_id=entity_id,
277+
index=index,
278+
total=total,
279+
)
280+
progress.update(
281+
task,
282+
total=embedding_progress.total,
283+
completed=embedding_progress.index,
284+
)
217285

218286
stats = await search_service.reindex_vectors(progress_callback=on_progress)
219287
progress.update(task, completed=stats["total_entities"])

src/basic_memory/config.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,11 @@ class BasicMemoryConfig(BaseSettings):
193193
description="Batch size for embedding generation.",
194194
gt=0,
195195
)
196+
semantic_embedding_request_concurrency: int = Field(
197+
default=4,
198+
description="Maximum number of concurrent provider requests for batched embedding generation when the active provider supports request-level concurrency.",
199+
gt=0,
200+
)
196201
semantic_embedding_sync_batch_size: int = Field(
197202
default=64,
198203
description="Batch size for vector sync orchestration flushes.",
@@ -286,6 +291,31 @@ class BasicMemoryConfig(BaseSettings):
286291
description="Maximum number of files to process concurrently during sync. Limits memory usage on large projects (2000+ files). Lower values reduce memory consumption.",
287292
gt=0,
288293
)
294+
index_batch_size: int = Field(
295+
default=32,
296+
description="Maximum number of changed files to load into one indexing batch.",
297+
gt=0,
298+
)
299+
index_batch_max_bytes: int = Field(
300+
default=8 * 1024 * 1024,
301+
description="Maximum total bytes to load into one indexing batch. Large files still run as single-file batches.",
302+
gt=0,
303+
)
304+
index_parse_max_concurrent: int = Field(
305+
default=8,
306+
description="Maximum number of markdown parse tasks to run concurrently inside one indexing batch.",
307+
gt=0,
308+
)
309+
index_entity_max_concurrent: int = Field(
310+
default=4,
311+
description="Maximum number of entity create/update tasks to run concurrently inside one indexing batch.",
312+
gt=0,
313+
)
314+
index_metadata_update_max_concurrent: int = Field(
315+
default=4,
316+
description="Maximum number of metadata/search refresh tasks to run concurrently inside one indexing batch.",
317+
gt=0,
318+
)
289319

290320
kebab_filenames: bool = Field(
291321
default=False,
src/basic_memory/indexing/__init__.py (filename inferred from module docstring and re-exports — confirm against the commit)

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
"""Reusable indexing primitives shared by local sync and future remote callers."""
2+
3+
from basic_memory.indexing.batch_indexer import BatchIndexer
4+
from basic_memory.indexing.batching import build_index_batches
5+
from basic_memory.indexing.models import (
6+
IndexedEntity,
7+
IndexBatch,
8+
IndexFileMetadata,
9+
IndexFileWriter,
10+
IndexFrontmatterUpdate,
11+
IndexingBatchResult,
12+
IndexInputFile,
13+
IndexProgress,
14+
)
15+
16+
__all__ = [
17+
"BatchIndexer",
18+
"IndexedEntity",
19+
"IndexBatch",
20+
"IndexFileMetadata",
21+
"IndexFileWriter",
22+
"IndexFrontmatterUpdate",
23+
"IndexingBatchResult",
24+
"IndexInputFile",
25+
"IndexProgress",
26+
"build_index_batches",
27+
]

0 commit comments

Comments (0)