Skip to content

Commit c4cf0af

Browse files
authored
perf(sync): speed up single markdown file indexing (#751)
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent 1c343be commit c4cf0af

9 files changed

Lines changed: 447 additions & 60 deletions

File tree

src/basic_memory/indexing/batch_indexer.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ async def index_markdown_file(
186186
new: bool | None = None,
187187
existing_permalink_by_path: dict[str, str | None] | None = None,
188188
index_search: bool = True,
189+
resolve_relations: bool = True,
189190
) -> IndexedEntity:
190191
"""Index one markdown file using the same normalization and upsert path as batches."""
191192
if not self._is_markdown(file):
@@ -212,7 +213,12 @@ async def index_markdown_file(
212213
existing_permalink_by_path[file.path] = prepared.markdown.frontmatter.permalink
213214

214215
with telemetry.span("index.markdown_file.persist", path=file.path, is_new=new):
215-
persisted = await self._persist_markdown_file(prepared, is_new=new)
216+
persisted = await self._persist_markdown_file(
217+
prepared,
218+
is_new=new,
219+
resolve_relations=resolve_relations,
220+
reload_entity=False,
221+
)
216222
existing_permalink_by_path[file.path] = persisted.entity.permalink
217223

218224
with telemetry.span(
@@ -550,6 +556,8 @@ async def _persist_markdown_file(
550556
prepared: _PreparedMarkdownFile,
551557
*,
552558
is_new: bool | None = None,
559+
resolve_relations: bool = True,
560+
reload_entity: bool = True,
553561
) -> _PersistedMarkdownFile:
554562
existing = await self.entity_repository.get_by_file_path(
555563
prepared.file.path,
@@ -561,15 +569,20 @@ async def _persist_markdown_file(
561569
Path(prepared.file.path),
562570
prepared.markdown,
563571
is_new=is_new,
572+
existing_entity=existing,
573+
resolve_relations=resolve_relations,
574+
reload_entity=reload_entity,
564575
)
565576
prepared = await self._reconcile_persisted_permalink(prepared, entity)
566-
updated = await self.entity_repository.update(
577+
metadata_updates = self._entity_metadata_updates(prepared.file, prepared.final_checksum)
578+
updated = await self.entity_repository.update_fields(
567579
entity.id,
568-
self._entity_metadata_updates(prepared.file, prepared.final_checksum),
580+
metadata_updates,
569581
)
570-
if updated is None:
582+
if not updated:
571583
raise ValueError(f"Failed to update markdown entity metadata for {prepared.file.path}")
572-
return _PersistedMarkdownFile(prepared=prepared, entity=updated)
584+
self._apply_entity_metadata_updates(entity, metadata_updates)
585+
return _PersistedMarkdownFile(prepared=prepared, entity=entity)
573586

574587
async def _reconcile_persisted_permalink(
575588
self,
@@ -659,6 +672,11 @@ def _entity_metadata_updates(
659672
updates["content_type"] = file.content_type
660673
return updates
661674

675+
def _apply_entity_metadata_updates(self, entity: Entity, updates: dict[str, object]) -> None:
676+
"""Keep the returned entity aligned with metadata written without reload."""
677+
for key, value in updates.items():
678+
setattr(entity, key, value)
679+
662680
def _is_markdown(self, file: IndexInputFile) -> bool:
663681
if file.content_type is not None:
664682
return file.content_type == "text/markdown"

src/basic_memory/repository/entity_repository.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def __init__(self, session_maker: async_sessionmaker[AsyncSession], project_id:
3333
"""
3434
super().__init__(session_maker, Entity, project_id=project_id)
3535

36-
async def get_by_id(self, entity_id: int) -> Optional[Entity]: # pragma: no cover
36+
async def get_by_id(self, entity_id: int, *, load_relations: bool = True) -> Optional[Entity]:
3737
"""Get entity by numeric ID.
3838
3939
Args:
@@ -43,6 +43,10 @@ async def get_by_id(self, entity_id: int) -> Optional[Entity]: # pragma: no cov
4343
Entity if found, None otherwise
4444
"""
4545
async with db.scoped_session(self.session_maker) as session:
46+
if not load_relations:
47+
result = await session.execute(self.select().where(Entity.id == entity_id))
48+
return result.scalars().one_or_none()
49+
4650
return await self.select_by_id(session, entity_id)
4751

4852
async def _find_one_by_query(self, query, *, load_relations: bool) -> Optional[Entity]:

src/basic_memory/repository/repository.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
Result,
1414
and_,
1515
delete,
16+
update as sqlalchemy_update,
1617
)
1718
from sqlalchemy.engine import CursorResult
1819
from sqlalchemy.exc import NoResultFound
@@ -140,6 +141,20 @@ async def add_all(self, models: List[T]) -> Sequence[T]:
140141
# Query within same session
141142
return await self.select_by_ids(session, [m.id for m in models]) # pyright: ignore [reportAttributeAccessIssue]
142143

144+
async def add_all_no_return(self, models: List[T]) -> int:
145+
"""Insert models without reloading them afterward."""
146+
if not models:
147+
return 0
148+
149+
async with db.scoped_session(self.session_maker) as session:
150+
for model in models:
151+
self._set_project_id_if_needed(model)
152+
153+
session.add_all(models)
154+
await session.flush()
155+
logger.debug(f"Added {len(models)} {self.Model.__name__} records")
156+
return len(models)
157+
143158
def select(self, *entities: Any) -> Select:
144159
"""Create a new SELECT statement.
145160
@@ -298,6 +313,25 @@ async def update(self, entity_id: int, entity_data: dict[str, Any] | T) -> Optio
298313
logger.debug(f"No {self.Model.__name__} found to update: {entity_id}")
299314
return None
300315

316+
async def update_fields(self, entity_id: Any, entity_data: dict[str, Any]) -> bool:
317+
"""Update columns without reloading the model graph afterward."""
318+
update_data = {k: v for k, v in entity_data.items() if k in self.valid_columns}
319+
if not update_data:
320+
return True
321+
322+
async with db.scoped_session(self.session_maker) as session:
323+
conditions = [self.primary_key == entity_id]
324+
if self.has_project_id and self.project_id is not None:
325+
conditions.append(getattr(self.Model, "project_id") == self.project_id)
326+
327+
result = cast(
328+
CursorResult[Any],
329+
await session.execute(
330+
sqlalchemy_update(self.Model).where(and_(*conditions)).values(**update_data)
331+
),
332+
)
333+
return result.rowcount > 0
334+
301335
async def delete(self, entity_id: int) -> bool:
302336
"""Delete an entity from the database."""
303337
logger.debug(f"Deleting {self.Model.__name__}: {entity_id}")

0 commit comments

Comments
 (0)