Skip to content

Commit 2f3cf95

Browse files
committed
perf(sync): return final markdown from batch indexer
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent 8f8d017 commit 2f3cf95

6 files changed

Lines changed: 140 additions & 17 deletions

File tree

src/basic_memory/indexing/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
IndexFileMetadata,
99
IndexFileWriter,
1010
IndexFrontmatterUpdate,
11+
IndexFrontmatterWriteResult,
1112
IndexingBatchResult,
1213
IndexInputFile,
1314
IndexProgress,
@@ -20,6 +21,7 @@
2021
"IndexFileMetadata",
2122
"IndexFileWriter",
2223
"IndexFrontmatterUpdate",
24+
"IndexFrontmatterWriteResult",
2325
"IndexingBatchResult",
2426
"IndexInputFile",
2527
"IndexProgress",

src/basic_memory/indexing/batch_indexer.py

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@
1313
from sqlalchemy.exc import IntegrityError
1414

1515
from basic_memory.config import BasicMemoryConfig
16-
from basic_memory.file_utils import compute_checksum, has_frontmatter
16+
from basic_memory.file_utils import compute_checksum, has_frontmatter, remove_frontmatter
1717
from basic_memory.markdown.schemas import EntityMarkdown
1818
from basic_memory.indexing.models import (
1919
IndexedEntity,
@@ -47,6 +47,7 @@ class _PreparedEntity:
4747
checksum: str
4848
content_type: str | None
4949
search_content: str | None
50+
markdown_content: str | None = None
5051

5152

5253
class BatchIndexer:
@@ -253,6 +254,7 @@ async def _normalize_markdown_file(
253254
reserved_permalinks: set[str],
254255
) -> _PreparedMarkdownFile:
255256
final_checksum = prepared.final_checksum
257+
final_content = prepared.content
256258
final_permalink = await self._resolve_batch_permalink(prepared, reserved_permalinks)
257259

258260
# Trigger: markdown file has no frontmatter and sync enforcement is enabled.
@@ -264,9 +266,11 @@ async def _normalize_markdown_file(
264266
"type": prepared.markdown.frontmatter.type,
265267
"permalink": final_permalink,
266268
}
267-
final_checksum = await self.file_writer.write_frontmatter(
269+
write_result = await self.file_writer.write_frontmatter(
268270
IndexFrontmatterUpdate(path=prepared.file.path, metadata=frontmatter_updates)
269271
)
272+
final_checksum = write_result.checksum
273+
final_content = write_result.content
270274
prepared.markdown.frontmatter.metadata.update(frontmatter_updates)
271275

272276
# Trigger: existing markdown frontmatter may lack the canonical permalink.
@@ -278,16 +282,18 @@ async def _normalize_markdown_file(
278282
and final_permalink != prepared.markdown.frontmatter.permalink
279283
):
280284
prepared.markdown.frontmatter.metadata["permalink"] = final_permalink
281-
final_checksum = await self.file_writer.write_frontmatter(
285+
write_result = await self.file_writer.write_frontmatter(
282286
IndexFrontmatterUpdate(
283287
path=prepared.file.path,
284288
metadata={"permalink": final_permalink},
285289
)
286290
)
291+
final_checksum = write_result.checksum
292+
final_content = write_result.content
287293

288294
return _PreparedMarkdownFile(
289295
file=prepared.file,
290-
content=prepared.content,
296+
content=final_content,
291297
final_checksum=final_checksum,
292298
markdown=prepared.markdown,
293299
file_contains_frontmatter=prepared.file_contains_frontmatter,
@@ -351,7 +357,8 @@ async def _upsert_markdown_file(self, prepared: _PreparedMarkdownFile) -> _Prepa
351357
entity_id=updated.id,
352358
checksum=prepared.final_checksum,
353359
content_type=prepared.file.content_type,
354-
search_content=prepared.markdown.content,
360+
search_content=remove_frontmatter(prepared.content),
361+
markdown_content=prepared.content,
355362
)
356363

357364
async def _upsert_regular_file(self, file: IndexInputFile) -> _PreparedEntity:
@@ -412,6 +419,7 @@ async def _upsert_regular_file(self, file: IndexInputFile) -> _PreparedEntity:
412419
checksum=checksum,
413420
content_type=file.content_type,
414421
search_content=None,
422+
markdown_content=None,
415423
)
416424

417425
# --- Relations ---
@@ -487,6 +495,7 @@ async def _refresh_search_index(
487495
permalink=entity.permalink,
488496
checksum=prepared.checksum,
489497
content_type=prepared.content_type,
498+
markdown_content=prepared.markdown_content,
490499
)
491500

492501
# --- Helpers ---

src/basic_memory/indexing/models.py

Lines changed: 12 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,14 @@ class IndexFrontmatterUpdate:
5555
metadata: dict[str, Any]
5656

5757

58+
@dataclass(slots=True)
59+
class IndexFrontmatterWriteResult:
60+
"""Typed result for a frontmatter write performed during indexing."""
61+
62+
checksum: str
63+
content: str
64+
65+
5866
@dataclass(slots=True)
5967
class IndexedEntity:
6068
"""Stable output describing one file that finished indexing successfully."""
@@ -64,6 +72,7 @@ class IndexedEntity:
6472
permalink: str | None
6573
checksum: str
6674
content_type: str | None = None
75+
markdown_content: str | None = None
6776

6877

6978
@dataclass(slots=True)
@@ -80,4 +89,6 @@ class IndexingBatchResult:
8089
class IndexFileWriter(Protocol):
8190
"""Narrow protocol for frontmatter writes during indexing."""
8291

83-
async def write_frontmatter(self, update: IndexFrontmatterUpdate) -> str: ...
92+
async def write_frontmatter(
93+
self, update: IndexFrontmatterUpdate
94+
) -> IndexFrontmatterWriteResult: ...

src/basic_memory/services/file_service.py

Lines changed: 24 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
import asyncio
44
import hashlib
55
import mimetypes
6+
from dataclasses import dataclass
67
from datetime import datetime
78
from pathlib import Path
89
from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
@@ -25,6 +26,14 @@
2526
from loguru import logger
2627

2728

29+
@dataclass(slots=True)
30+
class FrontmatterUpdateResult:
31+
"""Final content emitted by a frontmatter rewrite without a follow-up reread."""
32+
33+
checksum: str
34+
content: str
35+
36+
2837
class FileService:
2938
"""Service for handling file operations with concurrency control.
3039
@@ -401,12 +410,14 @@ async def move_file(self, source: FilePath, destination: FilePath) -> None:
401410
)
402411
raise FileOperationError(f"Failed to move file {source} -> {destination}: {e}")
403412

404-
async def update_frontmatter(self, path: FilePath, updates: Dict[str, Any]) -> str:
405-
"""Update frontmatter fields in a file while preserving all content.
413+
async def update_frontmatter_with_result(
414+
self, path: FilePath, updates: Dict[str, Any]
415+
) -> FrontmatterUpdateResult:
416+
"""Update frontmatter and return the exact final written markdown content.
406417
407418
Only modifies the frontmatter section, leaving all content untouched.
408419
Creates frontmatter section if none exists.
409-
Returns checksum of updated file.
420+
Returns both checksum and final content so callers do not need a reread.
410421
411422
Uses aiofiles for true async I/O (non-blocking).
412423
@@ -415,7 +426,7 @@ async def update_frontmatter(self, path: FilePath, updates: Dict[str, Any]) -> s
415426
updates: Dict of frontmatter fields to update
416427
417428
Returns:
418-
Checksum of updated file
429+
Typed result containing checksum and final content
419430
420431
Raises:
421432
FileOperationError: If file operations fail
@@ -467,7 +478,10 @@ async def update_frontmatter(self, path: FilePath, updates: Dict[str, Any]) -> s
467478
if formatted_content is not None:
468479
content_for_checksum = formatted_content # pragma: no cover
469480

470-
return await file_utils.compute_checksum(content_for_checksum)
481+
return FrontmatterUpdateResult(
482+
checksum=await file_utils.compute_checksum(content_for_checksum),
483+
content=content_for_checksum,
484+
)
471485

472486
except Exception as e: # pragma: no cover
473487
# Only log real errors (not YAML parsing, which is handled above)
@@ -479,6 +493,11 @@ async def update_frontmatter(self, path: FilePath, updates: Dict[str, Any]) -> s
479493
)
480494
raise FileOperationError(f"Failed to update frontmatter: {e}")
481495

496+
async def update_frontmatter(self, path: FilePath, updates: Dict[str, Any]) -> str:
497+
"""Update frontmatter fields in a file while preserving all content."""
498+
result = await self.update_frontmatter_with_result(path, updates)
499+
return result.checksum
500+
482501
async def compute_checksum(self, path: FilePath) -> str:
483502
"""Compute checksum for a file using true async I/O.
484503

src/basic_memory/sync/sync_service.py

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,12 @@
2121
from basic_memory.file_utils import has_frontmatter
2222
from basic_memory.indexing import BatchIndexer, IndexFileMetadata, IndexInputFile, IndexProgress
2323
from basic_memory.indexing.batching import build_index_batches
24-
from basic_memory.indexing.models import IndexedEntity, IndexFileWriter, IndexFrontmatterUpdate
24+
from basic_memory.indexing.models import (
25+
IndexedEntity,
26+
IndexFileWriter,
27+
IndexFrontmatterUpdate,
28+
IndexFrontmatterWriteResult,
29+
)
2530
from basic_memory.ignore_utils import load_bmignore_patterns, should_ignore_path
2631
from basic_memory.markdown import EntityParser, MarkdownProcessor
2732
from basic_memory.models import Entity, Project
@@ -127,8 +132,13 @@ class _FileServiceIndexWriter(IndexFileWriter):
127132
def __init__(self, file_service: FileService) -> None:
128133
self.file_service = file_service
129134

130-
async def write_frontmatter(self, update: IndexFrontmatterUpdate) -> str:
131-
return await self.file_service.update_frontmatter(update.path, update.metadata)
135+
async def write_frontmatter(
136+
self, update: IndexFrontmatterUpdate
137+
) -> IndexFrontmatterWriteResult:
138+
result = await self.file_service.update_frontmatter_with_result(
139+
update.path, update.metadata
140+
)
141+
return IndexFrontmatterWriteResult(checksum=result.checksum, content=result.content)
132142

133143

134144
class SyncService:

tests/indexing/test_batch_indexer.py

Lines changed: 75 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,12 @@
99
import pytest
1010
from sqlalchemy import text
1111

12-
from basic_memory.indexing import BatchIndexer, IndexFrontmatterUpdate, IndexInputFile
12+
from basic_memory.indexing import (
13+
BatchIndexer,
14+
IndexFrontmatterUpdate,
15+
IndexFrontmatterWriteResult,
16+
IndexInputFile,
17+
)
1318

1419

1520
class _TestFileWriter:
@@ -18,8 +23,13 @@ class _TestFileWriter:
1823
def __init__(self, file_service) -> None:
1924
self.file_service = file_service
2025

21-
async def write_frontmatter(self, update: IndexFrontmatterUpdate) -> str:
22-
return await self.file_service.update_frontmatter(update.path, update.metadata)
26+
async def write_frontmatter(
27+
self, update: IndexFrontmatterUpdate
28+
) -> IndexFrontmatterWriteResult:
29+
result = await self.file_service.update_frontmatter_with_result(
30+
update.path, update.metadata
31+
)
32+
return IndexFrontmatterWriteResult(checksum=result.checksum, content=result.content)
2333

2434

2535
async def _create_file(path: Path, content: str | bytes) -> None:
@@ -214,6 +224,51 @@ async def spy_upsert(*args, **kwargs):
214224
assert result.errors == []
215225

216226

227+
@pytest.mark.asyncio
228+
async def test_batch_indexer_returns_original_markdown_content_when_no_frontmatter_rewrite(
229+
app_config,
230+
entity_service,
231+
entity_repository,
232+
relation_repository,
233+
search_service,
234+
file_service,
235+
project_config,
236+
):
237+
app_config.disable_permalinks = True
238+
239+
path = "notes/original.md"
240+
original_content = dedent(
241+
"""
242+
---
243+
title: Original
244+
type: note
245+
---
246+
# Original
247+
"""
248+
).strip()
249+
await _create_file(project_config.home / path, original_content)
250+
251+
files = {path: await _load_input(file_service, path)}
252+
batch_indexer = _make_batch_indexer(
253+
app_config,
254+
entity_service,
255+
entity_repository,
256+
relation_repository,
257+
search_service,
258+
file_service,
259+
)
260+
261+
result = await batch_indexer.index_files(
262+
files,
263+
max_concurrent=1,
264+
parse_max_concurrent=1,
265+
)
266+
267+
assert result.errors == []
268+
assert len(result.indexed) == 1
269+
assert result.indexed[0].markdown_content == original_content
270+
271+
217272
@pytest.mark.asyncio
218273
async def test_batch_indexer_indexes_non_markdown_files(
219274
app_config,
@@ -249,6 +304,7 @@ async def test_batch_indexer_indexes_non_markdown_files(
249304
)
250305

251306
assert {indexed.path for indexed in result.indexed} == {pdf_path, image_path}
307+
assert all(indexed.markdown_content is None for indexed in result.indexed)
252308

253309
pdf_entity = await entity_repository.get_by_file_path(pdf_path)
254310
image_entity = await entity_repository.get_by_file_path(image_path)
@@ -377,6 +433,11 @@ async def test_batch_indexer_assigns_unique_permalinks_for_batch_local_conflicts
377433
path_one: await _load_input(file_service, path_one),
378434
path_two: await _load_input(file_service, path_two),
379435
}
436+
original_contents = {
437+
path: file.content.decode("utf-8")
438+
for path, file in files.items()
439+
if file.content is not None
440+
}
380441
batch_indexer = _make_batch_indexer(
381442
app_config,
382443
entity_service,
@@ -393,6 +454,17 @@ async def test_batch_indexer_assigns_unique_permalinks_for_batch_local_conflicts
393454
)
394455

395456
assert result.errors == []
457+
indexed_by_path = {indexed.path: indexed for indexed in result.indexed}
458+
assert indexed_by_path[path_one].markdown_content is not None
459+
assert indexed_by_path[path_two].markdown_content is not None
460+
assert indexed_by_path[path_one].markdown_content != original_contents[path_one]
461+
assert indexed_by_path[path_two].markdown_content != original_contents[path_two]
462+
assert indexed_by_path[path_one].markdown_content == await file_service.read_file_content(
463+
path_one
464+
)
465+
assert indexed_by_path[path_two].markdown_content == await file_service.read_file_content(
466+
path_two
467+
)
396468

397469
entities = await entity_repository.find_all()
398470
assert len(entities) == 2

0 commit comments

Comments
 (0)