Skip to content

Commit 1c343be

Browse files
committed
perf(sync): skip unchanged markdown indexing
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent c50d97e commit 1c343be

2 files changed

Lines changed: 113 additions & 0 deletions

File tree

src/basic_memory/sync/sync_service.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1087,6 +1087,22 @@ async def sync_one_markdown_file(
10871087
initial_markdown_content = initial_markdown_bytes.decode("utf-8")
10881088
file_metadata = await self.file_service.get_file_metadata(path)
10891089
initial_checksum = await compute_checksum(initial_markdown_bytes)
1090+
existing_entity = await self.entity_repository.get_by_file_path(path)
1091+
if existing_entity is not None and existing_entity.checksum == initial_checksum:
1092+
logger.debug(
1093+
f"Markdown sync skipped unchanged file: path={path}, "
1094+
f"entity_id={existing_entity.id}, checksum={initial_checksum[:8]}"
1095+
)
1096+
return SyncedMarkdownFile(
1097+
entity=existing_entity,
1098+
checksum=initial_checksum,
1099+
markdown_content=initial_markdown_content,
1100+
file_path=path,
1101+
content_type=self.file_service.content_type(path),
1102+
updated_at=file_metadata.modified_at,
1103+
size=file_metadata.size,
1104+
)
1105+
10901106
indexed = await self.batch_indexer.index_markdown_file(
10911107
IndexInputFile(
10921108
path=path,

tests/sync/test_sync_one_markdown_file.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,103 @@ async def test_sync_one_markdown_file_does_not_reread_for_initial_checksum_when_
194194
assert result.checksum == await compute_checksum(file_path.read_bytes())
195195

196196

197+
@pytest.mark.asyncio
async def test_sync_one_markdown_file_skips_indexing_when_checksum_matches(
    sync_service,
    test_project,
    monkeypatch,
):
    """Re-syncing a file whose stored checksum still matches bypasses indexing.

    The note is synced once so an entity and checksum exist for it. The batch
    indexer and the search refresh are then swapped for mocks that raise when
    awaited, so the second sync can only succeed by returning early.
    """
    relative_path = "notes/already-current.md"
    note_text = dedent(
        f"""\
        ---
        title: Already Current
        type: note
        permalink: {test_project.name}/notes/already-current
        ---

        # Already Current

        - [note] Derived indexes are assumed current when the file checksum matches.
        """
    )
    note_file = _write_markdown(Path(test_project.path), relative_path, note_text)

    # First pass records the entity; search indexing is deliberately left off.
    first_sync = await sync_service.sync_one_markdown_file(
        relative_path,
        index_search=False,
    )
    assert first_sync.checksum == await compute_checksum(note_file.read_bytes())

    # From here on, any await on either collaborator fails the test loudly.
    indexer_mock = AsyncMock(side_effect=AssertionError("indexer should not run"))
    search_mock = AsyncMock(side_effect=AssertionError("search should not refresh"))
    monkeypatch.setattr(sync_service.batch_indexer, "index_markdown_file", indexer_mock)
    monkeypatch.setattr(sync_service.search_service, "index_entity_data", search_mock)

    second_sync = await sync_service.sync_one_markdown_file(relative_path)

    indexer_mock.assert_not_awaited()
    search_mock.assert_not_awaited()
    # The skip path must still hand back the existing entity and current content.
    assert second_sync.entity.id == first_sync.entity.id
    assert len(second_sync.entity.observations) == 1
    assert second_sync.markdown_content == note_file.read_bytes().decode("utf-8")
    assert second_sync.checksum == first_sync.checksum
242+
243+
244+
@pytest.mark.asyncio
async def test_sync_one_markdown_file_indexes_when_checksum_differs(
    sync_service,
    test_project,
    monkeypatch,
):
    """Editing an already-synced file sends the next sync through the indexer.

    The batch indexer is wrapped in a spy that still delegates to the real
    implementation, while the search refresh is replaced by a plain mock; both
    must be awaited exactly once after the file content changes.
    """
    relative_path = "notes/changed.md"
    before_text = dedent(
        f"""\
        ---
        title: Changed
        type: note
        permalink: {test_project.name}/notes/changed
        ---

        # Changed

        Original body.
        """
    )
    note_file = _write_markdown(Path(test_project.path), relative_path, before_text)
    first_sync = await sync_service.sync_one_markdown_file(relative_path, index_search=False)

    # Change the body on disk so its checksum no longer matches the first sync.
    after_text = before_text.replace("Original body.", "Updated body.")
    note_file.write_text(after_text, encoding="utf-8")

    real_indexer = sync_service.batch_indexer.index_markdown_file

    async def delegate_to_real_indexer(*args, **kwargs):
        # Pass straight through so the spy records the call without altering it.
        return await real_indexer(*args, **kwargs)

    indexer_spy = AsyncMock(side_effect=delegate_to_real_indexer)
    search_mock = AsyncMock()
    monkeypatch.setattr(sync_service.batch_indexer, "index_markdown_file", indexer_spy)
    monkeypatch.setattr(sync_service.search_service, "index_entity_data", search_mock)

    second_sync = await sync_service.sync_one_markdown_file(relative_path)

    indexer_spy.assert_awaited_once()
    search_mock.assert_awaited_once()
    # Same entity, but content and checksum now reflect the edited file.
    assert second_sync.entity.id == first_sync.entity.id
    assert second_sync.markdown_content == note_file.read_bytes().decode("utf-8")
    assert second_sync.checksum == await compute_checksum(note_file.read_bytes())
    assert second_sync.checksum != first_sync.checksum
292+
293+
197294
@pytest.mark.asyncio
198295
async def test_sync_markdown_file_remains_tuple_compatible(sync_service, test_project):
199296
"""The legacy tuple-returning API still works for existing callers."""

0 commit comments

Comments
 (0)