
Commit edb7991

phernandez and Claude authored
fix: strip NUL bytes from content before PostgreSQL search indexing (#592)
Signed-off-by: phernandez <paul@basicmachines.co>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d7faeb7 commit edb7991

10 files changed

Lines changed: 221 additions & 18 deletions
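
The core of the change is small: every string bound for a PostgreSQL text column has the NUL byte removed before the INSERT. Below is a standalone sketch of the idea; the real helpers (`_strip_nul` in the search service, `_strip_nul_from_row` in the Postgres repository) appear in the diffs further down, and the sample content here is purely illustrative.

    # Illustrative sketch mirroring the helpers added in this commit.
    def _strip_nul(value: str) -> str:
        # PostgreSQL text columns reject \x00 (SQLSTATE 22021,
        # "character not in repertoire"), so drop the byte entirely.
        return value.replace("\x00", "")

    def _strip_nul_from_row(row_data: dict) -> dict:
        # Sanitize every string value in a row dict; non-strings pass through.
        return {k: _strip_nul(v) if isinstance(v, str) else v for k, v in row_data.items()}

    # A note padded by rclone preallocation might read back like this:
    raw = "# My Note\x00\x00\nSome content\x00here"
    row = {"title": "My Note", "content_snippet": raw, "project_id": 1}
    assert "\x00" not in _strip_nul_from_row(row)["content_snippet"]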

CHANGELOG.md

Lines changed: 9 additions & 2 deletions
@@ -10,12 +10,19 @@
 - JSON output now includes an additive `frontmatter` field with parsed YAML metadata (or `null`
   when no valid opening frontmatter block exists).
 
-## v0.18.3 (2026-02-12)
+## v0.18.5 (2026-02-13)
+
+### Bug Fixes
+
+- Strip NUL bytes from content before PostgreSQL search indexing
+  ([`ec9b2c4`](https://github.com/basicmachines-co/basic-memory/commit/ec9b2c4))
+
+## v0.18.4 (2026-02-12)
 
 ### Bug Fixes
 
 - Use global `--header` flag for Tigris consistency on all rclone transactions
-  ([`7fcf587`](https://github.com/basicmachines-co/basic-memory/commit/7fcf587))
+  ([`0eae0e1`](https://github.com/basicmachines-co/basic-memory/commit/0eae0e1))
   - `--header-download` / `--header-upload` only apply to GET/PUT requests, missing S3
     ListObjectsV2 calls that bisync issues first. Non-US users saw stale edge-cached metadata.
   - `--header` applies to ALL HTTP transactions (list, download, upload), fixing bisync for

server.json

Lines changed: 2 additions & 2 deletions
@@ -6,12 +6,12 @@
     "url": "https://github.com/basicmachines-co/basic-memory.git",
     "source": "github"
   },
-  "version": "0.18.3",
+  "version": "0.18.5",
   "packages": [
     {
       "registryType": "pypi",
       "identifier": "basic-memory",
-      "version": "0.18.3",
+      "version": "0.18.5",
       "runtimeHint": "uvx",
       "runtimeArguments": [
         {"type": "positional", "value": "basic-memory"},

src/basic_memory/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 """basic-memory - Local-first knowledge management combining Zettelkasten with knowledge graphs"""
 
 # Package version - updated by release automation
-__version__ = "0.18.3"
+__version__ = "0.18.5"
 
 # API version for FastAPI - independent of package version
 __api_version__ = "v0"

src/basic_memory/cli/commands/cloud/rclone_commands.py

Lines changed: 6 additions & 0 deletions
@@ -223,6 +223,9 @@ def project_sync(
         *TIGRIS_CONSISTENCY_HEADERS,
         "--filter-from",
         str(filter_path),
+        # Prevent NUL byte padding on virtual filesystems (e.g. Google Drive File Stream)
+        # See: rclone/rclone#6801
+        "--local-no-preallocate",
     ]
 
     if verbose:
@@ -299,6 +302,9 @@ def project_bisync(
         str(filter_path),
         "--workdir",
         str(state_path),
+        # Prevent NUL byte padding on virtual filesystems (e.g. Google Drive File Stream)
+        # See: rclone/rclone#6801
+        "--local-no-preallocate",
     ]
 
     # Add --create-empty-src-dirs if rclone version supports it (v1.64+)
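
The rclone flag addresses the root cause: when rclone preallocates a destination file on a virtual filesystem such as Google Drive File Stream, the preallocated region can survive as \x00 padding in the synced note (rclone/rclone#6801). For context, a hedged sketch of how an assembled bisync command might look; the paths, remote name, and surrounding setup are illustrative assumptions, not the project's actual command, which carries additional flags.

    # Illustrative only: paths and the remote name are assumptions.
    import subprocess
    from pathlib import Path

    filter_path = Path.home() / ".basic-memory" / "filters.txt"
    state_path = Path.home() / ".basic-memory" / "bisync-state"

    cmd = [
        "rclone", "bisync",
        str(Path.home() / "basic-memory" / "my-project"), "remote:bucket/my-project",
        "--filter-from", str(filter_path),
        "--workdir", str(state_path),
        # Prevent NUL byte padding on virtual filesystems (e.g. Google Drive File Stream)
        "--local-no-preallocate",
    ]
    subprocess.run(cmd, check=True)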

src/basic_memory/repository/postgres_search_repository.py

Lines changed: 11 additions & 1 deletion
@@ -24,6 +24,15 @@
 from basic_memory.schemas.search import SearchItemType, SearchRetrievalMode
 
 
+def _strip_nul_from_row(row_data: dict) -> dict:
+    """Strip NUL bytes from all string values in a row dict.
+
+    Secondary defense: PostgreSQL text columns cannot store \\x00.
+    Primary sanitization happens in SearchService.index_entity_markdown().
+    """
+    return {k: v.replace("\x00", "") if isinstance(v, str) else v for k, v in row_data.items()}
+
+
 class PostgresSearchRepository(SearchRepositoryBase):
     """PostgreSQL tsvector implementation of search repository.
 
@@ -92,6 +101,7 @@ async def index_item(self, search_index_row: SearchIndexRow) -> None:
         # Serialize JSON for raw SQL
         insert_data = search_index_row.to_insert(serialize_json=True)
         insert_data["project_id"] = self.project_id
+        insert_data = _strip_nul_from_row(insert_data)
 
         # Use upsert to handle race conditions during parallel indexing
         # ON CONFLICT (permalink, project_id) matches the partial unique index
@@ -533,7 +543,7 @@ async def bulk_index_items(self, search_index_rows: List[SearchIndexRow]) -> None:
         for row in search_index_rows:
             insert_data = row.to_insert(serialize_json=True)
             insert_data["project_id"] = self.project_id
-            insert_data_list.append(insert_data)
+            insert_data_list.append(_strip_nul_from_row(insert_data))
 
         # Use upsert to handle race conditions during parallel indexing
         # ON CONFLICT (permalink, project_id) matches the partial unique index
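
A quick usage illustration of the repository-level helper, along the lines of the unit test added further down; the field names here are a hypothetical subset of a real insert row.

    # Values are illustrative; _strip_nul_from_row is the helper defined above.
    insert_data = {
        "title": "hello\x00world",
        "content_snippet": "snippet\x00here",
        "project_id": 1,        # non-string values pass through unchanged
        "metadata": None,
    }
    clean = _strip_nul_from_row(insert_data)
    assert clean["title"] == "helloworld"
    assert clean["project_id"] == 1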

src/basic_memory/services/search_service.py

Lines changed: 20 additions & 11 deletions
@@ -58,6 +58,15 @@
 }
 
 
+def _strip_nul(value: str) -> str:
+    """Strip NUL bytes that PostgreSQL text columns cannot store.
+
+    rclone preallocation on virtual filesystems (e.g. Google Drive File Stream)
+    can pad files with \\x00 bytes. See: rclone/rclone#6801
+    """
+    return value.replace("\x00", "")
+
+
 def _mtime_to_datetime(entity: Entity) -> datetime:
     """Convert entity mtime (file modification time) to datetime.
 
@@ -402,7 +411,7 @@ async def index_entity_file(
             id=entity.id,
             entity_id=entity.id,
             type=SearchItemType.ENTITY.value,
-            title=entity.title,
+            title=_strip_nul(entity.title),
             permalink=entity.permalink,  # Required for Postgres NOT NULL constraint
             file_path=entity.file_path,
             metadata={
@@ -461,7 +470,7 @@ async def index_entity_markdown(
         # Store full content for vector embedding quality.
         # The chunker in the vector pipeline splits this into
         # appropriately-sized pieces for embedding.
-        content_snippet = content
+        content_snippet = _strip_nul(content)
 
         if entity.permalink:
             content_stems.extend(self._generate_variants(entity.permalink))
@@ -473,7 +482,7 @@ async def index_entity_markdown(
         if entity_tags:
             content_stems.extend(entity_tags)
 
-        entity_content_stems = "\n".join(p for p in content_stems if p and p.strip())
+        entity_content_stems = _strip_nul("\n".join(p for p in content_stems if p and p.strip()))
 
         # Truncate to stay under Postgres's 8KB index row limit
         if len(entity_content_stems) > MAX_CONTENT_STEMS_SIZE:  # pragma: no cover
@@ -484,7 +493,7 @@ async def index_entity_markdown(
             SearchIndexRow(
                 id=entity.id,
                 type=SearchItemType.ENTITY.value,
-                title=entity.title,
+                title=_strip_nul(entity.title),
                 content_stems=entity_content_stems,
                 content_snippet=content_snippet,
                 permalink=entity.permalink,
@@ -510,8 +519,8 @@ async def index_entity_markdown(
                 seen_permalinks.add(obs_permalink)
 
                 # Index with parent entity's file path since that's where it's defined
-                obs_content_stems = "\n".join(
-                    p for p in self._generate_variants(obs.content) if p and p.strip()
+                obs_content_stems = _strip_nul(
+                    "\n".join(p for p in self._generate_variants(obs.content) if p and p.strip())
                 )
                 # Truncate to stay under Postgres's 8KB index row limit
                 if len(obs_content_stems) > MAX_CONTENT_STEMS_SIZE:  # pragma: no cover
@@ -520,9 +529,9 @@ async def index_entity_markdown(
                     SearchIndexRow(
                         id=obs.id,
                         type=SearchItemType.OBSERVATION.value,
-                        title=f"{obs.category}: {obs.content[:100]}...",
+                        title=_strip_nul(f"{obs.category}: {obs.content[:100]}..."),
                         content_stems=obs_content_stems,
-                        content_snippet=obs.content,
+                        content_snippet=_strip_nul(obs.content),
                         permalink=obs_permalink,
                         file_path=entity.file_path,
                         category=obs.category,
@@ -539,14 +548,14 @@ async def index_entity_markdown(
         # Add relation rows (only outgoing relations defined in this file)
         for rel in entity.outgoing_relations:
             # Create descriptive title showing the relationship
-            relation_title = (
+            relation_title = _strip_nul(
                 f"{rel.from_entity.title} → {rel.to_entity.title}"
                 if rel.to_entity
                 else f"{rel.from_entity.title}"
             )
 
-            rel_content_stems = "\n".join(
-                p for p in self._generate_variants(relation_title) if p and p.strip()
+            rel_content_stems = _strip_nul(
+                "\n".join(p for p in self._generate_variants(relation_title) if p and p.strip())
             )
             rows_to_index.append(
                 SearchIndexRow(

tests/mcp/test_tool_contracts.py

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@
     "move_note": [
         "identifier",
         "destination_path",
+        "destination_folder",
        "is_directory",
        "project",
        "workspace",

tests/repository/test_postgres_search_repository.py

Lines changed: 74 additions & 1 deletion
@@ -11,7 +11,10 @@
 
 from basic_memory import db
 from basic_memory.config import BasicMemoryConfig, DatabaseBackend
-from basic_memory.repository.postgres_search_repository import PostgresSearchRepository
+from basic_memory.repository.postgres_search_repository import (
+    PostgresSearchRepository,
+    _strip_nul_from_row,
+)
 from basic_memory.repository.semantic_errors import SemanticSearchDisabledError
 from basic_memory.repository.search_index_row import SearchIndexRow
 from basic_memory.schemas.search import SearchItemType, SearchRetrievalMode
@@ -237,6 +240,76 @@ async def test_postgres_search_repository_reraises_non_tsquery_db_errors(
     await repo.search(permalink="docs/anything")
 
 
+@pytest.mark.asyncio
+async def test_bulk_index_items_strips_nul_bytes(session_maker, test_project):
+    """NUL bytes in content must not cause CharacterNotInRepertoireError on INSERT."""
+    repo = PostgresSearchRepository(session_maker, project_id=test_project.id)
+    now = datetime.now(timezone.utc)
+    row = SearchIndexRow(
+        project_id=test_project.id,
+        id=99,
+        title="hello\x00world",
+        content_stems="some\x00stems",
+        content_snippet="snippet\x00here",
+        permalink="test/nul-row",
+        file_path="test/nul.md",
+        type="entity",
+        metadata={"entity_type": "note"},
+        created_at=now,
+        updated_at=now,
+    )
+    # Should not raise CharacterNotInRepertoireError
+    await repo.bulk_index_items([row])
+    results = await repo.search(permalink="test/nul-row")
+    assert len(results) == 1
+    assert "\x00" not in (results[0].content_snippet or "")
+    assert "\x00" not in (results[0].title or "")
+
+
+@pytest.mark.asyncio
+async def test_index_item_strips_nul_bytes(session_maker, test_project):
+    """NUL bytes in single-item index_item path must not cause CharacterNotInRepertoireError."""
+    repo = PostgresSearchRepository(session_maker, project_id=test_project.id)
+    now = datetime.now(timezone.utc)
+    row = SearchIndexRow(
+        project_id=test_project.id,
+        id=98,
+        title="single\x00item",
+        content_stems="nul\x00stems",
+        content_snippet="nul\x00snippet",
+        permalink="test/nul-single",
+        file_path="test/nul-single.md",
+        type="entity",
+        metadata={"entity_type": "note"},
+        created_at=now,
+        updated_at=now,
+    )
+    await repo.index_item(row)
+    results = await repo.search(permalink="test/nul-single")
+    assert len(results) == 1
+    assert "\x00" not in (results[0].content_snippet or "")
+    assert "\x00" not in (results[0].title or "")
+
+
+def test_strip_nul_from_row():
+    """_strip_nul_from_row strips NUL bytes from string values, leaves non-strings alone."""
+    row = {
+        "title": "hello\x00world",
+        "content_stems": "some\x00content\x00here",
+        "content_snippet": "clean",
+        "id": 42,
+        "metadata": None,
+        "created_at": datetime(2024, 1, 1),
+    }
+    result = _strip_nul_from_row(row)
+    assert result["title"] == "helloworld"
+    assert result["content_stems"] == "somecontenthere"
+    assert result["content_snippet"] == "clean"
+    assert result["id"] == 42
+    assert result["metadata"] is None
+    assert result["created_at"] == datetime(2024, 1, 1)
+
+
 @pytest.mark.asyncio
 async def test_postgres_semantic_vector_search_returns_ranked_entities(session_maker, test_project):
     """Vector mode ranks entities via pgvector distance."""

tests/services/test_search_service.py

Lines changed: 55 additions & 0 deletions
@@ -8,6 +8,7 @@
 from basic_memory import db
 from basic_memory.repository.search_index_row import SearchIndexRow
 from basic_memory.schemas.search import SearchQuery, SearchItemType, SearchRetrievalMode
+from basic_memory.services.search_service import _strip_nul
 
 
 @pytest.mark.asyncio
@@ -1116,6 +1117,60 @@ async def test_index_entity_multiple_categories_same_content(
     assert len(results) >= 2
 
 
+# Tests for NUL byte stripping
+
+
+def test_strip_nul_removes_nul_bytes():
+    """_strip_nul removes \\x00 from strings."""
+    assert _strip_nul("hello\x00world") == "helloworld"
+    assert _strip_nul("\x00\x00\x00") == ""
+    assert _strip_nul("clean string") == "clean string"
+
+
+@pytest.mark.asyncio
+async def test_index_entity_markdown_strips_nul_bytes(search_service, session_maker, test_project):
+    """Content with NUL bytes should be stripped before indexing.
+
+    rclone preallocation on virtual filesystems (e.g. Google Drive File Stream)
+    can pad files with \\x00 bytes, causing PostgreSQL CharacterNotInRepertoireError.
+
+    Note: NUL bytes arrive via file content read from disk, not from the database.
+    Postgres rejects \\x00 in text columns at the ORM level, so we only test
+    the content path (passed to index_entity) rather than observation creation.
+    """
+    from basic_memory.repository import EntityRepository
+    from basic_memory.repository.search_repository import SearchRepository
+
+    entity_repo = EntityRepository(session_maker, project_id=test_project.id)
+
+    entity_data = {
+        "title": "NUL Test Entity",
+        "entity_type": "note",
+        "entity_metadata": {},
+        "content_type": "text/markdown",
+        "file_path": "test/nul-test.md",
+        "permalink": "test/nul-test",
+        "project_id": test_project.id,
+        "created_at": datetime.now(),
+        "updated_at": datetime.now(),
+    }
+    entity = await entity_repo.create(entity_data)
+    entity = await entity_repo.get_by_permalink("test/nul-test")
+
+    # Index with NUL-containing content (simulates rclone-preallocated file)
+    nul_content = "# NUL Test\x00\x00\nSome content\x00here"
+    await search_service.index_entity(entity, content=nul_content)
+
+    # Verify no NUL bytes in stored search index rows
+    search_repo: SearchRepository = search_service.repository
+    results = await search_repo.search(permalink_match="test/nul-test*")
+    for row in results:
+        if row.content_snippet:
+            assert "\x00" not in row.content_snippet, (
+                f"NUL found in content_snippet for {row.permalink}"
+            )
+
+
 @pytest.mark.asyncio
 async def test_reindex_vectors(search_service, session_maker, test_project):
     """Test that reindex_vectors processes all entities and reports stats."""
