
Commit edb7991

phernandez and Claude authored
fix: strip NUL bytes from content before PostgreSQL search indexing (#592)
Signed-off-by: phernandez <paul@basicmachines.co>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d7faeb7 commit edb7991

10 files changed

Lines changed: 221 additions & 18 deletions
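
The core of the change is small: every string bound for a PostgreSQL text column has the NUL byte removed before the INSERT. Below is a standalone sketch of the idea; the real helpers (`_strip_nul` in the search service, `_strip_nul_from_row` in the Postgres repository) appear in the diffs further down, and the sample content here is purely illustrative.

    # Illustrative sketch mirroring the helpers added in this commit.
    def _strip_nul(value: str) -> str:
        # PostgreSQL text columns reject \x00 (SQLSTATE 22021,
        # "character not in repertoire"), so drop the byte entirely.
        return value.replace("\x00", "")

    def _strip_nul_from_row(row_data: dict) -> dict:
        # Sanitize every string value in a row dict; non-strings pass through.
        return {k: _strip_nul(v) if isinstance(v, str) else v for k, v in row_data.items()}

    # A note padded by rclone preallocation might read back like this:
    raw = "# My Note\x00\x00\nSome content\x00here"
    row = {"title": "My Note", "content_snippet": raw, "project_id": 1}
    assert "\x00" not in _strip_nul_from_row(row)["content_snippet"]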

CHANGELOG.md

Lines changed: 9 additions & 2 deletions
@@ -10,12 +10,19 @@
 - JSON output now includes an additive `frontmatter` field with parsed YAML metadata (or `null`
   when no valid opening frontmatter block exists).
 
-## v0.18.3 (2026-02-12)
+## v0.18.5 (2026-02-13)
+
+### Bug Fixes
+
+- Strip NUL bytes from content before PostgreSQL search indexing
+  ([`ec9b2c4`](https://github.com/basicmachines-co/basic-memory/commit/ec9b2c4))
+
+## v0.18.4 (2026-02-12)
 
 ### Bug Fixes
 
 - Use global `--header` flag for Tigris consistency on all rclone transactions
-  ([`7fcf587`](https://github.com/basicmachines-co/basic-memory/commit/7fcf587))
+  ([`0eae0e1`](https://github.com/basicmachines-co/basic-memory/commit/0eae0e1))
   - `--header-download` / `--header-upload` only apply to GET/PUT requests, missing S3
     ListObjectsV2 calls that bisync issues first. Non-US users saw stale edge-cached metadata.
   - `--header` applies to ALL HTTP transactions (list, download, upload), fixing bisync for

server.json

Lines changed: 2 additions & 2 deletions
@@ -6,12 +6,12 @@
     "url": "https://github.com/basicmachines-co/basic-memory.git",
     "source": "github"
   },
-  "version": "0.18.3",
+  "version": "0.18.5",
   "packages": [
     {
       "registryType": "pypi",
       "identifier": "basic-memory",
-      "version": "0.18.3",
+      "version": "0.18.5",
       "runtimeHint": "uvx",
       "runtimeArguments": [
         {"type": "positional", "value": "basic-memory"},

src/basic_memory/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 """basic-memory - Local-first knowledge management combining Zettelkasten with knowledge graphs"""
 
 # Package version - updated by release automation
-__version__ = "0.18.3"
+__version__ = "0.18.5"
 
 # API version for FastAPI - independent of package version
 __api_version__ = "v0"

src/basic_memory/cli/commands/cloud/rclone_commands.py

Lines changed: 6 additions & 0 deletions
@@ -223,6 +223,9 @@ def project_sync(
         *TIGRIS_CONSISTENCY_HEADERS,
         "--filter-from",
         str(filter_path),
+        # Prevent NUL byte padding on virtual filesystems (e.g. Google Drive File Stream)
+        # See: rclone/rclone#6801
+        "--local-no-preallocate",
     ]
 
     if verbose:
@@ -299,6 +302,9 @@ def project_bisync(
         str(filter_path),
         "--workdir",
         str(state_path),
+        # Prevent NUL byte padding on virtual filesystems (e.g. Google Drive File Stream)
+        # See: rclone/rclone#6801
+        "--local-no-preallocate",
     ]
 
     # Add --create-empty-src-dirs if rclone version supports it (v1.64+)
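
The rclone flag addresses the root cause: when rclone preallocates a destination file on a virtual filesystem such as Google Drive File Stream, the preallocated region can survive as \x00 padding in the synced note (rclone/rclone#6801). For context, a hedged sketch of how an assembled bisync command might look; the paths, remote name, and surrounding setup are illustrative assumptions, not the project's actual command, which carries additional flags.

    # Illustrative only: paths and the remote name are assumptions.
    import subprocess
    from pathlib import Path

    filter_path = Path.home() / ".basic-memory" / "filters.txt"
    state_path = Path.home() / ".basic-memory" / "bisync-state"

    cmd = [
        "rclone", "bisync",
        str(Path.home() / "basic-memory" / "my-project"), "remote:bucket/my-project",
        "--filter-from", str(filter_path),
        "--workdir", str(state_path),
        # Prevent NUL byte padding on virtual filesystems (e.g. Google Drive File Stream)
        "--local-no-preallocate",
    ]
    subprocess.run(cmd, check=True)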

src/basic_memory/repository/postgres_search_repository.py

Lines changed: 11 additions & 1 deletion
@@ -24,6 +24,15 @@
 from basic_memory.schemas.search import SearchItemType, SearchRetrievalMode
 
 
+def _strip_nul_from_row(row_data: dict) -> dict:
+    """Strip NUL bytes from all string values in a row dict.
+
+    Secondary defense: PostgreSQL text columns cannot store \\x00.
+    Primary sanitization happens in SearchService.index_entity_markdown().
+    """
+    return {k: v.replace("\x00", "") if isinstance(v, str) else v for k, v in row_data.items()}
+
+
 class PostgresSearchRepository(SearchRepositoryBase):
     """PostgreSQL tsvector implementation of search repository.
 
@@ -92,6 +101,7 @@ async def index_item(self, search_index_row: SearchIndexRow) -> None:
         # Serialize JSON for raw SQL
         insert_data = search_index_row.to_insert(serialize_json=True)
         insert_data["project_id"] = self.project_id
+        insert_data = _strip_nul_from_row(insert_data)
 
         # Use upsert to handle race conditions during parallel indexing
         # ON CONFLICT (permalink, project_id) matches the partial unique index
@@ -533,7 +543,7 @@ async def bulk_index_items(self, search_index_rows: List[SearchIndexRow]) -> None:
         for row in search_index_rows:
             insert_data = row.to_insert(serialize_json=True)
             insert_data["project_id"] = self.project_id
-            insert_data_list.append(insert_data)
+            insert_data_list.append(_strip_nul_from_row(insert_data))
 
         # Use upsert to handle race conditions during parallel indexing
         # ON CONFLICT (permalink, project_id) matches the partial unique index
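
A quick usage illustration of the repository-level helper, along the lines of the unit test added further down; the field names here are a hypothetical subset of a real insert row.

    # Values are illustrative; _strip_nul_from_row is the helper defined above.
    insert_data = {
        "title": "hello\x00world",
        "content_snippet": "snippet\x00here",
        "project_id": 1,        # non-string values pass through unchanged
        "metadata": None,
    }
    clean = _strip_nul_from_row(insert_data)
    assert clean["title"] == "helloworld"
    assert clean["project_id"] == 1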

src/basic_memory/services/search_service.py

Lines changed: 20 additions & 11 deletions
@@ -58,6 +58,15 @@
 }
 
 
+def _strip_nul(value: str) -> str:
+    """Strip NUL bytes that PostgreSQL text columns cannot store.
+
+    rclone preallocation on virtual filesystems (e.g. Google Drive File Stream)
+    can pad files with \\x00 bytes. See: rclone/rclone#6801
+    """
+    return value.replace("\x00", "")
+
+
 def _mtime_to_datetime(entity: Entity) -> datetime:
     """Convert entity mtime (file modification time) to datetime.
 
@@ -402,7 +411,7 @@ async def index_entity_file(
             id=entity.id,
             entity_id=entity.id,
             type=SearchItemType.ENTITY.value,
-            title=entity.title,
+            title=_strip_nul(entity.title),
             permalink=entity.permalink,  # Required for Postgres NOT NULL constraint
             file_path=entity.file_path,
             metadata={
@@ -461,7 +470,7 @@ async def index_entity_markdown(
         # Store full content for vector embedding quality.
         # The chunker in the vector pipeline splits this into
         # appropriately-sized pieces for embedding.
-        content_snippet = content
+        content_snippet = _strip_nul(content)
 
         if entity.permalink:
             content_stems.extend(self._generate_variants(entity.permalink))
@@ -473,7 +482,7 @@ async def index_entity_markdown(
         if entity_tags:
             content_stems.extend(entity_tags)
 
-        entity_content_stems = "\n".join(p for p in content_stems if p and p.strip())
+        entity_content_stems = _strip_nul("\n".join(p for p in content_stems if p and p.strip()))
 
         # Truncate to stay under Postgres's 8KB index row limit
         if len(entity_content_stems) > MAX_CONTENT_STEMS_SIZE:  # pragma: no cover
@@ -484,7 +493,7 @@ async def index_entity_markdown(
             SearchIndexRow(
                 id=entity.id,
                 type=SearchItemType.ENTITY.value,
-                title=entity.title,
+                title=_strip_nul(entity.title),
                 content_stems=entity_content_stems,
                 content_snippet=content_snippet,
                 permalink=entity.permalink,
@@ -510,8 +519,8 @@ async def index_entity_markdown(
                 seen_permalinks.add(obs_permalink)
 
                 # Index with parent entity's file path since that's where it's defined
-                obs_content_stems = "\n".join(
-                    p for p in self._generate_variants(obs.content) if p and p.strip()
+                obs_content_stems = _strip_nul(
+                    "\n".join(p for p in self._generate_variants(obs.content) if p and p.strip())
                 )
                 # Truncate to stay under Postgres's 8KB index row limit
                 if len(obs_content_stems) > MAX_CONTENT_STEMS_SIZE:  # pragma: no cover
@@ -520,9 +529,9 @@ async def index_entity_markdown(
                     SearchIndexRow(
                         id=obs.id,
                         type=SearchItemType.OBSERVATION.value,
-                        title=f"{obs.category}: {obs.content[:100]}...",
+                        title=_strip_nul(f"{obs.category}: {obs.content[:100]}..."),
                         content_stems=obs_content_stems,
-                        content_snippet=obs.content,
+                        content_snippet=_strip_nul(obs.content),
                         permalink=obs_permalink,
                         file_path=entity.file_path,
                         category=obs.category,
@@ -539,14 +548,14 @@ async def index_entity_markdown(
         # Add relation rows (only outgoing relations defined in this file)
         for rel in entity.outgoing_relations:
             # Create descriptive title showing the relationship
-            relation_title = (
+            relation_title = _strip_nul(
                 f"{rel.from_entity.title} → {rel.to_entity.title}"
                 if rel.to_entity
                 else f"{rel.from_entity.title}"
             )
 
-            rel_content_stems = "\n".join(
-                p for p in self._generate_variants(relation_title) if p and p.strip()
+            rel_content_stems = _strip_nul(
+                "\n".join(p for p in self._generate_variants(relation_title) if p and p.strip())
             )
             rows_to_index.append(
                 SearchIndexRow(

tests/mcp/test_tool_contracts.py

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@
     "move_note": [
         "identifier",
         "destination_path",
+        "destination_folder",
        "is_directory",
        "project",
        "workspace",

tests/repository/test_postgres_search_repository.py

Lines changed: 74 additions & 1 deletion
@@ -11,7 +11,10 @@
 
 from basic_memory import db
 from basic_memory.config import BasicMemoryConfig, DatabaseBackend
-from basic_memory.repository.postgres_search_repository import PostgresSearchRepository
+from basic_memory.repository.postgres_search_repository import (
+    PostgresSearchRepository,
+    _strip_nul_from_row,
+)
 from basic_memory.repository.semantic_errors import SemanticSearchDisabledError
 from basic_memory.repository.search_index_row import SearchIndexRow
 from basic_memory.schemas.search import SearchItemType, SearchRetrievalMode
@@ -237,6 +240,76 @@ async def test_postgres_search_repository_reraises_non_tsquery_db_errors(
     await repo.search(permalink="docs/anything")
 
 
+@pytest.mark.asyncio
+async def test_bulk_index_items_strips_nul_bytes(session_maker, test_project):
+    """NUL bytes in content must not cause CharacterNotInRepertoireError on INSERT."""
+    repo = PostgresSearchRepository(session_maker, project_id=test_project.id)
+    now = datetime.now(timezone.utc)
+    row = SearchIndexRow(
+        project_id=test_project.id,
+        id=99,
+        title="hello\x00world",
+        content_stems="some\x00stems",
+        content_snippet="snippet\x00here",
+        permalink="test/nul-row",
+        file_path="test/nul.md",
+        type="entity",
+        metadata={"entity_type": "note"},
+        created_at=now,
+        updated_at=now,
+    )
+    # Should not raise CharacterNotInRepertoireError
+    await repo.bulk_index_items([row])
+    results = await repo.search(permalink="test/nul-row")
+    assert len(results) == 1
+    assert "\x00" not in (results[0].content_snippet or "")
+    assert "\x00" not in (results[0].title or "")
+
+
+@pytest.mark.asyncio
+async def test_index_item_strips_nul_bytes(session_maker, test_project):
+    """NUL bytes in single-item index_item path must not cause CharacterNotInRepertoireError."""
+    repo = PostgresSearchRepository(session_maker, project_id=test_project.id)
+    now = datetime.now(timezone.utc)
+    row = SearchIndexRow(
+        project_id=test_project.id,
+        id=98,
+        title="single\x00item",
+        content_stems="nul\x00stems",
+        content_snippet="nul\x00snippet",
+        permalink="test/nul-single",
+        file_path="test/nul-single.md",
+        type="entity",
+        metadata={"entity_type": "note"},
+        created_at=now,
+        updated_at=now,
+    )
+    await repo.index_item(row)
+    results = await repo.search(permalink="test/nul-single")
+    assert len(results) == 1
+    assert "\x00" not in (results[0].content_snippet or "")
+    assert "\x00" not in (results[0].title or "")
+
+
+def test_strip_nul_from_row():
+    """_strip_nul_from_row strips NUL bytes from string values, leaves non-strings alone."""
+    row = {
+        "title": "hello\x00world",
+        "content_stems": "some\x00content\x00here",
+        "content_snippet": "clean",
+        "id": 42,
+        "metadata": None,
+        "created_at": datetime(2024, 1, 1),
+    }
+    result = _strip_nul_from_row(row)
+    assert result["title"] == "helloworld"
+    assert result["content_stems"] == "somecontenthere"
+    assert result["content_snippet"] == "clean"
+    assert result["id"] == 42
+    assert result["metadata"] is None
+    assert result["created_at"] == datetime(2024, 1, 1)
+
+
 @pytest.mark.asyncio
 async def test_postgres_semantic_vector_search_returns_ranked_entities(session_maker, test_project):
     """Vector mode ranks entities via pgvector distance."""

tests/services/test_search_service.py

Lines changed: 55 additions & 0 deletions
@@ -8,6 +8,7 @@
 from basic_memory import db
 from basic_memory.repository.search_index_row import SearchIndexRow
 from basic_memory.schemas.search import SearchQuery, SearchItemType, SearchRetrievalMode
+from basic_memory.services.search_service import _strip_nul
 
 
 @pytest.mark.asyncio
@@ -1116,6 +1117,60 @@ async def test_index_entity_multiple_categories_same_content(
     assert len(results) >= 2
 
 
+# Tests for NUL byte stripping
+
+
+def test_strip_nul_removes_nul_bytes():
+    """_strip_nul removes \\x00 from strings."""
+    assert _strip_nul("hello\x00world") == "helloworld"
+    assert _strip_nul("\x00\x00\x00") == ""
+    assert _strip_nul("clean string") == "clean string"
+
+
+@pytest.mark.asyncio
+async def test_index_entity_markdown_strips_nul_bytes(search_service, session_maker, test_project):
+    """Content with NUL bytes should be stripped before indexing.
+
+    rclone preallocation on virtual filesystems (e.g. Google Drive File Stream)
+    can pad files with \\x00 bytes, causing PostgreSQL CharacterNotInRepertoireError.
+
+    Note: NUL bytes arrive via file content read from disk, not from the database.
+    Postgres rejects \\x00 in text columns at the ORM level, so we only test
+    the content path (passed to index_entity) rather than observation creation.
+    """
+    from basic_memory.repository import EntityRepository
+    from basic_memory.repository.search_repository import SearchRepository
+
+    entity_repo = EntityRepository(session_maker, project_id=test_project.id)
+
+    entity_data = {
+        "title": "NUL Test Entity",
+        "entity_type": "note",
+        "entity_metadata": {},
+        "content_type": "text/markdown",
+        "file_path": "test/nul-test.md",
+        "permalink": "test/nul-test",
+        "project_id": test_project.id,
+        "created_at": datetime.now(),
+        "updated_at": datetime.now(),
+    }
+    entity = await entity_repo.create(entity_data)
+    entity = await entity_repo.get_by_permalink("test/nul-test")
+
+    # Index with NUL-containing content (simulates rclone-preallocated file)
+    nul_content = "# NUL Test\x00\x00\nSome content\x00here"
+    await search_service.index_entity(entity, content=nul_content)
+
+    # Verify no NUL bytes in stored search index rows
+    search_repo: SearchRepository = search_service.repository
+    results = await search_repo.search(permalink_match="test/nul-test*")
+    for row in results:
+        if row.content_snippet:
+            assert "\x00" not in row.content_snippet, (
+                f"NUL found in content_snippet for {row.permalink}"
+            )
+
+
 @pytest.mark.asyncio
 async def test_reindex_vectors(search_service, session_maker, test_project):
     """Test that reindex_vectors processes all entities and reports stats."""
