Skip to content

Commit 4ce2198

Browse files
phernandezclaude
and committed
fix: use upsert to prevent IntegrityError during parallel search indexing
Replace delete-then-insert pattern with INSERT ... ON CONFLICT for PostgreSQL
search index operations. This fixes race conditions where parallel entity
indexing could cause UniqueViolationError on the
uix_search_index_permalink_project constraint.

Changes:
- Add index_item() override in PostgresSearchRepository with upsert
- Update bulk_index_items() to use ON CONFLICT (permalink, project_id)
- Add CREATE_POSTGRES_SEARCH_INDEX_PERMALINK DDL for test fixtures
- Add tests for upsert behavior on duplicate permalinks

Technical note: Use column-based ON CONFLICT syntax instead of ON CONSTRAINT
(which only works for table constraints, not indexes).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Signed-off-by: phernandez <paul@basicmachines.co>
1 parent eb7fbaf commit 4ce2198

5 files changed

Lines changed: 200 additions & 11 deletions

File tree

src/basic_memory/models/search.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,15 @@
4848
CREATE INDEX IF NOT EXISTS idx_search_index_metadata_gin ON search_index USING gin(metadata jsonb_path_ops)
4949
""")
5050

51+
# Partial unique index on (permalink, project_id) for non-null permalinks
52+
# This prevents duplicate permalinks per project and is used by upsert operations
53+
# in PostgresSearchRepository to handle race conditions during parallel indexing
54+
CREATE_POSTGRES_SEARCH_INDEX_PERMALINK = DDL("""
55+
CREATE UNIQUE INDEX IF NOT EXISTS uix_search_index_permalink_project
56+
ON search_index (permalink, project_id)
57+
WHERE permalink IS NOT NULL
58+
""")
59+
5160
# Define FTS5 virtual table creation for SQLite only
5261
# This DDL is executed separately for SQLite databases
5362
CREATE_SEARCH_INDEX = DDL("""

src/basic_memory/repository/postgres_search_repository.py

Lines changed: 77 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,11 @@ class PostgresSearchRepository(SearchRepositoryBase):
2424
- GIN indexes for performance
2525
- ts_rank() function for relevance scoring
2626
- JSONB containment operators for metadata search
27+
28+
Note: This implementation uses UPSERT patterns (INSERT ... ON CONFLICT) instead of
29+
delete-then-insert to handle race conditions during parallel entity indexing.
30+
The partial unique index uix_search_index_permalink_project prevents duplicate
31+
permalinks per project.
2732
"""
2833

2934
async def init_search_index(self):
@@ -41,6 +46,63 @@ async def init_search_index(self):
4146
# - CREATE INDEX USING GIN on metadata jsonb_path_ops
4247
pass
4348

49+
async def index_item(self, search_index_row: SearchIndexRow) -> None:
50+
"""Index or update a single item using UPSERT.
51+
52+
Uses INSERT ... ON CONFLICT to handle race conditions during parallel
53+
entity indexing. The partial unique index uix_search_index_permalink_project
54+
on (permalink, project_id) WHERE permalink IS NOT NULL prevents duplicate
55+
permalinks.
56+
57+
For rows with non-null permalinks (entities), conflicts are resolved by
58+
updating the existing row. For rows with null permalinks, no conflict
59+
occurs on this index.
60+
"""
61+
async with db.scoped_session(self.session_maker) as session:
62+
# Serialize JSON for raw SQL
63+
insert_data = search_index_row.to_insert(serialize_json=True)
64+
insert_data["project_id"] = self.project_id
65+
66+
# Use upsert to handle race conditions during parallel indexing
67+
# ON CONFLICT (permalink, project_id) matches the partial unique index
68+
# uix_search_index_permalink_project WHERE permalink IS NOT NULL
69+
# For rows with NULL permalinks, no conflict occurs (partial index doesn't apply)
70+
await session.execute(
71+
text("""
72+
INSERT INTO search_index (
73+
id, title, content_stems, content_snippet, permalink, file_path, type, metadata,
74+
from_id, to_id, relation_type,
75+
entity_id, category,
76+
created_at, updated_at,
77+
project_id
78+
) VALUES (
79+
:id, :title, :content_stems, :content_snippet, :permalink, :file_path, :type, :metadata,
80+
:from_id, :to_id, :relation_type,
81+
:entity_id, :category,
82+
:created_at, :updated_at,
83+
:project_id
84+
)
85+
ON CONFLICT (permalink, project_id) WHERE permalink IS NOT NULL DO UPDATE SET
86+
id = EXCLUDED.id,
87+
title = EXCLUDED.title,
88+
content_stems = EXCLUDED.content_stems,
89+
content_snippet = EXCLUDED.content_snippet,
90+
file_path = EXCLUDED.file_path,
91+
type = EXCLUDED.type,
92+
metadata = EXCLUDED.metadata,
93+
from_id = EXCLUDED.from_id,
94+
to_id = EXCLUDED.to_id,
95+
relation_type = EXCLUDED.relation_type,
96+
entity_id = EXCLUDED.entity_id,
97+
category = EXCLUDED.category,
98+
created_at = EXCLUDED.created_at,
99+
updated_at = EXCLUDED.updated_at
100+
"""),
101+
insert_data,
102+
)
103+
logger.debug(f"indexed row {search_index_row}")
104+
await session.commit()
105+
44106
def _prepare_search_term(self, term: str, is_prefix: bool = True) -> str:
45107
"""Prepare a search term for tsquery format.
46108
@@ -316,10 +378,14 @@ async def search(
316378
async def bulk_index_items(self, search_index_rows: List[SearchIndexRow]) -> None:
317379
"""Index multiple items in a single batch operation using UPSERT.
318380
319-
Uses INSERT ... ON CONFLICT DO UPDATE to handle re-indexing of existing
320-
entities (e.g., during forward reference resolution) without requiring
321-
a separate delete operation. This eliminates race conditions between
322-
delete and insert operations in separate transactions.
381+
Uses INSERT ... ON CONFLICT to handle race conditions during parallel
382+
entity indexing. The partial unique index uix_search_index_permalink_project
383+
on (permalink, project_id) WHERE permalink IS NOT NULL prevents duplicate
384+
permalinks.
385+
386+
For rows with non-null permalinks (entities), conflicts are resolved by
387+
updating the existing row. For rows with null permalinks (observations,
388+
relations), the partial index doesn't apply and they are inserted directly.
323389
324390
Args:
325391
search_index_rows: List of SearchIndexRow objects to index
@@ -338,11 +404,10 @@ async def bulk_index_items(self, search_index_rows: List[SearchIndexRow]) -> Non
338404
insert_data["project_id"] = self.project_id
339405
insert_data_list.append(insert_data)
340406

341-
# Use UPSERT (INSERT ... ON CONFLICT) to handle re-indexing
342-
# Primary key is (id, type, project_id)
343-
# This handles race conditions during forward reference resolution
344-
# where an entity might be re-indexed before the delete commits
345-
# Syntax works for both SQLite 3.24+ and PostgreSQL
407+
# Use upsert to handle race conditions during parallel indexing
408+
# ON CONFLICT (permalink, project_id) matches the partial unique index
409+
# uix_search_index_permalink_project WHERE permalink IS NOT NULL
410+
# For rows with NULL permalinks (observations, relations), no conflict occurs
346411
await session.execute(
347412
text("""
348413
INSERT INTO search_index (
@@ -358,12 +423,13 @@ async def bulk_index_items(self, search_index_rows: List[SearchIndexRow]) -> Non
358423
:created_at, :updated_at,
359424
:project_id
360425
)
361-
ON CONFLICT (id, type, project_id) DO UPDATE SET
426+
ON CONFLICT (permalink, project_id) WHERE permalink IS NOT NULL DO UPDATE SET
427+
id = EXCLUDED.id,
362428
title = EXCLUDED.title,
363429
content_stems = EXCLUDED.content_stems,
364430
content_snippet = EXCLUDED.content_snippet,
365-
permalink = EXCLUDED.permalink,
366431
file_path = EXCLUDED.file_path,
432+
type = EXCLUDED.type,
367433
metadata = EXCLUDED.metadata,
368434
from_id = EXCLUDED.from_id,
369435
to_id = EXCLUDED.to_id,

test-int/conftest.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ async def engine_factory(
125125
CREATE_POSTGRES_SEARCH_INDEX_TABLE,
126126
CREATE_POSTGRES_SEARCH_INDEX_FTS,
127127
CREATE_POSTGRES_SEARCH_INDEX_METADATA,
128+
CREATE_POSTGRES_SEARCH_INDEX_PERMALINK,
128129
)
129130
from basic_memory import db
130131

@@ -160,6 +161,7 @@ async def engine_factory(
160161
await conn.execute(CREATE_POSTGRES_SEARCH_INDEX_TABLE)
161162
await conn.execute(CREATE_POSTGRES_SEARCH_INDEX_FTS)
162163
await conn.execute(CREATE_POSTGRES_SEARCH_INDEX_METADATA)
164+
await conn.execute(CREATE_POSTGRES_SEARCH_INDEX_PERMALINK)
163165

164166
yield engine, session_maker
165167

tests/conftest.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,7 @@ async def engine_factory(
210210
CREATE_POSTGRES_SEARCH_INDEX_TABLE,
211211
CREATE_POSTGRES_SEARCH_INDEX_FTS,
212212
CREATE_POSTGRES_SEARCH_INDEX_METADATA,
213+
CREATE_POSTGRES_SEARCH_INDEX_PERMALINK,
213214
)
214215

215216
# Drop and recreate all tables for test isolation
@@ -223,6 +224,7 @@ async def engine_factory(
223224
await conn.execute(CREATE_POSTGRES_SEARCH_INDEX_TABLE)
224225
await conn.execute(CREATE_POSTGRES_SEARCH_INDEX_FTS)
225226
await conn.execute(CREATE_POSTGRES_SEARCH_INDEX_METADATA)
227+
await conn.execute(CREATE_POSTGRES_SEARCH_INDEX_PERMALINK)
226228

227229
yield engine, session_maker
228230

tests/repository/test_search_repository.py

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,116 @@ async def test_index_item(search_repository, search_entity):
138138
assert results[0].project_id == search_repository.project_id
139139

140140

141+
@pytest.mark.asyncio
142+
async def test_index_item_upsert_on_duplicate_permalink(search_repository, search_entity):
143+
"""Test that indexing the same permalink twice uses upsert instead of failing.
144+
145+
This tests the fix for the race condition where parallel entity indexing
146+
could cause IntegrityError on the unique permalink constraint.
147+
"""
148+
# First insert
149+
search_row1 = SearchIndexRow(
150+
id=search_entity.id,
151+
type=SearchItemType.ENTITY.value,
152+
title="Original Title",
153+
content_stems="original content",
154+
content_snippet="Original content snippet",
155+
permalink=search_entity.permalink,
156+
file_path=search_entity.file_path,
157+
entity_id=search_entity.id,
158+
metadata={"entity_type": search_entity.entity_type},
159+
created_at=search_entity.created_at,
160+
updated_at=search_entity.updated_at,
161+
project_id=search_repository.project_id,
162+
)
163+
await search_repository.index_item(search_row1)
164+
165+
# Verify first insert worked
166+
results = await search_repository.search(search_text="original")
167+
assert len(results) == 1
168+
assert results[0].title == "Original Title"
169+
170+
# Second insert with same permalink but different content (simulates race condition)
171+
# This should NOT raise IntegrityError - it should upsert (update) instead
172+
search_row2 = SearchIndexRow(
173+
id=search_entity.id,
174+
type=SearchItemType.ENTITY.value,
175+
title="Updated Title",
176+
content_stems="updated content",
177+
content_snippet="Updated content snippet",
178+
permalink=search_entity.permalink, # Same permalink!
179+
file_path=search_entity.file_path,
180+
entity_id=search_entity.id,
181+
metadata={"entity_type": search_entity.entity_type},
182+
created_at=search_entity.created_at,
183+
updated_at=search_entity.updated_at,
184+
project_id=search_repository.project_id,
185+
)
186+
# This should succeed without raising IntegrityError
187+
await search_repository.index_item(search_row2)
188+
189+
# Verify the row was updated, not duplicated
190+
results_after = await search_repository.search(search_text="updated")
191+
assert len(results_after) == 1
192+
assert results_after[0].title == "Updated Title"
193+
194+
# Verify old content is gone (was replaced)
195+
results_old = await search_repository.search(search_text="original")
196+
assert len(results_old) == 0
197+
198+
199+
@pytest.mark.asyncio
200+
async def test_bulk_index_items_upsert_on_duplicate_permalink(search_repository, search_entity):
201+
"""Test that bulk_index_items uses upsert for duplicate permalinks.
202+
203+
This tests the fix for race conditions during bulk entity indexing.
204+
"""
205+
# First bulk insert
206+
search_row1 = SearchIndexRow(
207+
id=search_entity.id,
208+
type=SearchItemType.ENTITY.value,
209+
title="Bulk Original Title",
210+
content_stems="bulk original content",
211+
content_snippet="Bulk original content snippet",
212+
permalink=search_entity.permalink,
213+
file_path=search_entity.file_path,
214+
entity_id=search_entity.id,
215+
metadata={"entity_type": search_entity.entity_type},
216+
created_at=search_entity.created_at,
217+
updated_at=search_entity.updated_at,
218+
project_id=search_repository.project_id,
219+
)
220+
await search_repository.bulk_index_items([search_row1])
221+
222+
# Verify first insert worked
223+
results = await search_repository.search(search_text="bulk original")
224+
assert len(results) == 1
225+
assert results[0].title == "Bulk Original Title"
226+
227+
# Second bulk insert with same permalink (simulates race condition)
228+
search_row2 = SearchIndexRow(
229+
id=search_entity.id,
230+
type=SearchItemType.ENTITY.value,
231+
title="Bulk Updated Title",
232+
content_stems="bulk updated content",
233+
content_snippet="Bulk updated content snippet",
234+
permalink=search_entity.permalink, # Same permalink!
235+
file_path=search_entity.file_path,
236+
entity_id=search_entity.id,
237+
metadata={"entity_type": search_entity.entity_type},
238+
created_at=search_entity.created_at,
239+
updated_at=search_entity.updated_at,
240+
project_id=search_repository.project_id,
241+
)
242+
# This should succeed without raising IntegrityError
243+
await search_repository.bulk_index_items([search_row2])
244+
245+
# Verify the row was updated
246+
results_after = await search_repository.search(search_text="bulk updated")
247+
assert len(results_after) == 1
248+
assert results_after[0].title == "Bulk Updated Title"
249+
250+
141251
@pytest.mark.asyncio
142252
async def test_project_isolation(
143253
search_repository, second_project_repository, search_entity, second_entity

0 commit comments

Comments (0)