Skip to content

Commit 546e3cd

Browse files
phernandez, claude[bot], claude
authored
fix: handle Boolean search syntax with hyphenated terms (#180)
Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com> Co-authored-by: Paul Hernandez <phernandez@users.noreply.github.com> Co-authored-by: Claude <noreply@anthropic.com>
1 parent de4737c commit 546e3cd

2 files changed

Lines changed: 143 additions & 26 deletions

File tree

src/basic_memory/repository/search_repository.py

Lines changed: 128 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Repository for search operations."""
22

33
import json
4+
import re
45
import time
56
from dataclasses import dataclass
67
from datetime import datetime
@@ -120,23 +121,113 @@ async def init_search_index(self):
120121
logger.error(f"Error initializing search index: {e}")
121122
raise e
122123

123-
def _prepare_search_term(self, term: str, is_prefix: bool = True) -> str:
124-
"""Prepare a search term for FTS5 query.
125-
124+
def _prepare_boolean_query(self, query: str) -> str:
    """Prepare a Boolean query by quoting individual terms while preserving operators.

    The query is split on the operators AND, OR and NOT. An operator is only
    recognised when it stands alone: it must be delimited on each side by
    whitespace, a parenthesis, or the start/end of the query. Operator words
    embedded in a larger term (for example "rock-AND-roll" or "x.OR.y") stay
    part of that term instead of tearing it apart.

    Every non-operator part is then prepared on its own: parts containing
    parentheses go through ``_prepare_parenthetical_term`` and all other
    parts through ``_prepare_single_term`` with prefix matching disabled.

    Note: the split does not look at double quotes, so an operator word
    inside a user-supplied quoted phrase is still treated as an operator.

    Args:
        query: A Boolean query like "tier1-test AND unicode" or "(hello OR world) NOT test"

    Returns:
        The operators and prepared terms joined by single spaces.
    """
    # Lookarounds are used instead of \b because \b also matches next to
    # punctuation such as "-" or ".", which would split hyphenated terms.
    operator_pattern = r"(?<![^\s()])(AND|OR|NOT)(?![^\s()])"
    operators = ("AND", "OR", "NOT")

    processed_parts = []
    # The capturing group makes re.split keep the operators in the result.
    for part in re.split(operator_pattern, query):
        part = part.strip()
        if not part:
            continue

        if part in operators:
            # Boolean operators are passed through untouched.
            processed_parts.append(part)
        elif "(" in part or ")" in part:
            # Parentheses are grouping syntax and must survive unquoted.
            processed_parts.append(self._prepare_parenthetical_term(part))
        else:
            # Plain search term - no prefix wildcard inside Boolean queries.
            processed_parts.append(self._prepare_single_term(part, is_prefix=False))

    return " ".join(processed_parts)
160+
161+
def _prepare_parenthetical_term(self, term: str) -> str:
    """Quote the text inside a term while leaving its parentheses untouched.

    Args:
        term: A term that may contain parentheses like "(hello" or "world)" or "(hello OR world)"

    Returns:
        The term with every parenthesis kept as-is and each stretch of text
        between parentheses stripped and, when ``_needs_quoting`` says so,
        wrapped in double quotes (embedded double quotes are doubled).
    """
    pieces = []
    # Splitting on a capturing group yields the text runs and the
    # individual parentheses in their original order.
    for segment in re.split(r"([()])", term):
        if segment in ("(", ")"):
            # Grouping syntax is copied through verbatim.
            pieces.append(segment)
            continue

        content = segment.strip()
        if not content:
            # Nothing but whitespace between two parentheses.
            continue

        if self._needs_quoting(content):
            escaped_content = content.replace('"', '""')
            pieces.append(f'"{escaped_content}"')
        else:
            # Simple words are left unquoted.
            pieces.append(content)

    return "".join(pieces)
196+
197+
def _needs_quoting(self, term: str) -> bool:
198+
"""Check if a term needs to be quoted for FTS5 safety.
199+
200+
Args:
201+
term: The term to check
202+
203+
Returns:
204+
True if the term should be quoted
205+
"""
206+
if not term or not term.strip():
207+
return False
208+
209+
# Characters that indicate we should quote (excluding parentheses which are valid syntax)
210+
needs_quoting_chars = [" ", ".", ":", ";", ",", "<", ">", "?", "/", "-", "'", '"',
211+
"[", "]", "{", "}", "+", "!", "@", "#", "$", "%", "^", "&",
212+
"=", "|", "\\", "~", "`"]
213+
214+
return any(c in term for c in needs_quoting_chars)
215+
216+
def _prepare_single_term(self, term: str, is_prefix: bool = True) -> str:
217+
"""Prepare a single search term (no Boolean operators).
218+
219+
Args:
220+
term: A single search term
128221
is_prefix: Whether to add prefix search capability (* suffix)
129-
130-
For FTS5:
131-
- Boolean operators (AND, OR, NOT) are preserved for complex queries
132-
- Terms with FTS5 special characters are quoted to prevent syntax errors
133-
- Simple terms get prefix wildcards for better matching
222+
223+
Returns:
224+
A properly formatted single term
134225
"""
135-
# Check for explicit boolean operators - if present, return the term as is
136-
boolean_operators = [" AND ", " OR ", " NOT "]
137-
if any(op in f" {term} " for op in boolean_operators):
226+
if not term or not term.strip():
138227
return term
139-
228+
229+
term = term.strip()
230+
140231
# Check if term is already a proper wildcard pattern (alphanumeric + *)
141232
# e.g., "hello*", "test*world" - these should be left alone
142233
if "*" in term and all(c.isalnum() or c in "*_-" for c in term):
@@ -218,6 +309,26 @@ def _prepare_search_term(self, term: str, is_prefix: bool = True) -> str:
218309

219310
return term
220311

312+
def _prepare_search_term(self, term: str, is_prefix: bool = True) -> str:
    """Turn user search text into an FTS5 query string.

    Args:
        term: The search term to prepare
        is_prefix: Whether to add prefix search capability (* suffix);
            only forwarded for non-Boolean input

    Returns:
        The result of ``_prepare_boolean_query`` when the text contains a
        space-delimited AND, OR or NOT, otherwise the result of
        ``_prepare_single_term``.
    """
    # Padding with spaces lets an operator at the very start or end of the
    # text match the space-delimited markers below.
    padded = f" {term} "
    for marker in (" AND ", " OR ", " NOT "):
        if marker in padded:
            # Explicit Boolean operator found - process as Boolean query.
            return self._prepare_boolean_query(term)

    # No operator present: fall back to single term preparation.
    return self._prepare_single_term(term, is_prefix)
331+
221332
async def search(
222333
self,
223334
search_text: Optional[str] = None,
@@ -242,19 +353,10 @@ async def search(
242353
# For wildcard searches, don't add any text conditions - return all results
243354
pass
244355
else:
245-
# Check for explicit boolean operators - only detect them in proper boolean contexts
246-
has_boolean = any(op in f" {search_text} " for op in [" AND ", " OR ", " NOT "])
247-
248-
if has_boolean:
249-
# If boolean operators are present, use the raw query
250-
# No need to prepare it, FTS5 will understand the operators
251-
params["text"] = search_text
252-
conditions.append("(title MATCH :text OR content_stems MATCH :text)")
253-
else:
254-
# Standard search with term preparation
255-
processed_text = self._prepare_search_term(search_text.strip())
256-
params["text"] = processed_text
257-
conditions.append("(title MATCH :text OR content_stems MATCH :text)")
356+
# Use _prepare_search_term to handle both Boolean and non-Boolean queries
357+
processed_text = self._prepare_search_term(search_text.strip())
358+
params["text"] = processed_text
359+
conditions.append("(title MATCH :text OR content_stems MATCH :text)")
258360

259361
# Handle title match search
260362
if title:

tests/repository/test_search_repository.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,21 @@ def test_boolean_operators_preserved(self, search_repository):
329329
== "(hello AND world) OR test"
330330
)
331331

332+
def test_hyphenated_terms_with_boolean_operators(self, search_repository):
    """Hyphenated terms with Boolean operators should be properly quoted."""
    # Each pair is (raw query, expected prepared query).
    expectations = [
        # The specific case from the GitHub issue
        ("tier1-test AND unicode", '"tier1-test" AND unicode'),
        # Other hyphenated Boolean combinations
        ("multi-word OR single", '"multi-word" OR single'),
        ("well-formed NOT badly-formed", '"well-formed" NOT "badly-formed"'),
        ("test-case AND (hello OR world)", '"test-case" AND (hello OR world)'),
        # Mixed special characters with Boolean operators
        ("config.json AND test-file", '"config.json" AND "test-file"'),
        ("C++ OR python-script", '"C++" OR "python-script"'),
    ]

    for raw_query, expected in expectations:
        assert search_repository._prepare_search_term(raw_query) == expected
346+
332347
def test_programming_terms_should_work(self, search_repository):
333348
"""Programming-related terms with special chars should be searchable."""
334349
# These should be quoted to handle special characters safely

0 commit comments

Comments
 (0)