11"""Repository for search operations."""
22
33import json
4+ import re
45import time
56from dataclasses import dataclass
67from datetime import datetime
@@ -120,23 +121,113 @@ async def init_search_index(self):
120121 logger .error (f"Error initializing search index: { e } " )
121122 raise e
122123
123- def _prepare_search_term (self , term : str , is_prefix : bool = True ) -> str :
124- """Prepare a search term for FTS5 query .
125-
124+ def _prepare_boolean_query (self , query : str ) -> str :
125+ """Prepare a Boolean query by quoting individual terms while preserving operators .
126+
126127 Args:
127- term: The search term to prepare
128+ query: A Boolean query like "tier1-test AND unicode" or "(hello OR world) NOT test"
129+
130+ Returns:
131+ A properly formatted Boolean query with quoted terms that need quoting
132+ """
133+ # Define Boolean operators and their boundaries
134+ boolean_pattern = r'(\bAND\b|\bOR\b|\bNOT\b)'
135+
136+ # Split the query by Boolean operators, keeping the operators
137+ parts = re .split (boolean_pattern , query )
138+
139+ processed_parts = []
140+ for part in parts :
141+ part = part .strip ()
142+ if not part :
143+ continue
144+
145+ # If it's a Boolean operator, keep it as is
146+ if part in ['AND' , 'OR' , 'NOT' ]:
147+ processed_parts .append (part )
148+ else :
149+ # Handle parentheses specially - they should be preserved for grouping
150+ if '(' in part or ')' in part :
151+ # Parse parenthetical expressions carefully
152+ processed_part = self ._prepare_parenthetical_term (part )
153+ processed_parts .append (processed_part )
154+ else :
155+ # This is a search term - for Boolean queries, don't add prefix wildcards
156+ prepared_term = self ._prepare_single_term (part , is_prefix = False )
157+ processed_parts .append (prepared_term )
158+
159+ return " " .join (processed_parts )
160+
161+ def _prepare_parenthetical_term (self , term : str ) -> str :
162+ """Prepare a term that contains parentheses, preserving the parentheses for grouping.
163+
164+ Args:
165+ term: A term that may contain parentheses like "(hello" or "world)" or "(hello OR world)"
166+
167+ Returns:
168+ A properly formatted term with parentheses preserved
169+ """
170+ # Handle terms that start/end with parentheses but may contain quotable content
171+ result = ""
172+ i = 0
173+ while i < len (term ):
174+ if term [i ] in '()' :
175+ # Preserve parentheses as-is
176+ result += term [i ]
177+ i += 1
178+ else :
179+ # Find the next parenthesis or end of string
180+ start = i
181+ while i < len (term ) and term [i ] not in '()' :
182+ i += 1
183+
184+ # Extract the content between parentheses
185+ content = term [start :i ].strip ()
186+ if content :
187+ # Only quote if it actually needs quoting (has hyphens, special chars, etc)
188+ # but don't quote if it's just simple words
189+ if self ._needs_quoting (content ):
190+ escaped_content = content .replace ('"' , '""' )
191+ result += f'"{ escaped_content } "'
192+ else :
193+ result += content
194+
195+ return result
196+
197+ def _needs_quoting (self , term : str ) -> bool :
198+ """Check if a term needs to be quoted for FTS5 safety.
199+
200+ Args:
201+ term: The term to check
202+
203+ Returns:
204+ True if the term should be quoted
205+ """
206+ if not term or not term .strip ():
207+ return False
208+
209+ # Characters that indicate we should quote (excluding parentheses which are valid syntax)
210+ needs_quoting_chars = [" " , "." , ":" , ";" , "," , "<" , ">" , "?" , "/" , "-" , "'" , '"' ,
211+ "[" , "]" , "{" , "}" , "+" , "!" , "@" , "#" , "$" , "%" , "^" , "&" ,
212+ "=" , "|" , "\\ " , "~" , "`" ]
213+
214+ return any (c in term for c in needs_quoting_chars )
215+
216+ def _prepare_single_term (self , term : str , is_prefix : bool = True ) -> str :
217+ """Prepare a single search term (no Boolean operators).
218+
219+ Args:
220+ term: A single search term
128221 is_prefix: Whether to add prefix search capability (* suffix)
129-
130- For FTS5:
131- - Boolean operators (AND, OR, NOT) are preserved for complex queries
132- - Terms with FTS5 special characters are quoted to prevent syntax errors
133- - Simple terms get prefix wildcards for better matching
222+
223+ Returns:
224+ A properly formatted single term
134225 """
135- # Check for explicit boolean operators - if present, return the term as is
136- boolean_operators = [" AND " , " OR " , " NOT " ]
137- if any (op in f" { term } " for op in boolean_operators ):
226+ if not term or not term .strip ():
138227 return term
139-
228+
229+ term = term .strip ()
230+
140231 # Check if term is already a proper wildcard pattern (alphanumeric + *)
141232 # e.g., "hello*", "test*world" - these should be left alone
142233 if "*" in term and all (c .isalnum () or c in "*_-" for c in term ):
@@ -218,6 +309,26 @@ def _prepare_search_term(self, term: str, is_prefix: bool = True) -> str:
218309
219310 return term
220311
312+ def _prepare_search_term (self , term : str , is_prefix : bool = True ) -> str :
313+ """Prepare a search term for FTS5 query.
314+
315+ Args:
316+ term: The search term to prepare
317+ is_prefix: Whether to add prefix search capability (* suffix)
318+
319+ For FTS5:
320+ - Boolean operators (AND, OR, NOT) are preserved for complex queries
321+ - Terms with FTS5 special characters are quoted to prevent syntax errors
322+ - Simple terms get prefix wildcards for better matching
323+ """
324+ # Check for explicit boolean operators - if present, process as Boolean query
325+ boolean_operators = [" AND " , " OR " , " NOT " ]
326+ if any (op in f" { term } " for op in boolean_operators ):
327+ return self ._prepare_boolean_query (term )
328+
329+ # For non-Boolean queries, use the single term preparation logic
330+ return self ._prepare_single_term (term , is_prefix )
331+
221332 async def search (
222333 self ,
223334 search_text : Optional [str ] = None ,
@@ -242,19 +353,10 @@ async def search(
242353 # For wildcard searches, don't add any text conditions - return all results
243354 pass
244355 else :
245- # Check for explicit boolean operators - only detect them in proper boolean contexts
246- has_boolean = any (op in f" { search_text } " for op in [" AND " , " OR " , " NOT " ])
247-
248- if has_boolean :
249- # If boolean operators are present, use the raw query
250- # No need to prepare it, FTS5 will understand the operators
251- params ["text" ] = search_text
252- conditions .append ("(title MATCH :text OR content_stems MATCH :text)" )
253- else :
254- # Standard search with term preparation
255- processed_text = self ._prepare_search_term (search_text .strip ())
256- params ["text" ] = processed_text
257- conditions .append ("(title MATCH :text OR content_stems MATCH :text)" )
356+ # Use _prepare_search_term to handle both Boolean and non-Boolean queries
357+ processed_text = self ._prepare_search_term (search_text .strip ())
358+ params ["text" ] = processed_text
359+ conditions .append ("(title MATCH :text OR content_stems MATCH :text)" )
258360
259361 # Handle title match search
260362 if title :
0 commit comments