# agents/context_expander.py
-import logging
-import spacy
+import logging  # Added import
from .base import BaseAgent
from utils.chunk_utils import filter_redundant_chunks

-logger = logging.getLogger(__name__)
+logger = logging.getLogger(__name__)  # Get a logger for this module

class ContextExpansionAgent(BaseAgent):
    """Agent responsible for assessing and expanding retrieval context."""

-    def __init__(self):
-        super().__init__()
-        try:
-            self.nlp = spacy.load("en_core_web_sm")
-            logger.info("✅ spaCy model 'en_core_web_sm' loaded successfully.")
-        except OSError:
-            logger.error("❌ Error loading spaCy model 'en_core_web_sm'. Please run: python -m spacy download en_core_web_sm")
-            self.nlp = None

-        self.feedback = defaultdict(list)

-    def assess(self, retrieved_chunks: list[dict], query_analysis: dict) -> dict:
+    def assess(self, retrieved_chunks: list[dict]) -> dict:
        """Assess if retrieved context is sufficient."""
        print("🧐 Assessing context sufficiency...")

        if not retrieved_chunks:
            print("⚠️ Assessment: No chunks retrieved, expansion needed.")
            return {"needs_expansion": True, "reason": "No chunks retrieved"}

        # Check confidence of top chunks
        confidences = [chunk.get("confidence", 0) for chunk in retrieved_chunks]
        avg_confidence = sum(confidences) / len(confidences)
        top_confidence = confidences[0] if confidences else 0

        # Calculate context coverage
        total_text_length = sum(len(chunk["text"]) for chunk in retrieved_chunks)

        # Check if we have entities from query in the chunks
-        keywords = query_analysis.get("keywords", [])
-        entities = query_analysis.get("entities", [])
-        search_terms = set([k.lower() for k in keywords] + [e.lower() for e in entities])
-        logger.debug(f"Checking context relevance. Search terms: {search_terms}")

-        if not search_terms:
-            logger.debug("No keywords/entities found in query analysis, assuming context is relevant.")
-            return {"needs_expansion": False, "reason": "No keywords/entities found"}

-        found_relevant_chunk = False
-        for i, chunk in enumerate(retrieved_chunks):
-            text_lower = chunk.get("text", "").lower()
-            for term in search_terms:
-                if re.search(r'\b' + re.escape(term) + r'\b', text_lower):
-                    logger.debug(f"Found relevant term '{term}' in context chunk {i + 1}.")
-                    found_relevant_chunk = True
-                    break
-            if found_relevant_chunk:
-                break

-        if not found_relevant_chunk:
-            logger.warning("No relevant terms found in any context chunk.")
-            return {"needs_expansion": True, "reason": "No relevant terms found"}

+        # This would be populated from query_analysis
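        # (e.g., a keyword/entity membership check against the chunk text,
        #  along the lines of the regex-based check removed above)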

        # Decision logic
        if top_confidence < 0.4:
            print(f"⚠️ Assessment: Low top confidence ({top_confidence:.2f}), expansion needed.")
            return {"needs_expansion": True, "reason": "Low confidence"}

        if avg_confidence < 0.3:
            print(f"⚠️ Assessment: Low average confidence ({avg_confidence:.2f}), expansion needed.")
            return {"needs_expansion": True, "reason": "Low average confidence"}

        if total_text_length < 500:
            print(f"⚠️ Assessment: Short context ({total_text_length} chars), expansion needed.")
            return {"needs_expansion": True, "reason": "Short context"}

        print(f"✅ Assessment: Context sufficient (Avg conf: {avg_confidence:.2f}, Length: {total_text_length} chars)")
        return {"needs_expansion": False, "reason": "Sufficient confidence and context"}

    def find_contextual_chunks(self, chunks, retriever, max_additional=3):
        """Find chunks that might be contextually related to the given chunks."""
        if not chunks:
            return []

        # Strategy 1: Find adjacent chunks by page numbers
        pages = [chunk["metadata"].get("page", 0) for chunk in chunks if "metadata" in chunk]
        adjacent_pages = set()

        for page in pages:
            if page > 0:
                adjacent_pages.add(page - 1)  # Previous page
                adjacent_pages.add(page + 1)  # Next page

        # Filter out pages we already have
        adjacent_pages = adjacent_pages - set(pages)

        # Find chunks from adjacent pages
        adjacent_chunks = []
        for i, metadata in enumerate(retriever.metadatas):
@@ -104,47 +70,26 @@ def find_contextual_chunks(self, chunks, retriever, max_additional=3):
                    "confidence": 0.4,  # Lower confidence for adjacent chunks
                    "expansion_method": "adjacent_page"
                })

        # Strategy 2: Find chunks from same sections
        sections = [chunk["metadata"].get("section", "") for chunk in chunks if "metadata" in chunk]
        sections = [s for s in sections if s]  # Remove empty sections

        section_chunks = []
        if sections:
            for i, metadata in enumerate(retriever.metadatas):
                if metadata.get("section", "") in sections:
                    # Skip if we already have this chunk
                    if any(retriever.texts[i] == c["text"] for c in chunks + adjacent_chunks):
                        continue

                    section_chunks.append({
                        "text": retriever.texts[i],
                        "metadata": metadata,
                        "confidence": 0.35,  # Lower confidence for section-based chunks
                        "expansion_method": "same_section"
                    })

-        # Strategy 3: Use advanced NLP techniques to find related chunks
-        if self.nlp:
-            for chunk in chunks:
-                doc = self.nlp(chunk["text"].lower())
-                chunk_entities = [ent.text.lower() for ent in doc.ents]
-                chunk_keywords = [token.text.lower() for token in doc if token.dep_ in ("nsubj", "dobj", "pobj")]

-                for i, metadata in enumerate(retriever.metadatas):
-                    text_lower = retriever.texts[i].lower()
-                    doc = self.nlp(text_lower)
-                    entities = [ent.text.lower() for ent in doc.ents]
-                    keywords = [token.text.lower() for token in doc if token.dep_ in ("nsubj", "dobj", "pobj")]

-                    if any(term in entities or term in keywords for term in chunk_entities + chunk_keywords):
-                        section_chunks.append({
-                            "text": retriever.texts[i],
-                            "metadata": metadata,
-                            "confidence": 0.3,  # Lower confidence for NLP-based chunks
-                            "expansion_method": "nlp_related"
-                        })

        # Combine and limit additional chunks
        additional_chunks = (adjacent_chunks + section_chunks)[:max_additional]
        print(f"✅ Found {len(additional_chunks)} additional context chunks.")
@@ -153,98 +98,90 @@ def find_contextual_chunks(self, chunks, retriever, max_additional=3):
    def fuse_chunks(self, chunks):
        """Fuse chunks into a coherent context, managing token limits."""
        print("🧩 Fusing chunks into coherent context...")

        # Sort chunks by confidence
        sorted_chunks = sorted(chunks, key=lambda x: x.get("confidence", 0), reverse=True)

        # Get metadata for organization
        chunk_metadata = []
        for chunk in sorted_chunks:
            page = chunk["metadata"].get("page", "Unknown")
            section = chunk["metadata"].get("section", "Unknown")
            chunk_metadata.append(f"[Page {page}, Section: {section}]")

        # Combine text with metadata headers
        fused_text = ""
        for i, chunk in enumerate(sorted_chunks):
            fused_text += f"\n\n--- Excerpt {i + 1}: {chunk_metadata[i]} ---\n\n"
            fused_text += chunk["text"]

        print(f"✅ Fused {len(sorted_chunks)} chunks into coherent context.")
        return fused_text

    def aggregate_metadata(self, chunks: list[dict]) -> dict:
        """Aggregate metadata from all chunks."""
        print("📊 Aggregating metadata...")

        # Extract page numbers
        pages = set()
        sections = set()

        for chunk in chunks:
            metadata = chunk.get("metadata", {})
            if "page" in metadata and metadata["page"]:
                pages.add(metadata["page"])
            if "section" in metadata and metadata["section"]:
                sections.add(metadata["section"])

        aggregated = {
            "pages": sorted(list(pages)),
            "sections": sorted(list(sections))
        }

        print(f"✅ Metadata aggregated: {len(pages)} pages, {len(sections)} sections")
        return aggregated

-    def _update_feedback(self, query: str, context_chunks: list[dict], relevance: bool):
-        """Update feedback loop with user interaction data."""
-        self.feedback[query].append({
-            "context_chunks": context_chunks,
-            "relevance": relevance
-        })
-        logger.debug(f"Feedback updated for query: '{query}' with relevance: {relevance}")

    def run(self, retrieved_chunks: list[dict], query_analysis: dict, retriever_agent) -> tuple[list[dict], dict]:
        """Assess context, expand if needed, and filter redundant chunks."""
        logger.debug(f"Running context expansion/filtering on {len(retrieved_chunks)} chunks.")
        # 1. Assess if the context is sufficient
-        assessment = self.assess(retrieved_chunks, query_analysis)
+        assessment = self.assess(retrieved_chunks)

        final_chunks = retrieved_chunks.copy()

        # 2. Expand context if needed
        if assessment["needs_expansion"]:
            print(f"🔍 Expanding context due to: {assessment['reason']}")

            # If complex query, consider processing sub-queries separately
            if query_analysis.get("needs_decomposition", False):
                print("📋 Complex query detected, expanding context for multiple aspects.")
                # In a full implementation, we might retrieve for each sub-query
                # For now, just get related chunks to the current results

            # Find related chunks
            additional_chunks = self.find_contextual_chunks(
                retrieved_chunks,
                retriever_agent
            )

            # Combine original and additional chunks
            expanded_chunks = retrieved_chunks + additional_chunks

            # 3. Filter redundant chunks using the utility function
            final_chunks = filter_redundant_chunks(expanded_chunks)

            print(f"✅ Context expansion complete: {len(final_chunks)} chunks after filtering.")
        else:
            print("✅ Original context is sufficient, no expansion needed.")
            # Still filter original chunks for redundancy
            final_chunks = filter_redundant_chunks(retrieved_chunks)

        # 4. Aggregate metadata from all included chunks
        aggregated_metadata = self.aggregate_metadata(final_chunks)

        logger.debug(f"Context expansion complete. Final chunks: {len(final_chunks)}")
        # Note: We don't actually fuse the chunks here - that will be handled by the generator
        # when it builds its prompt, using the separate chunks we provide

        return final_chunks, aggregated_metadata
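A minimal usage sketch, assuming BaseAgent's constructor takes no required arguments, that query_analysis comes from an upstream query-analysis agent, and that the retriever agent exposes the texts and metadatas lists referenced in find_contextual_chunks (names are illustrative, not part of this commit):

    expander = ContextExpansionAgent()
    final_chunks, metadata = expander.run(retrieved_chunks, query_analysis, retriever_agent)
    # final_chunks: de-duplicated (and possibly expanded) chunk dicts
    # metadata: {"pages": [...], "sections": [...]}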