Commit 2d612c5

Revert "Merge pull request #10 from Developers-RCCS/Ncode01/enhance-answer-generation-1"
This reverts commit ddde36b, reversing changes made to 9ec8193.
1 parent f3a3690 commit 2d612c5

3 files changed: 41 additions, 228 deletions


agents/context_expander.py: 40 additions, 103 deletions
@@ -1,99 +1,65 @@
 # agents/context_expander.py
-import logging
-import spacy
+import logging  # Added import
 from .base import BaseAgent
 from utils.chunk_utils import filter_redundant_chunks
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger(__name__)  # Get a logger for this module
 
 
 class ContextExpansionAgent(BaseAgent):
     """Agent responsible for assessing and expanding retrieval context."""
 
-    def __init__(self):
-        super().__init__()
-        try:
-            self.nlp = spacy.load("en_core_web_sm")
-            logger.info("✅ spaCy model 'en_core_web_sm' loaded successfully.")
-        except OSError:
-            logger.error("❌ Error loading spaCy model 'en_core_web_sm'. Please run: python -m spacy download en_core_web_sm")
-            self.nlp = None
-
-        self.feedback = defaultdict(list)
-
-    def assess(self, retrieved_chunks: list[dict], query_analysis: dict) -> dict:
+    def assess(self, retrieved_chunks: list[dict]) -> dict:
         """Assess if retrieved context is sufficient."""
         print("🧐 Assessing context sufficiency...")
-
+
         if not retrieved_chunks:
             print("⚠️ Assessment: No chunks retrieved, expansion needed.")
             return {"needs_expansion": True, "reason": "No chunks retrieved"}
-
+
         # Check confidence of top chunks
         confidences = [chunk.get("confidence", 0) for chunk in retrieved_chunks]
         avg_confidence = sum(confidences) / len(confidences)
         top_confidence = confidences[0] if confidences else 0
-
+
         # Calculate context coverage
         total_text_length = sum(len(chunk["text"]) for chunk in retrieved_chunks)
-
+
         # Check if we have entities from query in the chunks
-        keywords = query_analysis.get("keywords", [])
-        entities = query_analysis.get("entities", [])
-        search_terms = set([k.lower() for k in keywords] + [e.lower() for e in entities])
-        logger.debug(f"Checking context relevance. Search terms: {search_terms}")
-
-        if not search_terms:
-            logger.debug("No keywords/entities found in query analysis, assuming context is relevant.")
-            return {"needs_expansion": False, "reason": "No keywords/entities found"}
-
-        found_relevant_chunk = False
-        for i, chunk in enumerate(retrieved_chunks):
-            text_lower = chunk.get("text", "").lower()
-            for term in search_terms:
-                if re.search(r'\b' + re.escape(term) + r'\b', text_lower):
-                    logger.debug(f"Found relevant term '{term}' in context chunk {i+1}.")
-                    found_relevant_chunk = True
-                    break
-            if found_relevant_chunk:
-                break
-
-        if not found_relevant_chunk:
-            logger.warning("No relevant terms found in any context chunk.")
-            return {"needs_expansion": True, "reason": "No relevant terms found"}
-
+        # This would be populated from query_analysis
+
         # Decision logic
         if top_confidence < 0.4:
             print(f"⚠️ Assessment: Low top confidence ({top_confidence:.2f}), expansion needed.")
             return {"needs_expansion": True, "reason": "Low confidence"}
-
+
         if avg_confidence < 0.3:
             print(f"⚠️ Assessment: Low average confidence ({avg_confidence:.2f}), expansion needed.")
            return {"needs_expansion": True, "reason": "Low average confidence"}
-
+
         if total_text_length < 500:
             print(f"⚠️ Assessment: Short context ({total_text_length} chars), expansion needed.")
             return {"needs_expansion": True, "reason": "Short context"}
-
+
         print(f"✅ Assessment: Context sufficient (Avg conf: {avg_confidence:.2f}, Length: {total_text_length} chars)")
         return {"needs_expansion": False, "reason": "Sufficient confidence and context"}
 
     def find_contextual_chunks(self, chunks, retriever, max_additional=3):
         """Find chunks that might be contextually related to the given chunks."""
         if not chunks:
             return []
-
+
         # Strategy 1: Find adjacent chunks by page numbers
         pages = [chunk["metadata"].get("page", 0) for chunk in chunks if "metadata" in chunk]
         adjacent_pages = set()
-
+
         for page in pages:
             if page > 0:
                 adjacent_pages.add(page - 1)  # Previous page
                 adjacent_pages.add(page + 1)  # Next page
-
+
         # Filter out pages we already have
         adjacent_pages = adjacent_pages - set(pages)
-
+
         # Find chunks from adjacent pages
         adjacent_chunks = []
         for i, metadata in enumerate(retriever.metadatas):
@@ -104,47 +70,26 @@ def find_contextual_chunks(self, chunks, retriever, max_additional=3):
                     "confidence": 0.4,  # Lower confidence for adjacent chunks
                     "expansion_method": "adjacent_page"
                 })
-
+
         # Strategy 2: Find chunks from same sections
         sections = [chunk["metadata"].get("section", "") for chunk in chunks if "metadata" in chunk]
         sections = [s for s in sections if s]  # Remove empty sections
-
+
         section_chunks = []
         if sections:
             for i, metadata in enumerate(retriever.metadatas):
                 if metadata.get("section", "") in sections:
                     # Skip if we already have this chunk
                     if any(retriever.texts[i] == c["text"] for c in chunks + adjacent_chunks):
                         continue
-
+
                     section_chunks.append({
                         "text": retriever.texts[i],
                         "metadata": metadata,
                         "confidence": 0.35,  # Lower confidence for section-based chunks
                         "expansion_method": "same_section"
                     })
-
-        # Strategy 3: Use advanced NLP techniques to find related chunks
-        if self.nlp:
-            for chunk in chunks:
-                doc = self.nlp(chunk["text"].lower())
-                chunk_entities = [ent.text.lower() for ent in doc.ents]
-                chunk_keywords = [token.text.lower() for token in doc if token.dep_ in ("nsubj", "dobj", "pobj")]
-
-                for i, metadata in enumerate(retriever.metadatas):
-                    text_lower = retriever.texts[i].lower()
-                    doc = self.nlp(text_lower)
-                    entities = [ent.text.lower() for ent in doc.ents]
-                    keywords = [token.text.lower() for token in doc if token.dep_ in ("nsubj", "dobj", "pobj")]
-
-                    if any(term in entities or term in keywords for term in chunk_entities + chunk_keywords):
-                        section_chunks.append({
-                            "text": retriever.texts[i],
-                            "metadata": metadata,
-                            "confidence": 0.3,  # Lower confidence for NLP-based chunks
-                            "expansion_method": "nlp_related"
-                        })
-
+
         # Combine and limit additional chunks
         additional_chunks = (adjacent_chunks + section_chunks)[:max_additional]
         print(f"✅ Found {len(additional_chunks)} additional context chunks.")
@@ -153,98 +98,90 @@ def find_contextual_chunks(self, chunks, retriever, max_additional=3):
     def fuse_chunks(self, chunks):
         """Fuse chunks into a coherent context, managing token limits."""
         print("🧩 Fusing chunks into coherent context...")
-
+
         # Sort chunks by confidence
         sorted_chunks = sorted(chunks, key=lambda x: x.get("confidence", 0), reverse=True)
-
+
         # Get metadata for organization
         chunk_metadata = []
         for chunk in sorted_chunks:
             page = chunk["metadata"].get("page", "Unknown")
             section = chunk["metadata"].get("section", "Unknown")
             chunk_metadata.append(f"[Page {page}, Section: {section}]")
-
+
         # Combine text with metadata headers
         fused_text = ""
         for i, chunk in enumerate(sorted_chunks):
             fused_text += f"\n\n--- Excerpt {i+1}: {chunk_metadata[i]} ---\n\n"
             fused_text += chunk["text"]
-
+
         print(f"✅ Fused {len(sorted_chunks)} chunks into coherent context.")
         return fused_text
 
     def aggregate_metadata(self, chunks: list[dict]) -> dict:
         """Aggregate metadata from all chunks."""
         print("📊 Aggregating metadata...")
-
+
         # Extract page numbers
         pages = set()
         sections = set()
-
+
         for chunk in chunks:
             metadata = chunk.get("metadata", {})
             if "page" in metadata and metadata["page"]:
                 pages.add(metadata["page"])
             if "section" in metadata and metadata["section"]:
                 sections.add(metadata["section"])
-
+
         aggregated = {
             "pages": sorted(list(pages)),
             "sections": sorted(list(sections))
         }
-
+
         print(f"✅ Metadata aggregated: {len(pages)} pages, {len(sections)} sections")
         return aggregated
 
-    def _update_feedback(self, query: str, context_chunks: list[dict], relevance: bool):
-        """Update feedback loop with user interaction data."""
-        self.feedback[query].append({
-            "context_chunks": context_chunks,
-            "relevance": relevance
-        })
-        logger.debug(f"Feedback updated for query: '{query}' with relevance: {relevance}")
-
     def run(self, retrieved_chunks: list[dict], query_analysis: dict, retriever_agent) -> tuple[list[dict], dict]:
         """Assess context, expand if needed, filter redundancy, and fuse chunks."""
         logger.debug(f"Running context expansion/filtering on {len(retrieved_chunks)} chunks.")
         # 1. Assess if the context is sufficient
-        assessment = self.assess(retrieved_chunks, query_analysis)
-
+        assessment = self.assess(retrieved_chunks)
+
         final_chunks = retrieved_chunks.copy()
-
+
         # 2. Expand context if needed
         if assessment["needs_expansion"]:
             print(f"🔍 Expanding context due to: {assessment['reason']}")
-
+
             # If complex query, consider processing sub-queries separately
             if query_analysis.get("needs_decomposition", False):
                 print("📋 Complex query detected, expanding context for multiple aspects.")
                 # In a full implementation, we might retrieve for each sub-query
                 # For now, just get related chunks to the current results
-
+
             # Find related chunks
             additional_chunks = self.find_contextual_chunks(
-                retrieved_chunks,
+                retrieved_chunks,
                 retriever_agent
             )
-
+
             # Combine original and additional chunks
             expanded_chunks = retrieved_chunks + additional_chunks
-
+
             # 3. Filter redundant chunks using the utility function
             final_chunks = filter_redundant_chunks(expanded_chunks)
-
+
             print(f"✅ Context expansion complete: {len(final_chunks)} chunks after filtering.")
         else:
             print("✅ Original context is sufficient, no expansion needed.")
             # Still filter original chunks for redundancy
             final_chunks = filter_redundant_chunks(retrieved_chunks)
-
+
         # 4. Aggregate metadata from all included chunks
         aggregated_metadata = self.aggregate_metadata(final_chunks)
-
+
         logger.debug(f"Context expansion complete. Final chunks: {len(final_chunks)}")
         # Note: We don't actually fuse the chunks here - that will be handled by the generator
         # when it builds its prompt, using the separate chunks we provide
-
+
         return final_chunks, aggregated_metadata
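For a quick sanity check of the gates this revert restores, here is a minimal, self-contained sketch. The sample chunks and confidence values are hypothetical; only the 0.4 top-confidence, 0.3 average-confidence, and 500-character thresholds come from the assess() code above.

# Hypothetical driver exercising the reverted assess() decision logic.
sample_chunks = [
    {"text": "Chapter 3 covers the causes of the revolution. " * 12, "confidence": 0.82},
    {"text": "A minor footnote.", "confidence": 0.35},
]

top_confidence = sample_chunks[0]["confidence"]  # 0.82 clears the 0.4 gate
avg_confidence = sum(c["confidence"] for c in sample_chunks) / len(sample_chunks)  # 0.585 clears 0.3
total_text_length = sum(len(c["text"]) for c in sample_chunks)  # 581 chars clears 500

needs_expansion = (top_confidence < 0.4
                   or avg_confidence < 0.3
                   or total_text_length < 500)
print(f"needs_expansion={needs_expansion}")  # False: this context would be judged sufficient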

agents/generator.py: 1 addition, 58 deletions
@@ -7,8 +7,6 @@
 from gemini_utils import setup_gemini
 from utils.text_utils import post_process_answer, format_multi_part_answer
 from config import Config
-import spacy
-from collections import defaultdict
 
 logger = logging.getLogger(__name__)
 
@@ -36,17 +34,6 @@ def __init__(self):
             logger.error(f"Failed to initialize Gemini model: {e}", exc_info=True)
             self.gemini = None
 
-        # Load spaCy model for NER and dependency parsing
-        try:
-            self.nlp = spacy.load("en_core_web_sm")
-            logger.info("✅ spaCy model 'en_core_web_sm' loaded successfully.")
-        except OSError:
-            logger.error("❌ Error loading spaCy model 'en_core_web_sm'. Please run: python -m spacy download en_core_web_sm")
-            self.nlp = None
-
-        # Initialize feedback storage
-        self.feedback = defaultdict(list)
-
     def _check_context_relevance(self, context_chunks: list[dict], query_analysis: dict) -> bool:
         """Check if any context chunk contains keywords or entities from the query.
 
@@ -84,49 +71,6 @@ def _check_context_relevance(self, context_chunks: list[dict], query_analysis: d
 
         return found_relevant_chunk
 
-    def _enhanced_check_context_relevance(self, context_chunks: list[dict], query_analysis: dict) -> bool:
-        """Enhanced check for context relevance using NER and dependency parsing."""
-        if not self.nlp:
-            logger.warning("spaCy model not loaded, falling back to basic relevance check.")
-            return self._check_context_relevance(context_chunks, query_analysis)
-
-        keywords = query_analysis.get("keywords", [])
-        entities = query_analysis.get("entities", [])
-        search_terms = set([k.lower() for k in keywords] + [e.lower() for e in entities])
-        logger.debug(f"Enhanced checking context relevance. Search terms: {search_terms}")
-
-        if not search_terms:
-            logger.debug("No keywords/entities found in query analysis, assuming context is relevant.")
-            return True  # If no terms to check, assume relevance or let LLM decide
-
-        found_relevant_chunk = False
-        for i, chunk in enumerate(context_chunks):
-            text_lower = chunk.get("text", "").lower()
-            doc = self.nlp(text_lower)
-            chunk_entities = [ent.text.lower() for ent in doc.ents]
-            chunk_keywords = [token.text.lower() for token in doc if token.dep_ in ("nsubj", "dobj", "pobj")]
-
-            for term in search_terms:
-                if term in chunk_entities or term in chunk_keywords:
-                    logger.debug(f"Found relevant term '{term}' in context chunk {i+1} using NER/Dependency Parsing.")
-                    found_relevant_chunk = True
-                    break  # Found a relevant term in this chunk, move to next chunk if needed (though one is enough)
-            if found_relevant_chunk:
-                break  # Found a relevant chunk, no need to check further
-
-        if not found_relevant_chunk:
-            logger.warning("No relevant terms found in any context chunk using NER/Dependency Parsing.")
-
-        return found_relevant_chunk
-
-    def _update_feedback(self, query: str, context_chunks: list[dict], relevance: bool):
-        """Update feedback loop with user interaction data."""
-        self.feedback[query].append({
-            "context_chunks": context_chunks,
-            "relevance": relevance
-        })
-        logger.debug(f"Feedback updated for query: '{query}' with relevance: {relevance}")
-
     def create_prompt(self, query: str, context_chunks: list[dict], query_analysis: dict, chat_history: list = None) -> str:
         """Create an effective prompt based on query type, context, and history."""
         if context_chunks and "confidence" in context_chunks[0]:
@@ -227,8 +171,7 @@ def run(self, query: str, context_chunks: list[dict], query_analysis: dict = Non
             logger.info(f"Fallback triggered: No context. Time: {time.time() - run_start_time:.4f}s")
             return fallback_message
 
-        is_relevant = self._enhanced_check_context_relevance(context_chunks, query_analysis)
-        self._update_feedback(query, context_chunks, is_relevant)
+        is_relevant = self._check_context_relevance(context_chunks, query_analysis)
         if not is_relevant:
             logger.warning(f"⚠️ Context relevance check failed. Keywords/Entities: {query_analysis.get('keywords', []) + query_analysis.get('entities', [])}")
             # Adhering to strict non-apology rule
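After the revert, run() falls back to the plain _check_context_relevance(), whose body sits outside the hunks shown here. The sketch below is a reconstruction under one assumption: that the method uses the same word-boundary regex matching that appears verbatim in the reverted assess() code in context_expander.py. It is a standalone illustration, not the repository's exact implementation.

import re

def check_context_relevance(context_chunks: list[dict], query_analysis: dict) -> bool:
    """Return True if any chunk contains a query keyword or entity as a whole word."""
    # Lower-case search terms drawn from the query analyzer's output.
    terms = {t.lower() for t in query_analysis.get("keywords", []) + query_analysis.get("entities", [])}
    if not terms:
        return True  # Nothing to check against; assume relevance and let the LLM decide.
    for chunk in context_chunks:
        text_lower = chunk.get("text", "").lower()
        # \b word boundaries prevent false hits on substrings (e.g. "art" inside "party").
        if any(re.search(r'\b' + re.escape(term) + r'\b', text_lower) for term in terms):
            return True
    return False

The word-boundary match is the design choice the revert keeps: it is cheap and deterministic, trading the recall of the removed NER/dependency-parsing check for zero extra model dependencies.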
