Commit 8e57827

web search

1 parent fca79c3

10 files changed

Lines changed: 935 additions & 58 deletions

PROJECT_STATE.md

Lines changed: 55 additions & 0 deletions
# Project State: Histronaut History Tutor (Web Search Enhancement)

## Current Implementation Status

The Histronaut History Tutor has been enhanced with a domain-restricted web search capability to supplement the existing textbook-based RAG system. This feature allows the system to retrieve information from competition-approved websites only, maintaining strict source control while expanding the knowledge base.

## Key Components Implemented

### 1. Domain Restriction Framework

- Created `config/approved_domains.py` containing all competition-approved domains organized by topic categories
- Implemented domain validation to ensure searches never go outside approved boundaries (sketched below)
- Added topic categorization to map queries to the most relevant domain categories
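The module contents are not shown in this commit view, so the following is a minimal sketch of what the validation layer could look like. `config/approved_domains.py` is named above; the category names, example domains, and the `is_approved_url` helper are illustrative assumptions, not the actual file contents:

```python
# Illustrative sketch of config/approved_domains.py; the module name comes from
# this commit, but these categories, domains, and the helper are assumptions.
from urllib.parse import urlparse

APPROVED_DOMAINS = {
    # Hypothetical topic categories with example domains
    "ancient_history": ["britannica.com", "worldhistory.org"],
    "us_history": ["loc.gov", "archives.gov"],
}

def is_approved_url(url: str) -> bool:
    """Accept a URL only if its host is an approved domain or a subdomain of one."""
    host = urlparse(url).netloc.lower().split(":")[0]
    return any(
        host == domain or host.endswith("." + domain)
        for domains in APPROVED_DOMAINS.values()
        for domain in domains
    )
```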

### 2. WebSearchAgent

- Implemented a dedicated agent for web search within approved domains only
- Created a robust caching mechanism to avoid redundant web requests
- Added text extraction from HTML with a chunking strategy aligned with the textbook chunking
- Implemented scoring and ranking for web search results (a fetch-and-cache sketch follows this list)
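The agent's source is not part of this excerpt; this sketch shows one way the fetch, cache, and extract loop could be built on the dependencies the commit adds (requests, beautifulsoup4, lxml, cachetools). The class shape and method names are assumptions, and it reuses the hypothetical `is_approved_url` from the previous sketch:

```python
import requests
from bs4 import BeautifulSoup
from cachetools import TTLCache

class WebSearchAgent:
    """Sketch: fetch approved pages, cache them, and extract plain text."""

    def __init__(self, cache_ttl: int = 3600):
        # Cache extracted text by URL so repeated queries reuse earlier fetches
        self.cache = TTLCache(maxsize=256, ttl=cache_ttl)

    def fetch_text(self, url: str) -> str:
        if not is_approved_url(url):  # validation helper sketched earlier
            raise ValueError(f"URL outside approved domains: {url}")
        if url in self.cache:
            return self.cache[url]
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")  # lxml parser per the dependency list
        text = " ".join(p.get_text(" ", strip=True) for p in soup.find_all("p"))
        self.cache[url] = text
        return text
```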

### 3. RAG Pipeline Integration

- Updated OrchestratorAgent to intelligently combine textbook and web sources
- Enhanced ContextExpansionAgent to handle different source types appropriately
- Modified GeneratorAgent to properly cite and distinguish between textbook and web sources
- Added source attribution and reference tracking for web content (the shared chunk shape is sketched below)
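The chunk shape that ties this integration together is visible in the context_expander.py diff below: each chunk carries `text`, `metadata`, and `confidence`, with `metadata["source_type"]` set to `"textbook"` or `"web"`. A web result might be normalized like this (the function name is assumed):

```python
def to_web_chunk(url: str, text: str, score: float) -> dict:
    # Normalize a web result into the chunk shape the pipeline already uses
    # ({"text", "metadata", "confidence"}); the source_type and url keys match
    # what the context_expander.py diff below reads back out.
    return {
        "text": text,
        "metadata": {"source_type": "web", "url": url},
        "confidence": score,
    }
```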

### 4. Security and Error Handling

- Implemented strict URL validation to prevent accidental requests to non-approved sites
- Added comprehensive error handling for network issues and failed requests
- Created fallback strategies for when web search fails or finds no relevant results (see the sketch after this list)
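A hedged sketch of the fallback behavior, reusing the hypothetical imports and helpers from the sketches above: failed or rejected fetches are skipped so the pipeline can still answer from textbook content alone. The function name and the skip-on-failure policy are assumptions:

```python
def safe_web_search(agent: WebSearchAgent, urls: list[str]) -> list[dict]:
    """Fetch approved URLs, skipping failures so the pipeline can fall back
    to textbook-only context instead of crashing."""
    chunks = []
    for url in urls:
        try:
            chunks.append(to_web_chunk(url, agent.fetch_text(url), score=0.5))
        except (ValueError, requests.RequestException):
            continue  # non-approved URL or network failure: skip this source
    return chunks
```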

## Usage Flow

1. User submits a query to the history tutor
2. QueryAnalyzerAgent analyzes the query for entities, keywords, and query type
3. RetrieverAgent searches for relevant textbook content
4. Based on the query content and textbook results, OrchestratorAgent decides whether web search is needed
5. If needed, WebSearchAgent retrieves information from approved websites for the relevant topic
6. Retrieved content from both sources is combined and processed by ContextExpansionAgent
7. GeneratorAgent creates a comprehensive answer with proper citation of all sources
8. Results are presented to the user with clear source attribution (the whole flow is condensed in the sketch below)
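Condensed into code, the flow might be wired roughly as follows. The agent names come from this document, but every method name except `ContextExpansionAgent.run` (whose signature appears in the context_expander.py diff below) is an assumption:

```python
def answer(query, query_analyzer, retriever, orchestrator, web_search_agent,
           context_expander, generator):
    """Hypothetical wiring of the eight-step flow above."""
    analysis = query_analyzer.run(query)                        # step 2: analyze query
    chunks = retriever.run(query, analysis)                     # step 3: textbook retrieval
    if orchestrator.needs_web_search(query, analysis, chunks):  # step 4: decide on web search
        chunks += web_search_agent.search(analysis)             # step 5: approved-domain search
    expanded, metadata = context_expander.run(chunks, analysis, retriever)  # step 6: expand/fuse
    return generator.run(query, expanded, metadata)             # steps 7-8: answer with citations
```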

## Next Steps

- Enhance topic mapping accuracy for better domain selection
- Implement more advanced ranking for combined textbook and web sources
- Add evaluation metrics to compare answers with and without web search capability
- Create a visualization interface to show source distribution in responses

## Dependencies Added

- requests: For fetching web content
- beautifulsoup4: For HTML parsing and content extraction
- lxml: For efficient HTML parsing
- urllib3: For URL handling and validation
- cachetools: For efficient caching of web search results

__pycache__/config.cpython-312.pyc

0 Bytes
Binary file not shown.

agents/context_expander.py

Lines changed: 91 additions & 30 deletions
```diff
@@ -22,8 +22,9 @@ def assess(self, retrieved_chunks: list[dict]) -> dict:
         # Calculate context coverage
         total_text_length = sum(len(chunk["text"]) for chunk in retrieved_chunks)
 
-        # Check if we have entities from query in the chunks
-        # This would be populated from query_analysis
+        # Separate web and textbook chunks for assessment
+        textbook_chunks = [c for c in retrieved_chunks if c["metadata"].get("source_type", "textbook") == "textbook"]
+        web_chunks = [c for c in retrieved_chunks if c["metadata"].get("source_type") == "web"]
 
         # Decision logic
         if top_confidence < 0.4:
@@ -37,17 +38,29 @@ def assess(self, retrieved_chunks: list[dict]) -> dict:
         if total_text_length < 500:
             print(f"⚠️ Assessment: Short context ({total_text_length} chars), expansion needed.")
             return {"needs_expansion": True, "reason": "Short context"}
+
+        # If we have web chunks but no textbook chunks, expand to try to get textbook context
+        if web_chunks and not textbook_chunks:
+            print("⚠️ Assessment: Only web chunks available, trying to find textbook content.")
+            return {"needs_expansion": True, "reason": "Missing textbook content"}
 
         print(f"✅ Assessment: Context sufficient (Avg conf: {avg_confidence:.2f}, Length: {total_text_length} chars)")
+        print(f" Sources: {len(textbook_chunks)} textbook chunks, {len(web_chunks)} web chunks")
         return {"needs_expansion": False, "reason": "Sufficient confidence and context"}
 
     def find_contextual_chunks(self, chunks, retriever, max_additional=3):
         """Find chunks that might be contextually related to the given chunks."""
         if not chunks:
             return []
+
+        # Only expand textbook chunks using the retriever
+        textbook_chunks = [c for c in chunks if c["metadata"].get("source_type", "textbook") == "textbook"]
+        if not textbook_chunks:
+            print("ℹ️ No textbook chunks to expand from.")
+            return []
 
         # Strategy 1: Find adjacent chunks by page numbers
-        pages = [chunk["metadata"].get("page", 0) for chunk in chunks if "metadata" in chunk]
+        pages = [chunk["metadata"].get("page", 0) for chunk in textbook_chunks if "metadata" in chunk]
         adjacent_pages = set()
 
         for page in pages:
@@ -64,13 +77,13 @@ def find_contextual_chunks(self, chunks, retriever, max_additional=3):
             if metadata.get("page", 0) in adjacent_pages:
                 adjacent_chunks.append({
                     "text": retriever.texts[i],
-                    "metadata": metadata,
+                    "metadata": {**metadata, "source_type": "textbook"},  # Ensure source type is set
                     "confidence": 0.4,  # Lower confidence for adjacent chunks
                     "expansion_method": "adjacent_page"
                 })
 
         # Strategy 2: Find chunks from same sections
-        sections = [chunk["metadata"].get("section", "") for chunk in chunks if "metadata" in chunk]
+        sections = [chunk["metadata"].get("section", "") for chunk in textbook_chunks if "metadata" in chunk]
         sections = [s for s in sections if s]  # Remove empty sections
 
         section_chunks = []
@@ -83,14 +96,14 @@ def find_contextual_chunks(self, chunks, retriever, max_additional=3):
 
                 section_chunks.append({
                     "text": retriever.texts[i],
-                    "metadata": metadata,
+                    "metadata": {**metadata, "source_type": "textbook"},  # Ensure source type is set
                     "confidence": 0.35,  # Lower confidence for section-based chunks
                     "expansion_method": "same_section"
                 })
 
         # Combine and limit additional chunks
         additional_chunks = (adjacent_chunks + section_chunks)[:max_additional]
-        print(f"✅ Found {len(additional_chunks)} additional context chunks.")
+        print(f"✅ Found {len(additional_chunks)} additional textbook context chunks.")
         return additional_chunks
 
     def calculate_chunk_similarity(self, chunks):
@@ -123,6 +136,27 @@ def filter_redundant_chunks(self, chunks, similarity_threshold=0.85):
         if len(chunks) <= 1:
             return chunks
 
+        # Separate web and textbook chunks for redundancy filtering
+        textbook_chunks = [c for c in chunks if c["metadata"].get("source_type", "textbook") == "textbook"]
+        web_chunks = [c for c in chunks if c["metadata"].get("source_type") == "web"]
+
+        # Filter redundancy within each source type separately
+        filtered_textbook = self._filter_source_redundancy(textbook_chunks, similarity_threshold)
+        filtered_web = self._filter_source_redundancy(web_chunks, similarity_threshold)
+
+        # Combine filtered chunks
+        filtered_chunks = filtered_textbook + filtered_web
+
+        print(f"✅ Filtered out {len(chunks) - len(filtered_chunks)} redundant chunks.")
+        print(f" Remaining: {len(filtered_textbook)} textbook chunks, {len(filtered_web)} web chunks")
+
+        return filtered_chunks
+
+    def _filter_source_redundancy(self, chunks, similarity_threshold=0.85):
+        """Filter redundancy within a specific source type."""
+        if len(chunks) <= 1:
+            return chunks
+
         similarities = self.calculate_chunk_similarity(chunks)
         chunks_to_remove = set()
 
@@ -137,8 +171,6 @@ def filter_redundant_chunks(self, chunks, similarity_threshold=0.85):
 
         # Create filtered list
         filtered_chunks = [chunk for i, chunk in enumerate(chunks) if i not in chunks_to_remove]
-
-        print(f"✅ Filtered out {len(chunks) - len(filtered_chunks)} redundant chunks.")
         return filtered_chunks
 
     def fuse_chunks(self, chunks):
@@ -148,43 +180,72 @@ def fuse_chunks(self, chunks):
         # Sort chunks by confidence
         sorted_chunks = sorted(chunks, key=lambda x: x.get("confidence", 0), reverse=True)
 
-        # Get metadata for organization
-        chunk_metadata = []
+        # Separate chunks by source type
+        textbook_chunks = []
+        web_chunks = []
+
         for chunk in sorted_chunks:
-            page = chunk["metadata"].get("page", "Unknown")
-            section = chunk["metadata"].get("section", "Unknown")
-            chunk_metadata.append(f"[Page {page}, Section: {section}]")
-
-        # Combine text with metadata headers
-        fused_text = ""
-        for i, chunk in enumerate(sorted_chunks):
-            fused_text += f"\n\n--- Excerpt {i+1}: {chunk_metadata[i]} ---\n\n"
-            fused_text += chunk["text"]
-
-        print(f"✅ Fused {len(sorted_chunks)} chunks into coherent context.")
+            source_type = chunk["metadata"].get("source_type", "textbook")
+            if source_type == "web":
+                web_chunks.append(chunk)
+            else:
+                textbook_chunks.append(chunk)
+
+        # Fuse textbook chunks
+        textbook_fused = ""
+        if textbook_chunks:
+            textbook_fused = "**Textbook Content:**"
+            for i, chunk in enumerate(textbook_chunks):
+                page = chunk["metadata"].get("page", "Unknown")
+                section = chunk["metadata"].get("section", "Unknown")
+                textbook_fused += f"\n\n--- Excerpt {i+1}: [Page {page}, Section: {section}] ---\n\n"
+                textbook_fused += chunk["text"]
+
+        # Fuse web chunks
+        web_fused = ""
+        if web_chunks:
+            web_fused = "\n\n**Web Content:**"
+            for i, chunk in enumerate(web_chunks):
+                url = chunk["metadata"].get("url", "Unknown Source")
+                web_fused += f"\n\n--- Web Excerpt {i+1}: [Source: {url}] ---\n\n"
+                web_fused += chunk["text"]
+
+        # Combine fused text
+        fused_text = textbook_fused + web_fused
+
+        print(f"✅ Fused {len(textbook_chunks)} textbook chunks and {len(web_chunks)} web chunks into coherent context.")
         return fused_text
 
     def aggregate_metadata(self, chunks: list[dict]) -> dict:
         """Aggregate metadata from all chunks."""
         print("📊 Aggregating metadata...")
 
-        # Extract page numbers
+        # Extract metadata by source type
        pages = set()
         sections = set()
+        web_sources = set()
 
         for chunk in chunks:
             metadata = chunk.get("metadata", {})
-            if "page" in metadata and metadata["page"]:
-                pages.add(metadata["page"])
-            if "section" in metadata and metadata["section"]:
-                sections.add(metadata["section"])
+            source_type = metadata.get("source_type", "textbook")
+
+            if source_type == "web":
+                if "url" in metadata:
+                    web_sources.add(metadata["url"])
+            else:
+                if "page" in metadata and metadata["page"]:
+                    pages.add(metadata["page"])
+                if "section" in metadata and metadata["section"]:
+                    sections.add(metadata["section"])
 
+        # Create aggregated metadata dictionary
         aggregated = {
             "pages": sorted(list(pages)),
-            "sections": sorted(list(sections))
+            "sections": sorted(list(sections)),
+            "web_sources": sorted(list(web_sources))
         }
 
-        print(f"✅ Metadata aggregated: {len(pages)} pages, {len(sections)} sections")
+        print(f"✅ Metadata aggregated: {len(pages)} pages, {len(sections)} sections, {len(web_sources)} web sources")
         return aggregated
 
     def run(self, retrieved_chunks: list[dict], query_analysis: dict, retriever_agent) -> tuple[list[dict], dict]:
@@ -204,7 +265,7 @@ def run(self, retrieved_chunks: list[dict], query_analysis: dict, retriever_agent) -> tuple[list[dict], dict]:
         # In a full implementation, we might retrieve for each sub-query
         # For now, just get related chunks to the current results
 
-        # Find related chunks
+        # Find related chunks (only expands textbook chunks)
         additional_chunks = self.find_contextual_chunks(
             retrieved_chunks,
             retriever_agent
```
