- # agents/query_analyzer.py
- import spacy
- import logging # Added import
- from .base import BaseAgent
- import re
+ import logging
+ import time
+ import faiss
+ import pickle
+ import numpy as np
+ import itertools
+ import re
+ from .base import BaseAgent
+ from gemini_utils import embed_text
+ from utils.text_utils import simple_keyword_score, simple_entity_score, section_relevance_score
+ from config import Config
+
+ logger = logging.getLogger(__name__)
+
+ DEFAULT_HYBRID_INITIAL_TOP_K = Config.RETRIEVER_INITIAL_K
+ DEFAULT_HYBRID_FINAL_TOP_K = Config.RETRIEVER_FINAL_K
+
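+ # NOTE (assumption, not from the source): embed_text appears to return a flat
+ # list of floats (falsy on failure); run() below wraps each result into a
+ # 1 x d float32 numpy row for FAISS search.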
+ class RetrieverAgent(BaseAgent):
+     """Agent responsible for retrieving and re-ranking relevant text chunks."""
+     def __init__(self, index_path="faiss_index.index", metadata_path="faiss_metadata.pkl"):
+         logger.info(f"💾 Loading FAISS index from: {index_path}")
+         try:
+             self.index = faiss.read_index(index_path)
+             logger.info(f"✅ FAISS index loaded successfully. Index dimension: {self.index.d}, Total vectors: {self.index.ntotal}")
+         except Exception as e:
+             logger.error(f"❌ Failed to load FAISS index: {e}", exc_info=True)
+             raise
+         logger.info(f"💾 Loading metadata from: {metadata_path}")
+         try:
+             with open(metadata_path, "rb") as f:
+                 self.metadatas = pickle.load(f)
+             # Pre-extract chunk texts (removing them from the metadata dicts) for faster access
+             self.texts = [m.pop('text', '') for m in self.metadatas]
+             logger.info(f"✅ Metadata loaded successfully. Number of entries: {len(self.metadatas)}")
+             if len(self.metadatas) != self.index.ntotal:
+                 logger.warning(f"⚠️ Mismatch between index size ({self.index.ntotal}) and metadata count ({len(self.metadatas)}).")
+         except Exception as e:
+             logger.error(f"❌ Failed to load metadata: {e}", exc_info=True)
+             raise
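+
+     # Assumed data layout: the metadata pickle holds one dict per FAISS vector,
+     # in index order, each originally carrying a 'text' field plus fields such
+     # as 'page' that the re-ranking and logging below read.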
- # Load the spaCy model once at module import time
- try:
-     nlp = spacy.load("en_core_web_sm")
-     print("✅ spaCy model 'en_core_web_sm' loaded successfully.")
- except OSError:
-     print("❌ Error loading spaCy model 'en_core_web_sm'.")
-     print("   Please run: python -m spacy download en_core_web_sm")
-     nlp = None  # Fall back to regex-based analysis if the model is unavailable
-
- logger = logging.getLogger(__name__)  # Module-level logger
-
- class QueryAnalyzerAgent(BaseAgent):
-     """Agent responsible for analyzing the user query."""
-     def run(self, query: str, chat_history: list = None) -> dict:
-         start_time = time.time()
-         logger.debug(f"Analyzing query: '{query}' (history present: {chat_history is not None})")
-         if not nlp:
-             logger.warning("spaCy model not loaded, falling back to basic analysis.")
-             # Fallback regex extraction (similar to the previous web.py logic)
-             keywords = re.findall(r'"(.*?)"|\b[A-Z][a-zA-Z]+\b', query)
-             entities = re.findall(r'\b[A-Z][a-zA-Z]+(?:\s+[A-Z][a-zA-Z]+)*\b', query)
-             keywords = list(set([k.strip().lower() for k in keywords if k]))
-             entities = list(set([e.strip() for e in entities if len(e.split()) > 1 or e in keywords]))
+     def re_rank_chunks(self, initial_results, query, query_analysis):
+         """Re-rank chunks by a weighted mix of semantic, keyword, entity, and topic scores."""
+         rerank_start_time = time.time()
+         logger.info("⚖️ Re-ranking retrieved chunks...")
+         if not initial_results:
+             logger.warning("No initial results to re-rank.")
+             return []
+
+         keywords = query_analysis.get("keywords", [])
+         entities = query_analysis.get("entities", [])
+         query_type = query_analysis.get("query_type", "unknown")
+         intent_type = query_analysis.get("intent_type", "new_topic")
+         topic_keywords = query_analysis.get("topic_keywords", [])
+         topic_entities = query_analysis.get("topic_entities", [])
+
+         query_keywords_set = set(keywords)
+         topic_terms_set = set(topic_keywords + topic_entities)  # combined conversation-topic terms
+
+         logger.debug(f"Re-ranking based on -> Query Keywords: {keywords}, Entities: {entities}, Type: {query_type}, Intent: {intent_type}, Topic Terms: {topic_terms_set}")
+
+         # --- Tuned weights, adjusted by intent ---
+         if intent_type in ["follow_up", "clarification"] and topic_terms_set:
+             logger.debug("Adjusting weights for follow-up/clarification intent.")
+             weights = {
+                 "semantic": 0.15,  # slightly lower semantic weight for the current query
+                 "keyword": 0.4,
+                 "entity": 0.25,
+                 "topic": 0.2,      # reward relevance to the ongoing conversation topic
+                 "section": 0.0
+             }
          else:
-             # TODO: Incorporate chat_history into the spaCy analysis if needed
-             # For now, just process the current query
-             doc = nlp(query)
-
-             # Extract named entities (GPE, PERSON, ORG, LOC, EVENT, DATE, etc.)
-             entities = list(set([ent.text.strip() for ent in doc.ents if ent.label_ in ["GPE", "PERSON", "ORG", "LOC", "EVENT", "DATE", "FAC", "PRODUCT", "WORK_OF_ART"]]))
-
-             # Extract keywords (noun chunks and proper nouns)
-             keywords = list(set([chunk.text.lower().strip() for chunk in doc.noun_chunks]))
-             # Add proper nouns that might not be part of chunks or recognized entities
-             keywords.extend([token.text.lower().strip() for token in doc if token.pos_ == "PROPN" and token.text not in entities])
-             # Remove duplicates that can appear between entities and keywords after lowercasing
-             keywords = list(set(keywords))
-             # Optional: remove very short keywords if needed
-             # keywords = [kw for kw in keywords if len(kw) > 2]
-
-         # Determine query type (keep existing logic)
-         query_lower = query.lower()
-         query_type = "unknown"
-         if "cause" in query_lower or "why" in query_lower or "effect" in query_lower or "impact" in query_lower:
-             query_type = "causal/analytical"
-         elif "compare" in query_lower or "difference" in query_lower or "similar" in query_lower or "contrast" in query_lower:
-             query_type = "comparative"
-         elif re.match(r"^(what|who|when|where|which)\s+(is|was|are|were|did|do|does)\b", query_lower) or \
-              re.match(r"^(define|describe|explain|list)\b", query_lower):
-             query_type = "factual"
-         # Add more rules if needed
-
-         analysis = {
-             "original_query": query,  # include the original query for downstream agents
-             "keywords": keywords,
-             "entities": entities,
-             "query_type": query_type,
-             # Optionally include history info if used
-             # "history_considered": chat_history is not None
-         }
-
-         end_time = time.time()
-         logger.debug(f"Analysis Results: Keywords: {analysis['keywords']}, Entities: {analysis['entities']}, Query Type: {analysis['query_type']}")
-         logger.debug(f"Analysis Time: {end_time - start_time:.4f}s")
-
-         return analysis
+             weights = {
+                 "semantic": 0.2,
+                 "keyword": 0.5,
+                 "entity": 0.3,
+                 "topic": 0.0,  # no topic weight for new topics
+                 "section": 0.0
+             }
+         # -----------------------------------------
+
+         # Normalize semantic scores (FAISS distances are lower for better matches)
+         max_faiss_dist = max(r["score"] for r in initial_results) if initial_results else 1.0
+         if max_faiss_dist <= 0:  # avoid division by zero
+             max_faiss_dist = 1.0
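+         # (A distance of 0 thus maps to semantic_score 1.0 and the batch's worst
+         # distance to 0.0, putting it on the same 0-1 scale as the keyword,
+         # entity, and topic scores it is mixed with below.)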
+
+         logger.debug(f"Re-ranking {len(initial_results)} chunks...")
+         for result in initial_results:
+             text_lower = self.texts[result["index"]].lower()  # chunk text looked up by FAISS row index
+             result["text"] = self.texts[result["index"]]      # add full text back for the generator
+             result["metadata"] = self.metadatas[result["index"]]  # add metadata back
+
+             result["semantic_score"] = max(0.0, 1.0 - (max(0.0, result["score"]) / max_faiss_dist))
+             # Use utility functions for the lexical scores
+             result["keyword_score"] = simple_keyword_score(text_lower, query_keywords_set)
+             result["entity_score"] = simple_entity_score(text_lower, entities)
+             result["section_score"] = section_relevance_score(result["metadata"], query_type)
+             # Topic score only matters when the topic weight is non-zero
+             result["topic_score"] = simple_keyword_score(text_lower, topic_terms_set) if weights["topic"] > 0 else 0.0
+
+             combined_score = (
+                 weights["semantic"] * result["semantic_score"] +
+                 weights["keyword"] * result["keyword_score"] +
+                 weights["entity"] * result["entity_score"] +
+                 weights["topic"] * result["topic_score"]
+                 # + weights["section"] * result["section_score"]  # section score currently unused
+             )
+             result["combined_score"] = combined_score
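+             # Worked example (illustrative numbers, not from the source): with the
+             # default weights, semantic=0.9, keyword=0.6, entity=0.0, topic=0.0
+             # gives 0.2*0.9 + 0.5*0.6 = 0.48, landing in the 0.65 confidence tier below.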
+
+             # Map the combined score onto a coarse confidence ladder (can be refined)
+             if combined_score > 0.75:
+                 confidence = 0.95
+             elif combined_score > 0.6:
+                 confidence = 0.8
+             elif combined_score > 0.45:
+                 confidence = 0.65
+             elif combined_score > 0.3:
+                 confidence = 0.5
+             else:
+                 confidence = 0.3
+             result["confidence"] = confidence
+
+         # Sort by combined score
+         ranked_results = sorted(initial_results, key=lambda x: x["combined_score"], reverse=True)
+
+         # Filter based on presence of *query* keywords/entities (important!)
+         logger.info(f"🔍 Filtering {len(ranked_results)} re-ranked chunks for *query* keyword/entity presence...")
+         filtered_results = []
+         query_terms_lower = {k.lower() for k in keywords} | {e.lower() for e in entities}
+
+         # If the query itself has no terms: for follow-ups, fall back to topic terms;
+         # otherwise skip filtering entirely rather than discarding everything.
+         if not query_terms_lower and intent_type not in ["follow_up", "clarification"]:
+             logger.warning("⚠️ No keywords or entities found in query analysis, and not a follow-up. Skipping filtering.")
+             filtered_results = ranked_results
+         elif not query_terms_lower and intent_type in ["follow_up", "clarification"]:
+             logger.warning("⚠️ No keywords or entities in query, but it's a follow-up/clarification. Filtering based on *topic* terms.")
+             filter_terms = {t.lower() for t in topic_terms_set}  # use topic terms for filtering
+             if not filter_terms:
+                 logger.warning("⚠️ No topic terms found either. Skipping filtering.")
+                 filtered_results = ranked_results
+             else:
+                 for result in ranked_results:
+                     text_lower = result["text"].lower()
+                     # Check for topic terms instead of query terms
+                     if any(re.search(r'\b' + re.escape(term) + r'\b', text_lower) for term in filter_terms):
+                         filtered_results.append(result)
+         else:
+             # Standard filtering based on query terms
+             filter_terms = query_terms_lower
+             for result in ranked_results:
+                 text_lower = result["text"].lower()
+                 if any(re.search(r'\b' + re.escape(term) + r'\b', text_lower) for term in filter_terms):
+                     filtered_results.append(result)
+
+         logger.info(f"✅ Filtered down to {len(filtered_results)} chunks containing relevant terms.")
+         logger.debug("Top 5 Filtered & Re-ranked Chunks (Combined | Sem | Key | Ent | Top | Conf | Page):")
+         for i, r in enumerate(filtered_results[:5]):
+             page = r.get("metadata", {}).get("page", "?")
+             logger.debug(f"{i + 1}. Score={r['combined_score']:.3f} (S:{r['semantic_score']:.2f} K:{r['keyword_score']:.2f} E:{r['entity_score']:.2f} T:{r['topic_score']:.2f}) | Conf={r['confidence']:.2f} | Page={page} | Text: {r['text'][:100]}...")
+
+         total_rerank_time = time.time() - rerank_start_time
+         logger.info(f"Step 2b: Re-ranking & Filtering took: {total_rerank_time:.4f}s")
+         return filtered_results
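+
+     # Each element of initial_results is expected to be a dict with at least
+     # {"index": <row in the FAISS index>, "score": <L2 distance>}; text and
+     # metadata are re-attached from self.texts / self.metadatas by that index.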
+
+     def _simple_expand_query(self, query_analysis: dict, max_expansions: int = 2) -> list[str]:
+         """Generates simple query variations from the analysis keywords and entities."""
+         expansions = []
+         keywords = query_analysis.get("keywords", [])
+         entities = query_analysis.get("entities", [])
+         intent_type = query_analysis.get("intent_type", "new_topic")
+         topic_keywords = query_analysis.get("topic_keywords", [])
+         topic_entities = query_analysis.get("topic_entities", [])
+
+         terms = list(set(entities + keywords))
+
+         # If the query has few terms but is a follow-up, add topic terms to the expansion base
+         if len(terms) < 2 and intent_type in ["follow_up", "clarification"]:
+             logger.debug("Expanding query using topic terms for follow-up.")
+             terms.extend(topic_keywords)
+             terms.extend(topic_entities)
+             terms = list(set(terms))  # ensure uniqueness
+
+         if not terms:
+             return []
+
+         # Prioritize entities for combinations
+         priority_terms = entities if entities else keywords
+         other_terms = keywords if entities else []
+
+         # Generate pairs (priority x other, priority x priority)
+         pairs = []
+         if priority_terms and other_terms:
+             pairs.extend(list(itertools.product(priority_terms, other_terms)))
+         if len(priority_terms) >= 2:
+             pairs.extend(list(itertools.combinations(priority_terms, 2)))
+
+         # Pad with single terms if there aren't enough pairs
+         if len(pairs) < max_expansions:
+             pairs.extend([(t,) for t in terms])
+
+         # Join each pair into an expansion string
+         for pair in pairs:
+             expansions.append(" ".join(pair))
+             if len(expansions) >= max_expansions:
+                 break
+
+         # Fallback: if still no expansions, use the top terms directly
+         if not expansions and terms:
+             expansions.extend(terms[:max_expansions])
+
+         unique_expansions = list(dict.fromkeys(expansions))  # de-duplicate, preserving order
+         logger.debug(f"Generated query expansions: {unique_expansions[:max_expansions]}")
+         return unique_expansions[:max_expansions]
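+
+     # Example (hypothetical inputs): entities=["Napoleon"], keywords=["invasion"]
+     # would first yield "Napoleon invasion", then pad with single-term variants
+     # up to max_expansions; run() searches these alongside the original query.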
+
+     def run(self, query: str, query_analysis: dict, initial_top_k: int = DEFAULT_HYBRID_INITIAL_TOP_K, final_top_k: int = DEFAULT_HYBRID_FINAL_TOP_K):
+         """Retrieves chunks via semantic search with query expansion, then re-ranks and filters them."""
+         run_start_time = time.time()
+         logger.info(f"🔎 Running hybrid retrieval for: '{query}' (Initial K={initial_top_k}, Final K={final_top_k})")
+         logger.debug(f"Query Analysis for Retrieval: {query_analysis}")
+
+         expansion_start_time = time.time()
+         # Expand from the original query recorded in the analysis, falling back to the raw query
+         query_to_expand = query_analysis.get("original_query", query)
+         expanded_queries = self._simple_expand_query(query_analysis)
+         all_queries = [query_to_expand] + expanded_queries  # always include the original query
+
+         query_embeddings = []
+         for q in all_queries:
+             emb = embed_text(q)
+             if emb:
+                 query_embeddings.append(np.array([emb]).astype("float32"))
+             else:
+                 logger.warning(f"Failed to generate embedding for query variant: '{q}'")
+
+         if not query_embeddings:
+             logger.error("Failed to generate any query embeddings.")
+             return []
+
+         expansion_time = time.time() - expansion_start_time
+         logger.info(f"Step 2a: Query expansion & embedding took: {expansion_time:.4f}s")