1+ import asyncio
12import logging
23import os
34from typing import List , Optional , Dict , Any
2324from api .openrouter_client import OpenRouterClient
2425from api .azureai_client import AzureAIClient
2526from api .dashscope_client import DashscopeClient
26- from api .rag import RAG
27+ from api .rag import RAG , Memory
28+ from api .rag_session import rag_session_manager
2729
2830# Configure logging
2931from api .logging_config import setup_logging
@@ -60,6 +62,70 @@ class ChatCompletionRequest(BaseModel):
6062 included_dirs : Optional [str ] = Field (None , description = "Comma-separated list of directories to include exclusively" )
6163 included_files : Optional [str ] = Field (None , description = "Comma-separated list of file patterns to include exclusively" )
6264
async def generate_with_retry(rag, query, context_docs, provider, model, language="en", max_retries=3):
    """Call ``rag`` for ``query``, retrying on token-limit and transient errors.

    Errors are classified by substring match on the lower-cased exception text:

    * Token-limit errors halve the tracked context fraction and retry
      immediately.
    * Transient errors (timeout, connection, 502/503/429, rate limit) retry
      after an exponential backoff of ``2 ** attempt`` seconds. No sleep is
      performed after the final attempt because no retry would follow it.
    * Any other error is re-raised immediately.

    Args:
        rag: Callable invoked as ``rag(query, language=language)``.
        query: The user query.
        context_docs: Optional list of retrieved documents. It is only used
            to compute and log the reduced document count.
        provider: AI provider name. Accepted for interface compatibility;
            not used by this function.
        model: Model name. Accepted for interface compatibility; not used by
            this function.
        language: Language code forwarded to ``rag``.
        max_retries: Maximum number of attempts.

    Returns:
        Whatever ``rag(query, language=language)`` returns.

    Raises:
        RuntimeError: When every attempt failed with a retryable error. The
            last underlying error is attached as ``__cause__``.
        Exception: Any non-retryable error raised by ``rag``, unchanged.
    """
    token_limit_phrases = (
        'maximum context length', 'token limit', 'too many tokens',
        'content too large', 'request too large', 'input too long',
    )
    transient_phrases = (
        'timeout', 'connection', '503', '502', '429', 'rate limit',
    )

    context_fraction = 1.0
    last_error = None

    for attempt in range(max_retries):
        try:
            if context_fraction < 1.0 and context_docs:
                reduced_count = max(1, int(len(context_docs) * context_fraction))
                # NOTE(review): the reduced document set is only logged here.
                # ``rag`` is called with the query alone, so this does not
                # shrink the request by itself -- confirm whether ``rag``
                # should receive the reduced documents.
                logger.info(f"Using {reduced_count}/{len(context_docs)} context docs "
                            f"({context_fraction:.0%})")

            return rag(query, language=language)

        except Exception as e:
            error_str = str(e).lower()

            # Token limit errors -- reduce context
            if any(phrase in error_str for phrase in token_limit_phrases):
                last_error = e
                context_fraction *= 0.5
                logger.warning(f"Token limit hit, reducing context to {context_fraction:.0%} "
                               f"(attempt {attempt + 1}/{max_retries})")
                continue

            # Transient errors -- retry with backoff
            if any(phrase in error_str for phrase in transient_phrases):
                last_error = e
                if attempt + 1 >= max_retries:
                    # Out of attempts: sleeping would only delay the failure.
                    logger.warning(f"Transient error on final attempt "
                                   f"({attempt + 1}/{max_retries}): {e}")
                    break
                wait_time = 2 ** attempt  # 1s, 2s, 4s
                logger.warning(f"Transient error, retrying in {wait_time}s "
                               f"(attempt {attempt + 1}/{max_retries}): {e}")
                await asyncio.sleep(wait_time)
                continue

            # Non-retryable error
            raise

    # Chain the last retryable error so the root cause stays in the traceback.
    raise RuntimeError(
        f"Failed after {max_retries} retries with context at {context_fraction:.0%}"
    ) from last_error
127+
128+
63129async def handle_websocket_chat (websocket : WebSocket ):
64130 """
65131 Handle WebSocket connection for chat completions.
@@ -83,10 +149,8 @@ async def handle_websocket_chat(websocket: WebSocket):
83149 logger .warning (f"Request exceeds recommended token limit ({ tokens } > 7500)" )
84150 input_too_large = True
85151
86- # Create a new RAG instance for this request
152+ # Create or reuse a cached RAG instance for this request
87153 try :
88- request_rag = RAG (provider = request .provider , model = request .model )
89-
90154 # Extract custom file filter parameters if provided
91155 excluded_dirs = None
92156 excluded_files = None
@@ -106,8 +170,28 @@ async def handle_websocket_chat(websocket: WebSocket):
106170 included_files = [unquote (file_pattern ) for file_pattern in request .included_files .split ('\n ' ) if file_pattern .strip ()]
107171 logger .info (f"Using custom included files: { included_files } " )
108172
109- request_rag .prepare_retriever (request .repo_url , request .type , request .token , excluded_dirs , excluded_files , included_dirs , included_files )
110- logger .info (f"Retriever prepared for { request .repo_url } " )
173+ # Check for a cached RAG session (only when no custom file filters)
174+ has_custom_filters = any ([excluded_dirs , excluded_files , included_dirs , included_files ])
175+ from api .config import get_embedder_type
176+ embedder_type = get_embedder_type ()
177+ session_key = rag_session_manager .get_session_key (request .repo_url , embedder_type ) if not has_custom_filters else None
178+ request_rag = rag_session_manager .get (session_key ) if session_key else None
179+
180+ if request_rag is not None :
181+ # Reuse cached RAG instance, update provider/model for this request
182+ request_rag .provider = request .provider
183+ request_rag .model = request .model
184+ # Reset memory for this new conversation
185+ request_rag .memory = Memory ()
186+ logger .info (f"Reusing cached RAG session for { request .repo_url } " )
187+ else :
188+ # Create a new RAG instance
189+ request_rag = RAG (provider = request .provider , model = request .model )
190+ request_rag .prepare_retriever (request .repo_url , request .type , request .token , excluded_dirs , excluded_files , included_dirs , included_files )
191+ # Cache the session if no custom filters were used
192+ if session_key :
193+ rag_session_manager .put (session_key , request_rag )
194+ logger .info (f"Created new RAG session for { request .repo_url } " )
111195 except ValueError as e :
112196 if "No valid documents with embeddings found" in str (e ):
113197 logger .error (f"No valid embeddings found: { str (e )} " )
@@ -202,10 +286,14 @@ async def handle_websocket_chat(websocket: WebSocket):
202286 rag_query = f"Contexts related to { request .filePath } "
203287 logger .info (f"Modified RAG query to focus on file: { request .filePath } " )
204288
205- # Try to perform RAG retrieval
289+ # Try to perform RAG retrieval with retry logic
206290 try :
207- # This will use the actual RAG implementation
208- retrieved_documents = request_rag (rag_query , language = request .language )
291+ # Use retry wrapper for resilient retrieval
292+ retrieved_documents = await generate_with_retry (
293+ request_rag , rag_query , None ,
294+ request .provider , request .model ,
295+ language = request .language
296+ )
209297
210298 if retrieved_documents and retrieved_documents [0 ].documents :
211299 # Format context for the prompt in a more structured way
0 commit comments