Skip to content

Commit f805605

Browse files
committed
feat: update docker-compose.yml to remove standalone service; enhance agent_retriever and ragflow_client for new URL references; modify search_web_tool for RAGFlow search integration; add WEB_UOS collection name and search engine types
1 parent 088dc5f commit f805605

6 files changed

Lines changed: 60 additions & 71 deletions

File tree

docker-compose.yml

Lines changed: 1 addition & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ services:
1818
ports:
1919
- "8501:8501"
2020
depends_on:
21-
- standalone
2221
- redis
2322
healthcheck:
2423
test: ["CMD", "curl", "--fail", "http://localhost:8501/_stcore/health"]
@@ -52,65 +51,7 @@ services:
5251
--maxmemory 1024mb
5352
--maxmemory-policy allkeys-lru
5453
--maxmemory-samples 5
55-
etcd:
56-
container_name: milvus-etcd
57-
image: quay.io/coreos/etcd:v3.5.16
58-
environment:
59-
- ETCD_AUTO_COMPACTION_MODE=revision
60-
- ETCD_AUTO_COMPACTION_RETENTION=1000
61-
- ETCD_QUOTA_BACKEND_BYTES=4294967296
62-
- ETCD_SNAPSHOT_COUNT=50000
63-
volumes:
64-
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
65-
command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
66-
healthcheck:
67-
test: ["CMD", "etcdctl", "endpoint", "health"]
68-
interval: 30s
69-
timeout: 20s
70-
retries: 3
71-
72-
minio:
73-
container_name: milvus-minio
74-
image: minio/minio:RELEASE.2023-03-20T20-16-18Z
75-
environment:
76-
MINIO_ACCESS_KEY: minioadmin
77-
MINIO_SECRET_KEY: minioadmin
78-
ports:
79-
- "9001:9001"
80-
- "9000:9000"
81-
volumes:
82-
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
83-
command: minio server /minio_data --console-address ":9001"
84-
healthcheck:
85-
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
86-
interval: 30s
87-
timeout: 20s
88-
retries: 3
89-
90-
standalone:
91-
container_name: milvus-standalone
92-
image: milvusdb/milvus:v2.5.4
93-
command: ["milvus", "run", "standalone"]
94-
security_opt:
95-
- seccomp:unconfined
96-
environment:
97-
ETCD_ENDPOINTS: etcd:2379
98-
MINIO_ADDRESS: minio:9000
99-
volumes:
100-
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
101-
- ./milvus.yaml:/milvus/configs/milvus.yaml
102-
healthcheck:
103-
test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
104-
interval: 30s
105-
start_period: 90s
106-
timeout: 20s
107-
retries: 3
108-
ports:
109-
- "19530:19530"
110-
- "9091:9091"
111-
depends_on:
112-
- "etcd"
113-
- "minio"
54+
11455

11556
volumes:
11657
sqlite-data:

src/chatbot/agents/utils/agent_retriever.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from src.config.core_config import settings
88
from src.config.models import CollectionNames, VectorDBTypes
99

10+
1011
DOCUMENT_SEPARATOR = "\n\n"
1112
NOT_FOUND_MESSAGE = "Result: No documents found"
1213

@@ -22,6 +23,7 @@ class Reference(NamedTuple):
2223
doc_id: str | None = None
2324
# TODO Delete once metadata is added to RAGFlow API (user to reference FAQ source)
2425
url_reference_askuos: str | None = None
26+
url_reference_web_uos: str | None = None
2527

2628

2729
def retrieve_from_infinity_ragflow(collection_name: str, query: str):
@@ -40,6 +42,7 @@ def retrieve_from_infinity_ragflow(collection_name: str, query: str):
4042
page,
4143
retrieved_item.chunk.document_id,
4244
retrieved_item.chunk.url_reference_askuos,
45+
retrieved_item.chunk.url_reference_web_uos,
4346
)
4447
)
4548
results.append(f"Source: {source} \nText: {retrieved_item.chunk.content}")

src/chatbot/db/ragflow_client.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import re
23
import threading
34
from typing import Any, List, NamedTuple, Optional
45

@@ -42,6 +43,18 @@ def url_reference_askuos(self) -> str:
4243
file_name = os.path.splitext(self.document_keyword.replace("_", "/"))[0]
4344
return f"{FAQ_BASE_URL}{file_name}"
4445

46+
@property
47+
def url_reference_web_uos(self):
48+
"""Extract metadata from markdown content."""
49+
50+
# Decode bytes to string if needed
51+
52+
match = re.search(r'url:\s*"([^"]+)"', self.content)
53+
if match:
54+
url = match.group(1)
55+
56+
return url or None
57+
4558
@property
4659
def page(self) -> int:
4760
"""Compute the page number from positions."""

src/chatbot/tools/search_web_tool.py

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from langchain.text_splitter import RecursiveCharacterTextSplitter
2929

3030
from src.chatbot.agents.utils.agent_helpers import llm_optional as sumarize_llm
31+
from src.chatbot.agents.utils.agent_retriever import retrieve_from_infinity_ragflow
3132

3233
# from src.chatbot.db.redis_client import redis_manager
3334
from src.chatbot.tools.utils.custom_crawl import (
@@ -42,6 +43,7 @@
4243
from src.chatbot.tools.utils.tool_helpers import decode_string
4344
from src.chatbot_log.chatbot_logger import logger
4445
from src.config.core_config import settings
46+
from src.config.models import CollectionNames, SearchEngineTypes, VectorDBTypes
4547

4648
colorama.init(strip=True)
4749

@@ -363,16 +365,39 @@ async def async_search(client, **kwargs) -> Tuple[str, List]:
363365

364366
agent_executor = kwargs["agent_executor"]
365367

366-
visited_urls, contents = await visit_urls_extract(
367-
url=url,
368-
query=query,
369-
agent_executor=agent_executor,
370-
about_application=about_application,
371-
do_not_visit_links=do_not_visit_links,
372-
client=client,
373-
)
368+
def extract_urls_from_content(refs):
369+
visited_urls = []
370+
for r in refs:
371+
visited_urls.append(r.url_reference_web_uos)
372+
return visited_urls
373+
374+
SEARCH_TYPE = settings.application.search_engine_type
375+
if SEARCH_TYPE == SearchEngineTypes.RAGFlow_search:
376+
try:
377+
contents, ref = retrieve_from_infinity_ragflow(
378+
CollectionNames.WEB_UOS, query
379+
)
380+
visited_urls = extract_urls_from_content(ref)
381+
final_output = contents
382+
print()
383+
384+
except Exception as e:
385+
logger.error(f"[RAGFlow] Error during retrieval: {e}")
386+
final_output = ""
387+
visited_urls = []
374388

375-
final_output = "\n".join(contents)
389+
else:
390+
391+
visited_urls, contents = await visit_urls_extract(
392+
url=url,
393+
query=query,
394+
agent_executor=agent_executor,
395+
about_application=about_application,
396+
do_not_visit_links=do_not_visit_links,
397+
client=client,
398+
)
399+
400+
final_output = "\n".join(contents)
376401

377402
if final_output:
378403
# For testing
@@ -383,7 +408,7 @@ async def async_search(client, **kwargs) -> Tuple[str, List]:
383408
logger.info(
384409
f"[SEARCH] Final output (search + prompt): {final_output_tokens}"
385410
)
386-
411+
# TODO: change the cache_key if the search engine is ragflow
387412
# Cache results
388413
if len(final_output) > 20:
389414
cache_value = str((final_output, visited_urls))

src/config/models.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@
66
EmbeddingType = Literal["FastEmbed", "Ollama"]
77

88

9+
class SearchEngineTypes(str, Enum):
10+
GOOGLE_CUSTOM_SEARCH = "GoogleSearch"
11+
RAGFlow_search = "RAGFlowSearch"
12+
13+
914
class VectorDBTypes(str, Enum):
1015
MILVUS = "Milvus"
1116
INFINITY_RAGFLOW = "Infinity-RAGFlow"
@@ -19,6 +24,7 @@ class CollectionNames(str, Enum):
1924
EXAMINATION_REGULATIONS = "examination_regulations"
2025
FAQ = "faq"
2126
TROUBLESHOOTING = "troubleshooting"
27+
WEB_UOS = "WEB_UOS"
2228

2329

2430
class SearchConfig(BaseModel):
@@ -57,6 +63,7 @@ class ApplicationConfig(BaseModel):
5763
recursion_limit: int = 12
5864
tracing: bool = False
5965
opik_project_name: str = "askUOSTesting"
66+
search_engine_type: SearchEngineTypes = SearchEngineTypes.GOOGLE_CUSTOM_SEARCH
6067

6168

6269
class EmbeddingConnectionSettings(BaseModel):

tests/warm_up.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
# do not include REPEATED queries or very similar queries (tests will fail)
1111
warm_up_queries = [
12+
"Welche Schnupperangebote bietet die Uni OS?",
1213
"According to the examination regulations, how are the thesis and oral exam graded?, Mathematics",
1314
"Wo liegt der NC bei Sport?",
1415
"hi",
@@ -21,7 +22,6 @@
2122
"Wie viele ECTS-Punkte habe ich in meinem Bachelor (Biologie)?",
2223
"Muss ich im Grundschullehramt Mathe und Deutsch studieren?",
2324
"I cannot log into HisInOne, what can I do?",
24-
"Welche Schnupperangebote bietet die Uni OS?",
2525
"Kann ich Biologie und Sport auf Lehramt studieren?",
2626
"What are the application deadlines for the fall and spring semesters for the Computer Science Program?",
2727
"Was kann ich tun, wenn ich keinen Studienplatz im Master Sport bekommen habe?",

0 commit comments

Comments (0)