Skip to content

Commit f805605

Browse files
committed
feat: update docker-compose.yml to remove standalone service; enhance agent_retriever and ragflow_client for new URL references; modify search_web_tool for RAGFlow search integration; add WEB_UOS collection name and search engine types
1 parent 088dc5f commit f805605

6 files changed

Lines changed: 60 additions & 71 deletions

File tree

docker-compose.yml

Lines changed: 1 addition & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ services:
1818
ports:
1919
- "8501:8501"
2020
depends_on:
21-
- standalone
2221
- redis
2322
healthcheck:
2423
test: ["CMD", "curl", "--fail", "http://localhost:8501/_stcore/health"]
@@ -52,65 +51,7 @@ services:
5251
--maxmemory 1024mb
5352
--maxmemory-policy allkeys-lru
5453
--maxmemory-samples 5
55-
etcd:
56-
container_name: milvus-etcd
57-
image: quay.io/coreos/etcd:v3.5.16
58-
environment:
59-
- ETCD_AUTO_COMPACTION_MODE=revision
60-
- ETCD_AUTO_COMPACTION_RETENTION=1000
61-
- ETCD_QUOTA_BACKEND_BYTES=4294967296
62-
- ETCD_SNAPSHOT_COUNT=50000
63-
volumes:
64-
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd
65-
command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
66-
healthcheck:
67-
test: ["CMD", "etcdctl", "endpoint", "health"]
68-
interval: 30s
69-
timeout: 20s
70-
retries: 3
71-
72-
minio:
73-
container_name: milvus-minio
74-
image: minio/minio:RELEASE.2023-03-20T20-16-18Z
75-
environment:
76-
MINIO_ACCESS_KEY: minioadmin
77-
MINIO_SECRET_KEY: minioadmin
78-
ports:
79-
- "9001:9001"
80-
- "9000:9000"
81-
volumes:
82-
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data
83-
command: minio server /minio_data --console-address ":9001"
84-
healthcheck:
85-
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
86-
interval: 30s
87-
timeout: 20s
88-
retries: 3
89-
90-
standalone:
91-
container_name: milvus-standalone
92-
image: milvusdb/milvus:v2.5.4
93-
command: ["milvus", "run", "standalone"]
94-
security_opt:
95-
- seccomp:unconfined
96-
environment:
97-
ETCD_ENDPOINTS: etcd:2379
98-
MINIO_ADDRESS: minio:9000
99-
volumes:
100-
- ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus
101-
- ./milvus.yaml:/milvus/configs/milvus.yaml
102-
healthcheck:
103-
test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
104-
interval: 30s
105-
start_period: 90s
106-
timeout: 20s
107-
retries: 3
108-
ports:
109-
- "19530:19530"
110-
- "9091:9091"
111-
depends_on:
112-
- "etcd"
113-
- "minio"
54+
11455

11556
volumes:
11657
sqlite-data:

src/chatbot/agents/utils/agent_retriever.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from src.config.core_config import settings
88
from src.config.models import CollectionNames, VectorDBTypes
99

10+
1011
DOCUMENT_SEPARATOR = "\n\n"
1112
NOT_FOUND_MESSAGE = "Result: No documents found"
1213

@@ -22,6 +23,7 @@ class Reference(NamedTuple):
2223
doc_id: str | None = None
2324
# TODO Delete once metadata is added to RAGFlow API (user to reference FAQ source)
2425
url_reference_askuos: str | None = None
26+
url_reference_web_uos: str | None = None
2527

2628

2729
def retrieve_from_infinity_ragflow(collection_name: str, query: str):
@@ -40,6 +42,7 @@ def retrieve_from_infinity_ragflow(collection_name: str, query: str):
4042
page,
4143
retrieved_item.chunk.document_id,
4244
retrieved_item.chunk.url_reference_askuos,
45+
retrieved_item.chunk.url_reference_web_uos,
4346
)
4447
)
4548
results.append(f"Source: {source} \nText: {retrieved_item.chunk.content}")

src/chatbot/db/ragflow_client.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
import re
23
import threading
34
from typing import Any, List, NamedTuple, Optional
45

@@ -42,6 +43,18 @@ def url_reference_askuos(self) -> str:
4243
file_name = os.path.splitext(self.document_keyword.replace("_", "/"))[0]
4344
return f"{FAQ_BASE_URL}{file_name}"
4445

46+
@property
47+
def url_reference_web_uos(self):
48+
"""Extract metadata from markdown content."""
49+
50+
# Decode bytes to string if needed
51+
52+
match = re.search(r'url:\s*"([^"]+)"', self.content)
53+
if match:
54+
url = match.group(1)
55+
56+
return url or None
57+
4558
@property
4659
def page(self) -> int:
4760
"""Compute the page number from positions."""

src/chatbot/tools/search_web_tool.py

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from langchain.text_splitter import RecursiveCharacterTextSplitter
2929

3030
from src.chatbot.agents.utils.agent_helpers import llm_optional as sumarize_llm
31+
from src.chatbot.agents.utils.agent_retriever import retrieve_from_infinity_ragflow
3132

3233
# from src.chatbot.db.redis_client import redis_manager
3334
from src.chatbot.tools.utils.custom_crawl import (
@@ -42,6 +43,7 @@
4243
from src.chatbot.tools.utils.tool_helpers import decode_string
4344
from src.chatbot_log.chatbot_logger import logger
4445
from src.config.core_config import settings
46+
from src.config.models import CollectionNames, SearchEngineTypes, VectorDBTypes
4547

4648
colorama.init(strip=True)
4749

@@ -363,16 +365,39 @@ async def async_search(client, **kwargs) -> Tuple[str, List]:
363365

364366
agent_executor = kwargs["agent_executor"]
365367

366-
visited_urls, contents = await visit_urls_extract(
367-
url=url,
368-
query=query,
369-
agent_executor=agent_executor,
370-
about_application=about_application,
371-
do_not_visit_links=do_not_visit_links,
372-
client=client,
373-
)
368+
def extract_urls_from_content(refs):
369+
visited_urls = []
370+
for r in refs:
371+
visited_urls.append(r.url_reference_web_uos)
372+
return visited_urls
373+
374+
SEARCH_TYPE = settings.application.search_engine_type
375+
if SEARCH_TYPE == SearchEngineTypes.RAGFlow_search:
376+
try:
377+
contents, ref = retrieve_from_infinity_ragflow(
378+
CollectionNames.WEB_UOS, query
379+
)
380+
visited_urls = extract_urls_from_content(ref)
381+
final_output = contents
382+
print()
383+
384+
except Exception as e:
385+
logger.error(f"[RAGFlow] Error during retrieval: {e}")
386+
final_output = ""
387+
visited_urls = []
374388

375-
final_output = "\n".join(contents)
389+
else:
390+
391+
visited_urls, contents = await visit_urls_extract(
392+
url=url,
393+
query=query,
394+
agent_executor=agent_executor,
395+
about_application=about_application,
396+
do_not_visit_links=do_not_visit_links,
397+
client=client,
398+
)
399+
400+
final_output = "\n".join(contents)
376401

377402
if final_output:
378403
# For testing
@@ -383,7 +408,7 @@ async def async_search(client, **kwargs) -> Tuple[str, List]:
383408
logger.info(
384409
f"[SEARCH] Final output (search + prompt): {final_output_tokens}"
385410
)
386-
411+
# TODO: change the cache_key if the search engine is ragflow
387412
# Cache results
388413
if len(final_output) > 20:
389414
cache_value = str((final_output, visited_urls))

src/config/models.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@
66
EmbeddingType = Literal["FastEmbed", "Ollama"]
77

88

9+
class SearchEngineTypes(str, Enum):
10+
GOOGLE_CUSTOM_SEARCH = "GoogleSearch"
11+
RAGFlow_search = "RAGFlowSearch"
12+
13+
914
class VectorDBTypes(str, Enum):
1015
MILVUS = "Milvus"
1116
INFINITY_RAGFLOW = "Infinity-RAGFlow"
@@ -19,6 +24,7 @@ class CollectionNames(str, Enum):
1924
EXAMINATION_REGULATIONS = "examination_regulations"
2025
FAQ = "faq"
2126
TROUBLESHOOTING = "troubleshooting"
27+
WEB_UOS = "WEB_UOS"
2228

2329

2430
class SearchConfig(BaseModel):
@@ -57,6 +63,7 @@ class ApplicationConfig(BaseModel):
5763
recursion_limit: int = 12
5864
tracing: bool = False
5965
opik_project_name: str = "askUOSTesting"
66+
search_engine_type: SearchEngineTypes = SearchEngineTypes.GOOGLE_CUSTOM_SEARCH
6067

6168

6269
class EmbeddingConnectionSettings(BaseModel):

tests/warm_up.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
# do not include REPEATED queries or very similar queries (tests will fail)
1111
warm_up_queries = [
12+
"Welche Schnupperangebote bietet die Uni OS?",
1213
"According to the examination regulations, how are the thesis and oral exam graded?, Mathematics",
1314
"Wo liegt der NC bei Sport?",
1415
"hi",
@@ -21,7 +22,6 @@
2122
"Wie viele ECTS-Punkte habe ich in meinem Bachelor (Biologie)?",
2223
"Muss ich im Grundschullehramt Mathe und Deutsch studieren?",
2324
"I cannot log into HisInOne, what can I do?",
24-
"Welche Schnupperangebote bietet die Uni OS?",
2525
"Kann ich Biologie und Sport auf Lehramt studieren?",
2626
"What are the application deadlines for the fall and spring semesters for the Computer Science Program?",
2727
"Was kann ich tun, wenn ich keinen Studienplatz im Master Sport bekommen habe?",

0 commit comments

Comments (0)