-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapi.py
More file actions
132 lines (108 loc) · 4 KB
/
api.py
File metadata and controls
132 lines (108 loc) · 4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import logging
import os
from pathlib import Path
import lmdb
import tomli
import uvicorn
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from wikisearch.autocomplete.autocompletion_service import AutocompletionService
from wikisearch.db.database_connection import DatabaseConnectionService
from wikisearch.document.document_service import DocumentService
from wikisearch.index.inverted_index import InvertedIndexService
from wikisearch.index.usearch_semantic_index import USearchIndexService
from wikisearch.spell.hunspell_checker import HunSpellChecker
logging.basicConfig(
level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger('WikiSearch')
app = FastAPI()
app.add_middleware(
CORSMiddleware,
allow_origins=["http://localhost:5000"]
)
path_to_config = Path("./config.toml")
with open(path_to_config, "rb") as f:
config = tomli.load(f)
CRAWLER_CONFIG = {
"domain": config["Crawler"].get("Domain", "bg.wikipedia.org"),
"seed_urls": config["Crawler"].get("SeedURLs", "root"),
"crawl_limit": config["Crawler"].get("CrawlLimit", ""),
}
load_dotenv()
DB_CONFIG = {
"host": os.getenv("DB_HOST"),
"user": os.getenv("DB_USER"),
"password": os.getenv("DB_PASSWORD"),
"database": os.getenv("DB_DATABASE"),
}
LMDB_CONFIG = {
"path": config["FileDatabase"].get("Path"),
"size": config["FileDatabase"].get("Size", 10**9)
}
USEARCH_CONFIG = {
"path": config["USearchIndex"].get("Path", "/data/WikiSearchData/SemanticIndex/usearch.index"),
"dimension": config["USearchIndex"].get("Dimension", 768)
}
FAISS_CONFIG = {
"path": config["FAISSIndex"].get("Path", "/data/WikiSearchData/SemanticIndex/faiss.index"),
"dimension": config["FAISSIndex"].get("Dimension", 768)
}
SPELL_CONFIG = {
"aff": config["SpellChecker"].get("AffPath"),
"dic": config["SpellChecker"].get("DicPath"),
"custom_path": config["SpellChecker"].get("CustomDicPath"),
}
AUTOCOMPLETION_CONFIG = {
"word-completion-dawg": config["Autocompletion"].get("WordCompletionDAWG"),
"next-word-dawg": config["Autocompletion"].get("NextWordDAWG"),
}
if not os.path.exists(LMDB_CONFIG["path"]):
os.makedirs(LMDB_CONFIG["path"])
lmdb_env = lmdb.open(LMDB_CONFIG["path"], map_size=int(LMDB_CONFIG["size"]))
with DatabaseConnectionService(DB_CONFIG) as connection:
# add crawler service, if you want to add documents in runtime
inverted_index_service = InvertedIndexService(connection)
semantic_index_service = USearchIndexService(
Path(USEARCH_CONFIG["path"]),
int(USEARCH_CONFIG["dimension"]), 10)
spell_checker_service = HunSpellChecker(
Path(SPELL_CONFIG["aff"]),
Path(SPELL_CONFIG["dic"]))
autocompletion_service = AutocompletionService(
AUTOCOMPLETION_CONFIG["word-completion-dawg"],
AUTOCOMPLETION_CONFIG["next-word-dawg"], 10)
document_service = DocumentService(connection, lmdb_env)
@app.get("/")
async def root():
return {"message": "Welcome to WikiSearch"}
@app.get("/autocomplete")
async def autocomplete(q: str):
if not q:
return []
else:
return autocompletion_service.suggest(q)
@app.get("/search")
async def search(q: str, index: str = "inverted", limit: int = 20, offset: int = 0, spellcheck: bool = True):
q = q.lower()
old_q: str = q
if spellcheck:
q = spell_checker_service.spellcheck(q).lower()
if index == "semantic":
documents = semantic_index_service.search(q, limit, offset)
else:
documents = inverted_index_service.search(q, limit, offset)
results = []
for doc_id, score in documents:
document = document_service.fetch_document(doc_id, score)
results.append(document)
return {
"query": q,
"index": index,
"limit": limit,
"offset": offset,
"correction": (q != old_q),
"results": results
}
if __name__ == "__main__":
uvicorn.run("api:app", host="localhost", port=8080)