Skip to content

Commit dbd96be

Browse files
INITIALIZED NEW ARCHITECTURE
1 parent ebaf4a4 commit dbd96be

4 files changed

Lines changed: 131 additions & 6 deletions

File tree

new_architecture_v4.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# Imports
2+
import os
3+
import json
4+
from dotenv import load_dotenv
5+
from sentence_transformers import SentenceTransformer
6+
from unstructured.partition.pdf import partition_pdf
7+
from unstructured.partition.docx import partition_docx
8+
from langchain.text_splitter import RecursiveCharacterTextSplitter
9+
from pinecone import Pinecone, ServerlessSpec
10+
# Pull settings (such as the Pinecone API key) from a local .env file into
# the process environment before anything reads them.
load_dotenv()

# 1. Embedding model
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# 2. Pinecone client (new SDK)
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY_NEW"))

# 3. Create the index on first use, then open a handle to it
index_name = "medical-chatbot-index"
_index_exists = index_name in pc.list_indexes().names()
if not _index_exists:
    pc.create_index(
        name=index_name,
        dimension=768,  # mpnet-base-v2 output size
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)

print("HERE")

# 4. Quick sanity check: embed one sentence and report the vector shape
sample_text = "WHO guidelines for managing preeclampsia"
embedding_vector = embedding_model.encode(sample_text, convert_to_numpy=True)
print("Embedding vector shape:", embedding_vector.shape)  # (768,)

# Shared text splitter used when chunking parsed document elements.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,  # keeps chunks semantic + embedding friendly
    chunk_overlap=100,
)
39+
40+
def parse_document(filepath):
    """Parse a PDF or DOCX file into Unstructured elements.

    The extension check is case-insensitive, so ``REPORT.PDF`` and
    ``Notes.Docx`` are treated like their lowercase counterparts.

    Args:
        filepath: Path of the document to parse (``str`` or ``os.PathLike``).

    Returns:
        Whatever ``partition_pdf`` (called with ``strategy="hi_res"``) or
        ``partition_docx`` returns for the file.

    Raises:
        ValueError: If the path ends in neither ``.pdf`` nor ``.docx``.
    """
    path = os.fspath(filepath)
    # Compare on a lowercased copy only; the original path is what gets opened.
    lowered = path.lower()
    if lowered.endswith(".pdf"):
        return partition_pdf(filename=path, strategy="hi_res")
    if lowered.endswith(".docx"):
        return partition_docx(filename=path)
    raise ValueError(f"Unsupported file type: {path!r}")
49+
50+
51+
def process_elements(elements, doc_name):
52+
"""Convert unstructured elements → semantic chunks with metadata."""
53+
docs = []
54+
55+
for i, el in enumerate(elements):
56+
el_text = el.text.strip()
57+
if not el_text:
58+
continue
59+
60+
# Split into smaller semantic chunks
61+
chunks = splitter.split_text(el_text)
62+
63+
for j, chunk in enumerate(chunks):
64+
metadata = {
65+
"doc_name": doc_name,
66+
"section_type": el.category, # paragraph, title, table, list, etc.
67+
"page_num": getattr(el.metadata, "page_number", None),
68+
"chunk_id": f"{i}-{j}",
69+
}
70+
71+
docs.append({"text": chunk, "metadata": metadata})
72+
73+
return docs
74+
75+
76+
def embed_and_push(docs, batch_size=100):
    """Embed chunk texts and upsert them into Pinecone in batches.

    Texts are encoded one batch at a time and each batch is sent in its own
    upsert request, so a large document does not end up in a single oversized
    request. Because batches are sent independently, a failure part-way
    through leaves the earlier batches already stored.

    Args:
        docs: Iterable of ``{"text": str, "metadata": dict}`` items; each
            metadata dict must contain ``"doc_name"`` and ``"chunk_id"``.
        batch_size: Maximum number of vectors per encode/upsert round.

    Returns:
        None.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    docs = list(docs)
    if not docs:
        # Nothing to embed; avoid sending an empty upsert request.
        return

    for start in range(0, len(docs), batch_size):
        batch = docs[start:start + batch_size]
        embeddings = embedding_model.encode([d["text"] for d in batch])

        vectors = []
        for d, emb in zip(batch, embeddings):
            # Drop None values: Pinecone's documentation states null metadata
            # values are not supported.
            metadata = {k: v for k, v in d["metadata"].items() if v is not None}
            metadata["text"] = d["text"]
            vectors.append(
                {
                    "id": f'{d["metadata"]["doc_name"]}-{d["metadata"]["chunk_id"]}',
                    "values": emb.tolist(),
                    "metadata": metadata,
                }
            )

        index.upsert(vectors)
91+
92+
93+
# --------------------------
94+
# Main Loop (overnight run)
95+
# --------------------------
96+
97+
def process_folder(folder_path):
    """Parse, chunk, embed and upload every PDF/DOCX file in a folder.

    Only direct children of ``folder_path`` are considered. Entries are
    visited in sorted name order so repeated runs process files in the same
    sequence. Extension matching is case-sensitive (``.pdf`` / ``.docx``),
    and entries that are not regular files are skipped even if their name
    carries one of those extensions.

    Any exception raised while handling a file propagates to the caller and
    stops the run.

    Args:
        folder_path: Directory containing the documents to ingest.

    Returns:
        None.
    """
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith((".pdf", ".docx")):
            continue

        filepath = os.path.join(folder_path, filename)
        # A directory named like "scans.pdf" must not reach the parser.
        if not os.path.isfile(filepath):
            continue

        print(f"Processing {filename}...")
        elements = parse_document(filepath)
        docs = process_elements(elements, doc_name=filename)
        embed_and_push(docs)
        print(f"✅ {filename} processed and pushed to Pinecone.")
108+
109+
110+
# --------------------------
111+
# Run
112+
# --------------------------
113+
if __name__ == "__main__":
    import sys

    # Folder used when no argument is given; pass another folder as the first
    # command-line argument to ingest documents from elsewhere.
    default_folder = r"D:\Documents\RHL-RAG-PROJECT\FILES"
    folder = sys.argv[1] if len(sys.argv) > 1 else default_folder
    process_folder(folder)

new_architeture_v3.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -311,8 +311,9 @@ def medical_chatbot_pipeline(query, chat_history, retrieved_chunks, context_foll
311311
chitchat_prompt = PromptTemplate( input_variables=["conversation", "chat_history"],
312312
template=""" I am your cheerful bot 😃.
313313
Rules: -
314-
- Always use professional words even if user has no professional words.
315-
- Reply to {conversation} in a friendly, chatty yet very professional tone respond with witty, empathetic tone.
314+
- Be humourous and chirpy
315+
- You have no technical expertise you can just reply to {conversation} in such a way that adresses customers requests in a friendly, chatty yet very professional tone respond with witty, empathetic tone.
316+
- Refraining in giving technical response reply in a formal conversation bot style and insist user to ask any medical questions
316317
- Refrain from answering any off-topic questions, delegate to ask users to asking about medical questions
317318
For eg :
318319
User : how are you doing ?
@@ -385,10 +386,10 @@ def route_intent(user_message: str):
385386
context_answer = "\n\n".join(selected_chunks[:4])
386387
context_followup = "\n\n".join(selected_chunks[4:6] if len(selected_chunks) > 4 else [])
387388

388-
print("context_answer==========", context_answer)
389+
#print("context_answer==========", context_answer)
389390
print()
390391
print()
391-
print("context_followup===========", context_followup)
392+
#print("context_followup===========", context_followup)
392393
print()
393394

394395
answer = medical_chatbot_pipeline(rewritten,chat_history,context_answer,context_followup,llm,llm)

requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,6 @@ langchain-huggingface
77
pinecone
88
python-dotenv
99
sentence-transformers
10+
fitz
11+
frontend
12+
unstructured

test.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,13 @@
1-
a= "hsakjfcbsjkvbosd;avbn wsdkbv safvasfvasf\n\nafgdasfgwefgasdfgv\nvsfgvsgvasdgvsedfv"
2-
print(a)
31

42

3+
4+
5+
6+
from dotenv import load_dotenv
7+
import os
8+
import os
9+
load_dotenv()
# Report only whether the key was loaded; printing its value would leak the
# secret into terminals and log files.
print("PINECONE_API_KEY is set" if os.getenv("PINECONE_API_KEY") else "PINECONE_API_KEY is missing")
511
## follow up question basis answer and the prompt
612

713
# shiva ellur

0 commit comments

Comments
 (0)