Commit a9249e1

added lcc code
1 parent dbd96be commit a9249e1

4 files changed

Lines changed: 185 additions & 77 deletions

=2.12

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
Collecting pybind11
  Downloading pybind11-3.0.1-py3-none-any.whl.metadata (10.0 kB)
  Downloading pybind11-3.0.1-py3-none-any.whl (293 kB)
Installing collected packages: pybind11
Successfully installed pybind11-3.0.1

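(A note on this file: `=2.12` looks like accidentally committed shell output. In a command such as pip install pybind11 >=2.12, the unquoted specifier `>=2.12` is treated by the shell as an output redirection, so pip's log is written to a file literally named `=2.12`. Quoting the requirement, as in pip install "pybind11>=2.12", avoids this.)
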
ingest_lcc.py

Lines changed: 176 additions & 0 deletions
@@ -0,0 +1,176 @@
# ingest_lcc.py

import os
import json
import time
import hashlib

import fitz  # PyMuPDF, used for PDF text extraction
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
#from unstructured.partition.pdf import partition_pdf
#from unstructured.partition.docx import partition_docx
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI  # works for both Anthropic + OpenAI-compatible APIs

# ---------------------------------------------------------------------
# 1. Setup
# ---------------------------------------------------------------------
load_dotenv()
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY_NEW"))
index_name = "medical-chatbot-index"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,  # all-mpnet-base-v2 produces 768-dim embeddings
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
index = pc.Index(index_name)

llm_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))  # set in .env

print("passed all imports ........")
# ---------------------------------------------------------------------
# 2. Extraction
# ---------------------------------------------------------------------
def extract_elements(filepath):
    docs = []
    if filepath.endswith(".pdf"):
        doc = fitz.open(filepath)
        print(f"..... at {filepath}")
        for i, page in enumerate(doc):
            text = page.get_text("text")
            if text.strip():
                docs.append({
                    "text": text,
                    "type": "page",
                    "metadata": {"page": i + 1}
                })
    elif filepath.endswith(".docx"):
        # imported lazily so PDF-only runs don't need unstructured installed
        from unstructured.partition.docx import partition_docx
        elements = partition_docx(filepath)
        for el in elements:
            if el.text.strip():
                docs.append({
                    "text": el.text,
                    "type": el.category,
                    "metadata": {}
                })
    else:
        raise ValueError("Unsupported file type")
    return docs


# ---------------------------------------------------------------------
# 3. Splitting
# ---------------------------------------------------------------------
def split_documents(docs, chunk_size=600, chunk_overlap=120):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    all_chunks = []
    for doc in docs:
        chunks = splitter.split_text(doc["text"])
        for i, chunk in enumerate(chunks):
            all_chunks.append({
                "chunk": chunk,
                "metadata": {**doc["metadata"], "type": doc["type"], "chunk_id": i}
            })
    return all_chunks


# ---------------------------------------------------------------------
# 4. Anthropic-Style LCC Rewriting
# ---------------------------------------------------------------------
def rewrite_chunk_with_context(chunks, window=1):
    rewritten_chunks = []
    for i, c in enumerate(chunks):
        # Gather the neighboring chunk at distance `window` on each side.
        neighbors = []
        if i - window >= 0:
            neighbors.append(chunks[i - window]["chunk"])
        if i + window < len(chunks):
            neighbors.append(chunks[i + window]["chunk"])

        context_text = "\n".join(neighbors)
        prompt = f"""
You are an agent who is an expert in chunking: take a particular chunk and augment it with local context so that the chunk has a standalone meaning of its own.

For example:

Context_text: Jaundice is a disease, it is in the liver
Chunk: It is a dangerous disease

Rewritten chunk: Jaundice is a dangerous disease in the liver

Chunk:
{c["chunk"]}

Neighboring context:
{context_text}

Rewritten, self-contained chunk:
"""
        # LLM call (Anthropic Claude or OpenAI GPT-4o, depending on API key)
        resp = llm_client.chat.completions.create(
            model="gpt-4o-mini",  # swap with Anthropic Claude if needed
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
            max_tokens=300
        )
        rewritten = resp.choices[0].message.content.strip()
        rewritten_chunks.append({
            "chunk": rewritten,
            "metadata": c["metadata"]
        })

        # avoid hammering the API
        time.sleep(0.3)
    return rewritten_chunks


# ---------------------------------------------------------------------
# 5. Embedding + Pinecone
# ---------------------------------------------------------------------
def batch_embed_upsert(chunks, batch_size=32):
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        texts = [c["chunk"] for c in batch]
        embeddings = embedding_model.encode(texts, convert_to_numpy=True)

        vectors = []
        for j, (text, emb) in enumerate(zip(texts, embeddings)):
            # Content-hash ID: identical text upserts to the same vector on re-runs.
            uid = hashlib.md5(text.encode()).hexdigest()
            vectors.append({
                "id": uid,
                "values": emb.tolist(),
                "metadata": {
                    "text": text,
                    **batch[j]["metadata"]
                }
            })

        index.upsert(vectors)
        print(f"Upserted {len(vectors)} vectors")


# ---------------------------------------------------------------------
# 6. Orchestration
# ---------------------------------------------------------------------
def process_file(filepath):
    docs = extract_elements(filepath)
    base_chunks = split_documents(docs)
    lcc_chunks = rewrite_chunk_with_context(base_chunks, window=1)
    batch_embed_upsert(lcc_chunks)


if __name__ == "__main__":
    folder = r"D:\Documents\RHL-RAG-PROJECT\FILES"
    for file in os.listdir(folder):
        if file.endswith((".pdf", ".docx")):
            print(f"processing {file}....")
            process_file(os.path.join(folder, file))

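The file above covers ingestion only. As a companion, here is a minimal retrieval sketch against the index this script builds — a hedged example rather than part of the commit: the filename query_lcc.py, the question text, and top_k=5 are illustrative, and it assumes the same environment variables, index name, and embedding model as ingest_lcc.py.

# query_lcc.py  (hypothetical companion script, not in this commit)
import os

from dotenv import load_dotenv
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

load_dotenv()

# Same model and index as ingest_lcc.py, so query and document vectors match.
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
index = Pinecone(api_key=os.getenv("PINECONE_API_KEY_NEW")).Index("medical-chatbot-index")

question = "What are the symptoms of jaundice?"  # illustrative query
query_vec = embedding_model.encode(question).tolist()

# Cosine-similarity search; metadata carries the rewritten (LCC) chunk text.
results = index.query(vector=query_vec, top_k=5, include_metadata=True)
for match in results.matches:
    print(round(match.score, 3), "-", match.metadata["text"][:120])
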
new_architecture_v4.py

Lines changed: 3 additions & 77 deletions
@@ -8,6 +8,9 @@
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from pinecone import Pinecone, ServerlessSpec
 load_dotenv()
+from pdf2image import convert_from_path
+
+
 # 1. Embedding model
 embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

@@ -36,80 +39,3 @@
     chunk_size=800,   # keeps chunks semantic + embedding friendly
     chunk_overlap=100
 )
-
-def parse_document(filepath):
-    """Parse PDF/DOCX using Unstructured to preserve layout info."""
-    if filepath.endswith(".pdf"):
-        elements = partition_pdf(filename=filepath, strategy="hi_res")
-    elif filepath.endswith(".docx"):
-        elements = partition_docx(filename=filepath)
-    else:
-        raise ValueError("Unsupported file type.")
-    return elements
-
-
-def process_elements(elements, doc_name):
-    """Convert unstructured elements → semantic chunks with metadata."""
-    docs = []
-
-    for i, el in enumerate(elements):
-        el_text = el.text.strip()
-        if not el_text:
-            continue
-
-        # Split into smaller semantic chunks
-        chunks = splitter.split_text(el_text)
-
-        for j, chunk in enumerate(chunks):
-            metadata = {
-                "doc_name": doc_name,
-                "section_type": el.category,  # paragraph, title, table, list, etc.
-                "page_num": getattr(el.metadata, "page_number", None),
-                "chunk_id": f"{i}-{j}",
-            }
-
-            docs.append({"text": chunk, "metadata": metadata})
-
-    return docs
-
-
-def embed_and_push(docs):
-    """Embed and upsert into Pinecone with metadata."""
-    vectors = []
-    for d in docs:
-        emb = embedding_model.encode(d["text"]).tolist()
-        vec_id = f'{d["metadata"]["doc_name"]}-{d["metadata"]["chunk_id"]}'
-        vectors.append(
-            {
-                "id": vec_id,
-                "values": emb,
-                "metadata": {**d["metadata"], "text": d["text"]}
-            }
-        )
-
-    index.upsert(vectors)
-
-
-# --------------------------
-# Main Loop (overnight run)
-# --------------------------
-
-def process_folder(folder_path):
-    for filename in os.listdir(folder_path):
-        filepath = os.path.join(folder_path, filename)
-        if not (filename.endswith(".pdf") or filename.endswith(".docx")):
-            continue
-
-        print(f"Processing {filename}...")
-        elements = parse_document(filepath)
-        docs = process_elements(elements, doc_name=filename)
-        embed_and_push(docs)
-        print(f"✅ {filename} processed and pushed to Pinecone.")
-
-
-# --------------------------
-# Run
-# --------------------------
-if __name__ == "__main__":
-    folder = r"D:\Documents\RHL-RAG-PROJECT\FILES"  # change to your folder path
-    process_folder(folder)

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -10,3 +10,4 @@ sentence-transformers
 fitz
 frontend
 unstructured
+pdfminer
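
A hedged caution on these requirements: the `import fitz` in ingest_lcc.py is provided by the PyMuPDF distribution, whereas the PyPI package literally named `fitz` is an unrelated project (the `frontend` entry above is one of its dependencies). If PDF extraction fails with this file installed, installing PyMuPDF directly usually resolves it:

pip install PyMuPDF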
