|
| 1 | +# ingest_lcc.py |
| 2 | + |
| 3 | +import os |
| 4 | +import json |
| 5 | +import time |
| 6 | +import hashlib |
| 7 | +from dotenv import load_dotenv |
| 8 | + |
| 9 | +from sentence_transformers import SentenceTransformer |
| 10 | +#from unstructured.partition.pdf import partition_pdf |
| 11 | +#from unstructured.partition.docx import partition_docx |
| 12 | +from langchain.text_splitter import RecursiveCharacterTextSplitter |
| 13 | +from pinecone import Pinecone, ServerlessSpec |
| 14 | +from openai import OpenAI # works for both Anthropic + OpenAI-compatible APIs |
| 15 | + |
# ---------------------------------------------------------------------
# 1. Setup
# ---------------------------------------------------------------------
load_dotenv()  # pull API keys from .env into the process environment
# Local embedding model; all-mpnet-base-v2 produces 768-dim vectors.
embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY_NEW"))
index_name = "medical-chatbot-index"

# Create the index on first run; dimension 768 must match the embedding
# model's output size above.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
index = pc.Index(index_name)

llm_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))  # set in .env

print("passed all imports ........")
# ---------------------------------------------------------------------
# 2. Extraction
# ---------------------------------------------------------------------
import fitz  # PyMuPDF — NOTE(review): belongs in the top-of-file import block

def extract_elements(filepath):
    """Extract text elements from a .pdf or .docx file.

    Args:
        filepath: path to the source document; extension is matched
            case-insensitively.

    Returns:
        A list of dicts shaped {"text": str, "type": str, "metadata": dict}.
        PDFs yield one entry per non-blank page with a 1-based page number;
        .docx files yield one entry per non-blank unstructured element.

    Raises:
        ValueError: if the file has any other extension.
    """
    docs = []
    lowered = filepath.lower()  # accept .PDF / .DOCX as well
    if lowered.endswith(".pdf"):
        doc = fitz.open(filepath)
        print(f"..... at {filepath}")
        try:
            for page_no, page in enumerate(doc, start=1):
                text = page.get_text("text")
                if text.strip():  # skip blank / image-only pages
                    docs.append({
                        "text": text,
                        "type": "page",
                        "metadata": {"page": page_no},
                    })
        finally:
            doc.close()  # fix: original never closed the document handle
    elif lowered.endswith(".docx"):
        # Local import keeps the heavy `unstructured` dependency lazy.
        from unstructured.partition.docx import partition_docx
        for el in partition_docx(filepath):
            if el.text.strip():
                docs.append({
                    "text": el.text,
                    "type": el.category,
                    "metadata": {},
                })
    else:
        raise ValueError(f"Unsupported file type: {filepath}")
    return docs
| 68 | + |
| 69 | + |
| 70 | +# --------------------------------------------------------------------- |
| 71 | +# 3. Splitting |
| 72 | +# --------------------------------------------------------------------- |
def split_documents(docs, chunk_size=600, chunk_overlap=120):
    """Split extracted documents into overlapping text chunks.

    Each output item is {"chunk": str, "metadata": dict}; the metadata
    carries the source document's own metadata plus its element type and
    a per-document chunk_id.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return [
        {
            "chunk": piece,
            "metadata": {**doc["metadata"], "type": doc["type"], "chunk_id": idx},
        }
        for doc in docs
        for idx, piece in enumerate(splitter.split_text(doc["text"]))
    ]
| 86 | + |
| 87 | + |
| 88 | +# --------------------------------------------------------------------- |
| 89 | +# 4. Anthropic-Style LCC Rewriting |
| 90 | +# --------------------------------------------------------------------- |
def rewrite_chunk_with_context(chunks, window=1):
    """Rewrite each chunk into a self-contained passage using its neighbors.

    Anthropic-style "contextual chunking": for chunk i, all chunks within
    `window` positions before and after are supplied as context, and the
    LLM rewrites the chunk so it is meaningful on its own.

    Args:
        chunks: list of {"chunk": str, "metadata": dict}.
        window: number of neighbors on EACH side to include as context.

    Returns:
        list of {"chunk": rewritten str, "metadata": original metadata}.
    """
    rewritten_chunks = []
    for i, c in enumerate(chunks):
        # Fix: the original only looked at positions i-window and i+window,
        # skipping all intermediate neighbors whenever window > 1.
        lo = max(0, i - window)
        hi = min(len(chunks), i + window + 1)
        neighbors = [chunks[j]["chunk"] for j in range(lo, hi) if j != i]

        context_text = "\n".join(neighbors)
        prompt = f"""
You are an expert chunking agent: rewrite the given chunk, augmenting it
with its local context so the chunk is meaningful on its own.
For example:

Context_text: Jaundice is a disease, it is in the liver
Chunk: It is a dangerous disease

Rewritten chunk: Jaundice is a dangerous disease in the liver

Chunk:
{c["chunk"]}

Neighboring context:
{context_text}

Rewritten, self-contained chunk:
"""
        # LLM call (Anthropic Claude or OpenAI GPT-4o, depending on API key)
        resp = llm_client.chat.completions.create(
            model="gpt-4o-mini",  # swap with Anthropic Claude if needed
            messages=[{"role": "user", "content": prompt}],
            temperature=0.2,
            max_tokens=300,
        )
        rewritten_chunks.append({
            "chunk": resp.choices[0].message.content.strip(),
            "metadata": c["metadata"],
        })

        time.sleep(0.3)  # avoid hammering API rate limits
    return rewritten_chunks
| 134 | + |
| 135 | + |
| 136 | +# --------------------------------------------------------------------- |
| 137 | +# 5. Embedding + Pinecone |
| 138 | +# --------------------------------------------------------------------- |
def batch_embed_upsert(chunks, batch_size=32):
    """Embed chunk texts in batches and upsert them into the Pinecone index.

    Vector ids are the MD5 hex digest of the chunk text, so identical text
    deduplicates on upsert. The raw text is stored in the vector metadata
    alongside the chunk's own metadata.
    """
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        texts = [item["chunk"] for item in batch]
        embeddings = embedding_model.encode(texts, convert_to_numpy=True)

        vectors = [
            {
                "id": hashlib.md5(text.encode()).hexdigest(),
                "values": emb.tolist(),
                "metadata": {"text": text, **item["metadata"]},
            }
            for text, emb, item in zip(texts, embeddings, batch)
        ]

        index.upsert(vectors)
        print(f"Upserted {len(vectors)} vectors")
| 159 | + |
| 160 | + |
| 161 | +# --------------------------------------------------------------------- |
| 162 | +# 6. Orchestration |
| 163 | +# --------------------------------------------------------------------- |
def process_file(filepath):
    """Run the full ingestion pipeline for one file.

    Pipeline: extract elements -> split into chunks -> contextual rewrite
    (window of 1 neighbor per side) -> embed and upsert into Pinecone.
    """
    elements = extract_elements(filepath)
    raw_chunks = split_documents(elements)
    contextual_chunks = rewrite_chunk_with_context(raw_chunks, window=1)
    batch_embed_upsert(contextual_chunks)
| 169 | + |
| 170 | + |
if __name__ == "__main__":
    # Ingest every PDF/DOCX in the source folder. The folder can be
    # overridden via the FILES_DIR environment variable; it defaults to
    # the original hard-coded path for backward compatibility.
    folder = os.getenv("FILES_DIR", r"D:\Documents\RHL-RAG-PROJECT\FILES")
    for file in os.listdir(folder):
        # Case-insensitive match, consistent with extract_elements().
        if file.lower().endswith((".pdf", ".docx")):
            print(f"processing {file}....")
            process_file(os.path.join(folder, file))
0 commit comments