-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrag_document_loader.py
More file actions
88 lines (73 loc) · 3 KB
/
rag_document_loader.py
File metadata and controls
88 lines (73 loc) · 3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
Document loading and processing module.
"""
import os
from typing import List, Dict, Any, Optional
from langchain_community.document_loaders import (
PyPDFLoader,
TextLoader,
CSVLoader,
UnstructuredMarkdownLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from rag_config import CHUNK_SIZE, CHUNK_OVERLAP, DOCUMENT_DIR
class DocumentProcessor:
    """Handles document loading and preprocessing for RAG.

    A loader is chosen per file from its extension, and the loaded documents
    are split into chunks by a ``RecursiveCharacterTextSplitter`` configured
    with ``CHUNK_SIZE`` and ``CHUNK_OVERLAP``.
    """

    def __init__(self, document_dir: Optional[str] = None):
        """Initialize the document processor.

        Args:
            document_dir: Directory containing documents to process. When
                omitted (or empty), ``DOCUMENT_DIR`` is used instead.
        """
        self.document_dir = document_dir or DOCUMENT_DIR
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            length_function=len,
        )

    def _get_loader(self, file_path: str):
        """Get the appropriate loader based on file extension.

        The extension is compared case-insensitively.

        Args:
            file_path: Path to the document

        Returns:
            A loader instance constructed for ``file_path``

        Raises:
            ValueError: If the extension is not .pdf, .txt, .csv, .md or
                .markdown (this includes files with no extension).
        """
        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.pdf':
            return PyPDFLoader(file_path)
        if file_extension == '.txt':
            return TextLoader(file_path)
        if file_extension == '.csv':
            return CSVLoader(file_path)
        if file_extension in ('.md', '.markdown'):
            return UnstructuredMarkdownLoader(file_path)

        raise ValueError(f"Unsupported file type: {file_extension}")

    def load_document(self, file_path: str) -> List[Document]:
        """Load and split a single document.

        Args:
            file_path: Path to the document

        Returns:
            List of document chunks

        Raises:
            ValueError: If the file extension is not supported.
        """
        loader = self._get_loader(file_path)
        documents = loader.load()
        return self.text_splitter.split_documents(documents)

    def load_documents(self, directory: Optional[str] = None) -> List[Document]:
        """Load all documents from a directory.

        Only regular files directly inside ``directory`` are considered;
        subdirectories are skipped. Files are processed in sorted filename
        order so the returned chunk order is reproducible. A file that fails
        to load is reported on stdout and skipped; the remaining files are
        still processed.

        If ``directory`` does not exist it is created and an empty list is
        returned.

        Args:
            directory: Directory containing documents (defaults to self.document_dir)

        Returns:
            List of document chunks
        """
        directory = directory or self.document_dir
        all_documents: List[Document] = []

        if not os.path.exists(directory):
            # exist_ok avoids a crash if something else creates the directory
            # between the existence check and this call.
            os.makedirs(directory, exist_ok=True)
            print(f"Created document directory: {directory}")
            return all_documents

        # os.listdir() yields entries in arbitrary order; sort so the chunk
        # order does not vary between runs or filesystems.
        for filename in sorted(os.listdir(directory)):
            file_path = os.path.join(directory, filename)
            if not os.path.isfile(file_path):
                continue
            try:
                document_chunks = self.load_document(file_path)
            except Exception as e:
                # Deliberately broad: one unsupported or unreadable file must
                # not abort loading of the rest of the directory.
                print(f"Error loading {filename}: {e}")
            else:
                all_documents.extend(document_chunks)
                print(f"Loaded {len(document_chunks)} chunks from {filename}")

        return all_documents