# Source: rag_document_loader.py (main branch) - amarcoder01/Advanced-AI-Driven-Legal-Document-Summarization-and-Risk-Assessment-main on GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
"""
Document loading and processing module.
"""
import os
from typing import List, Dict, Any, Optional
from langchain_community.document_loaders import (
    PyPDFLoader,
    TextLoader,
    CSVLoader,
    UnstructuredMarkdownLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

from rag_config import CHUNK_SIZE, CHUNK_OVERLAP, DOCUMENT_DIR

class DocumentProcessor:
    """Load documents from disk and split them into chunks for RAG.

    A loader is selected per file from its extension (PDF, plain text, CSV
    or Markdown). Loaded documents are split with a
    ``RecursiveCharacterTextSplitter`` configured from ``CHUNK_SIZE`` and
    ``CHUNK_OVERLAP``.
    """

    def __init__(self, document_dir: Optional[str] = None):
        """Initialize the document processor.

        Args:
            document_dir: Directory containing documents to process. Falls
                back to ``DOCUMENT_DIR`` when omitted or empty.
        """
        self.document_dir = document_dir or DOCUMENT_DIR
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            length_function=len,
        )

    def _get_loader(self, file_path: str):
        """Return a loader instance matching the file's extension.

        The extension is compared case-insensitively.

        Args:
            file_path: Path to the document.

        Returns:
            A loader constructed for ``file_path``.

        Raises:
            ValueError: If the file extension is not supported.
        """
        file_extension = os.path.splitext(file_path)[1].lower()

        if file_extension == '.pdf':
            return PyPDFLoader(file_path)
        if file_extension == '.txt':
            return TextLoader(file_path)
        if file_extension == '.csv':
            return CSVLoader(file_path)
        if file_extension in ('.md', '.markdown'):
            return UnstructuredMarkdownLoader(file_path)
        raise ValueError(f"Unsupported file type: {file_extension}")

    def load_document(self, file_path: str) -> List[Document]:
        """Load and split a single document.

        Args:
            file_path: Path to the document.

        Returns:
            List of document chunks.

        Raises:
            ValueError: If the file extension is not supported.
        """
        loader = self._get_loader(file_path)
        documents = loader.load()
        return self.text_splitter.split_documents(documents)

    def load_documents(self, directory: Optional[str] = None) -> List[Document]:
        """Load all documents found directly inside a directory.

        Only regular files at the top level of the directory are considered;
        sub-directories are skipped. Files are processed in sorted name order
        so the returned chunk order is reproducible. A file that fails to
        load (including unsupported types) is reported on stdout and skipped
        rather than aborting the whole run.

        If the directory does not exist it is created and an empty list is
        returned.

        Args:
            directory: Directory containing documents (defaults to
                ``self.document_dir``).

        Returns:
            List of document chunks from every file that loaded successfully.
        """
        directory = directory or self.document_dir
        all_documents: List[Document] = []

        if not os.path.exists(directory):
            # exist_ok guards against another process creating the directory
            # between the existence check and this call.
            os.makedirs(directory, exist_ok=True)
            print(f"Created document directory: {directory}")
            return all_documents

        # os.listdir() yields entries in arbitrary order; sort so the chunk
        # order (and anything indexed from it) is stable across runs.
        for filename in sorted(os.listdir(directory)):
            file_path = os.path.join(directory, filename)
            if not os.path.isfile(file_path):
                continue
            try:
                document_chunks = self.load_document(file_path)
            except Exception as e:
                # Deliberately best-effort: one bad file must not stop the rest.
                print(f"Error loading {filename}: {e}")
            else:
                all_documents.extend(document_chunks)
                print(f"Loaded {len(document_chunks)} chunks from {filename}")

        return all_documents