pdf/docling_options_examples.py at main · climateandtech/pdf · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python3
"""
Examples of how to use the generic Docling options system

The system now accepts any valid Docling configuration and passes it through
to the DocumentConverter, making it completely flexible and future-proof.
"""
from docling.document_converter import PdfFormatOption, WordFormatOption, ImageFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    AcceleratorOptions,
    AcceleratorDevice,
    OcrEngine,
    granite_picture_description  # VLM still supported
)


# Example 1: Basic PDF processing with OCR
def get_pdf_with_ocr_options():
    """Configure PDF processing with full-page EasyOCR enabled.

    In Docling the OCR engine is chosen by the type of the ``ocr_options``
    object, and ``force_full_page_ocr`` is a field of that object. Neither
    ``ocr_engine`` nor ``force_full_page_ocr`` is a field of
    ``PdfPipelineOptions``, so both are expressed through ``EasyOcrOptions``.

    Returns:
        dict: ``{"format_options": {InputFormat.PDF: PdfFormatOption(...)}}``.
    """
    # Local import keeps this example self-contained with respect to the
    # module-level import list.
    from docling.datamodel.pipeline_options import EasyOcrOptions

    pipeline_options = PdfPipelineOptions(
        do_ocr=True,
        # Engine selection and the full-page flag both belong to the
        # engine-specific options object.
        ocr_options=EasyOcrOptions(force_full_page_ocr=True),
    )
    return {
        "format_options": {
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        }
    }


# Example 2: High-performance configuration
def get_performance_optimized_options():
    """Build a speed-oriented configuration.

    PDF table-structure recognition stays on, OCR is switched off, and an
    eight-thread CPU accelerator setting is returned alongside the format
    options.

    Returns:
        dict: A mapping with the keys ``"format_options"`` and
        ``"accelerator_options"``.
    """
    # OCR is disabled because it is skipped for speed.
    pdf_pipeline = PdfPipelineOptions(do_table_structure=True, do_ocr=False)
    pdf_option = PdfFormatOption(pipeline_options=pdf_pipeline)

    # NOTE(review): confirm that the consumer of this dict handles a
    # top-level "accelerator_options" key; that is not visible from here.
    cpu_accelerator = AcceleratorOptions(
        num_threads=8,
        device=AcceleratorDevice.CPU,
    )

    return dict(
        format_options={InputFormat.PDF: pdf_option},
        accelerator_options=cpu_accelerator,
    )


# Example 3: Multi-format support
def get_multi_format_options():
    """Build format options covering PDF, DOCX and image inputs.

    PDFs get OCR and table-structure recognition; DOCX and image inputs use
    their format option classes with default settings.

    Returns:
        dict: ``{"format_options": {...}}`` keyed by ``InputFormat`` member.
    """
    per_format = {}

    pdf_pipeline = PdfPipelineOptions(do_ocr=True, do_table_structure=True)
    per_format[InputFormat.PDF] = PdfFormatOption(pipeline_options=pdf_pipeline)

    # The remaining formats are registered with their defaults.
    per_format[InputFormat.DOCX] = WordFormatOption()
    per_format[InputFormat.IMAGE] = ImageFormatOption()

    return {"format_options": per_format}


# Example 4: VLM (Visual Language Model) configuration
def get_vlm_options():
    """Build PDF options that enable VLM-generated picture descriptions.

    Uses the ``granite_picture_description`` preset, renders images at 2x
    scale and keeps the extracted picture images.

    NOTE(review): the original docstring states this requires macOS 13.5+;
    that requirement is not verifiable from this file -- TODO confirm.

    Returns:
        dict: ``{"format_options": {InputFormat.PDF: PdfFormatOption(...)}}``.
    """
    vlm_pipeline = PdfPipelineOptions(
        do_picture_description=True,
        picture_description_options=granite_picture_description,
        images_scale=2.0,
        generate_picture_images=True,
    )
    pdf_option = PdfFormatOption(pipeline_options=vlm_pipeline)
    return {"format_options": {InputFormat.PDF: pdf_option}}


# Example 5: Custom OCR configuration
def get_custom_ocr_options():
    """Configure PDF processing to use the Tesseract CLI OCR engine.

    In Docling the OCR engine is chosen by the type of the ``ocr_options``
    object rather than by an ``ocr_engine`` field on ``PdfPipelineOptions``,
    so the engine and its ``force_full_page_ocr`` flag are both supplied via
    ``TesseractCliOcrOptions``. Full-page OCR is left off here.

    Returns:
        dict: ``{"format_options": {InputFormat.PDF: PdfFormatOption(...)}}``.
    """
    # Local import keeps this example self-contained with respect to the
    # module-level import list.
    from docling.datamodel.pipeline_options import TesseractCliOcrOptions

    pipeline_options = PdfPipelineOptions(
        do_ocr=True,
        ocr_options=TesseractCliOcrOptions(force_full_page_ocr=False),
    )
    return {
        "format_options": {
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        }
    }


# Example usage in your application:
async def example_usage():
    """Demonstrate passing one of the configurations to ``DocumentService``.

    Creates the service, awaits its ``setup()``, processes a sample S3 key
    with the OCR configuration and prints the ``status`` entry of the result.
    """
    # Imported here, as in the original example, rather than at module level.
    from services import DocumentService

    service = DocumentService()
    await service.setup()

    # Any of the get_*_options() helpers in this module can be used here.
    options = get_pdf_with_ocr_options()

    result = await service.process_document(
        s3_key="my-document.pdf",
        docling_options=options,
    )

    print(f"Processed document: {result['status']}")


if __name__ == "__main__":
    # Run the demo coroutine to completion when executed as a script.
    from asyncio import run as run_coroutine

    run_coroutine(example_usage())