-
Notifications
You must be signed in to change notification settings - Fork 561
Expand file tree
/
Copy pathconfig.yaml
More file actions
310 lines (282 loc) · 9.34 KB
/
config.yaml
File metadata and controls
310 lines (282 loc) · 9.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
# GLM-OCR Configuration
#
# This file contains all configuration options for the GLM-OCR SDK.
# Default values are shown. Uncomment and modify as needed.

# Server settings (for glmocr.server)
server:
  host: "0.0.0.0"
  port: 5002
  debug: false

# Logging settings
logging:
  # Log level: DEBUG, INFO, WARNING, ERROR
  # DEBUG enables profiling output with timing information
  level: INFO
  # format: "[%(levelname)s] %(name)s: %(message)s"  # custom format (optional)

# Pipeline settings
pipeline:
  # ============================================================================
  # MaaS Mode (Zhipu Cloud API)
  # ============================================================================
  # When enabled, the SDK forwards requests directly to Zhipu's cloud API
  # without local processing. The cloud service handles layout detection,
  # OCR, and result formatting internally.
  #
  # Use this mode when:
  #   - You don't have a GPU or don't want to self-host
  #   - You want the simplest setup (just need an API key)
  #   - You want to use the exact same service as Zhipu's cloud offering
  #
  # Note: When maas.enabled=true, the ocr_api and layout settings below are ignored.
  maas:
    enabled: true  # Set to true to use MaaS mode
    api_url: https://open.bigmodel.cn/api/paas/v4/layout_parsing
    model: glm-ocr
    api_key: null  # Required! Get from https://open.bigmodel.cn
    verify_ssl: true
    connect_timeout: 30
    request_timeout: 300
    retry_max_attempts: 2
    retry_backoff_base_seconds: 0.5
    retry_backoff_max_seconds: 8.0
    retry_jitter_ratio: 0.2
    retry_status_codes: [429, 500, 502, 503, 504]
    connection_pool_size: 16

  # ============================================================================
  # Self-hosted Mode (vLLM / SGLang)
  # ============================================================================
  # When maas.enabled=false, the SDK uses the settings below to connect to
  # a self-hosted vLLM or SGLang service running the GLM-OCR model.
  #
  # Use this mode when:
  #   - You have GPU resources and want full control
  #   - You need offline/air-gapped operation
  #   - You want to customize the pipeline (layout detection, prompts, etc.)

  # OCR API client configuration (for self-hosted vLLM/SGLang/Ollama)
  ocr_api:
    # Basic connection
    api_host: 127.0.0.1
    api_port: 8080
    # Model name included in API requests.
    # Required for mlx_vlm.server (e.g. "mlx-community/GLM-OCR-bf16").
    # Set to `glm-ocr` to match `--served-model-name` when using vLLM/SGLang.
    # For Ollama, set to your model name (e.g. "glm-ocr:latest").
    model: glm-ocr
    # URL construction: {api_scheme}://{api_host}:{api_port}{api_path}
    # Or set api_url directly to override
    api_scheme: null  # null = auto (https if port 443, else http)
    api_path: /v1/chat/completions
    # Note: If using Ollama and encountering 502 errors with vision requests,
    # try switching to "ollama_generate" mode with api_path: /api/generate
    api_url: null  # full URL override (optional)
    # API mode: "openai" (default) or "ollama_generate"
    #   - "openai": Use OpenAI-compatible /v1/chat/completions endpoint (vLLM/SGLang/Ollama)
    #   - "ollama_generate": Use Ollama's native /api/generate endpoint
    api_mode: openai
    # Authentication (for MaaS providers like Zhipu, OpenAI, etc.)
    api_key: null  # or set ZHIPU_API_KEY env var
    headers: {}  # additional HTTP headers
    # SSL/TLS
    verify_ssl: false
    # Timeouts (seconds)
    connect_timeout: 30
    request_timeout: 120
    # Retry settings (helps with transient 429/5xx and overloaded OCR servers)
    retry_max_attempts: 2
    retry_backoff_base_seconds: 0.5
    retry_backoff_max_seconds: 8.0
    retry_jitter_ratio: 0.2
    retry_status_codes: [429, 500, 502, 503, 504]
    # HTTP connection pool size (default 128). Set >= max_workers to avoid
    # "Connection pool is full" when running concurrent requests.
    connection_pool_size: 128

  # Maximum parallel workers for region recognition.
  # Lower values reduce 503 errors on busy OCR servers.
  max_workers: 32
  # Queue sizes
  page_maxsize: 100
  region_maxsize: 2000

  # Page loader: handles image/PDF loading and API request building
  page_loader:
    # Generation parameters
    max_tokens: 8192
    temperature: 0.0
    top_p: 0.00001
    top_k: 1
    repetition_penalty: 1.1
    # Image processing
    t_patch_size: 2
    patch_expand_factor: 1
    image_expect_length: 6144
    image_format: JPEG  # JPEG, PNG, WEBP
    min_pixels: 12544  # 112 * 112
    max_pixels: 71372800  # 14 * 14 * 4 * 1280
    # Task-specific prompts
    task_prompt_mapping:
      text: "Text Recognition:"
      table: "Table Recognition:"
      formula: "Formula Recognition:"
    # PDF processing
    pdf_dpi: 200
    pdf_max_pages: null  # null = no limit
    pdf_verbose: false

  # Result formatter: post-processing and output formatting
  result_formatter:
    # Output format: json, markdown, or both
    output_format: both
    # Post-process switches
    enable_merge_formula_numbers: true
    enable_merge_text_blocks: true
    enable_format_bullet_points: true
    # Label to visualization category mapping (for layout visualization)
    label_visualization_mapping:
      table:
        - table
      formula:
        - display_formula
        - inline_formula
      image:
        - chart
        - image
      text:
        - abstract
        - algorithm
        - content
        - doc_title
        - figure_title
        - paragraph_title
        - reference_content
        - text
        - vertical_text
        - vision_footnote
        - seal
        - formula_number

  # Layout detection settings
  layout:
    # PP-DocLayoutV3 model directory
    # Can be a local folder or a Hugging Face model id
    # (Use *_safetensors for Transformers; PaddlePaddle/PP-DocLayoutV3 is a PaddleOCR export)
    model_dir: PaddlePaddle/PP-DocLayoutV3_safetensors
    # Detection threshold
    threshold: 0.3
    # threshold_by_class:  # per-class threshold override
    #   0: 0.5
    #   1: 0.3
    #   text: 0.5
    #   table: 0.2
    # Processing
    # batch_size: max images per model forward pass (reduce to 1 if OOM)
    batch_size: 1
    workers: 1
    cuda_visible_devices: "0"
    # Explicit device placement for the layout model.
    #   - null (default): auto-select (CUDA if available via cuda_visible_devices,
    #     otherwise CPU).
    #   - "cpu": force CPU even when a GPU is available. Useful for single-GPU
    #     setups where the GPU is reserved for the OCR model.
    #   - "cuda" or "cuda:N": use a specific GPU (overrides cuda_visible_devices).
    # Can also be set via GLMOCR_LAYOUT_DEVICE env var or --layout-device CLI flag.
    # device: null
    # img_size: null  # resize input (optional)
    # Use polygon masks for region cropping and visualization.
    # When true, regions are cropped using the polygon outline from layout
    # detection (more precise, masks out content outside the polygon),
    # recommended for documents with rotated or staggered layouts.
    # When false, regions are cropped using the bounding box only (faster, simpler),
    # recommended for regular documents without rotation.
    use_polygon: false
    # Post-processing
    layout_nms: true
    layout_unclip_ratio:
      - 1.0
      - 1.0
    # Merge mode for overlapping bboxes: "large" or "small"
    # Can be a single value or per-class dict
    layout_merge_bboxes_mode:
      0: large  # abstract
      1: large  # algorithm
      2: large  # aside_text
      3: large  # chart
      4: large  # content
      5: large  # display_formula
      6: large  # doc_title
      7: large  # figure_title
      8: large  # footer
      9: large  # footer_image
      10: large  # footnote
      11: large  # formula_number
      12: large  # header
      13: large  # header_image
      14: large  # image
      15: large  # inline_formula
      16: large  # number
      17: large  # paragraph_title
      18: small  # reference
      19: large  # reference_content
      20: large  # seal
      21: large  # table
      22: large  # text
      23: large  # vertical_text
      24: large  # vision_footnote
    # Map detected labels to OCR task types
    #   - text/table/formula: OCR with corresponding prompt
    #   - skip: keep region but don't OCR (e.g., images)
    #   - abandon: discard region entirely
    label_task_mapping:
      text:
        - abstract
        - algorithm
        - content
        - doc_title
        - figure_title
        - paragraph_title
        - reference_content
        - text
        - vertical_text
        - vision_footnote
        - seal
        - formula_number
      table:
        - table
      formula:
        - display_formula
        - inline_formula
      skip:
        - chart
        - image
      abandon:
        - header
        - footer
        - number
        - footnote
        - aside_text
        - reference
        - footer_image
        - header_image
    # Map label index to label name
    id2label:
      0: abstract
      1: algorithm
      2: aside_text
      3: chart
      4: content
      5: display_formula
      6: doc_title
      7: figure_title
      8: footer
      9: footer_image
      10: footnote
      11: formula_number
      12: header
      13: header_image
      14: image
      15: inline_formula
      16: number
      17: paragraph_title
      18: reference
      19: reference_content
      20: seal
      21: table
      22: text
      23: vertical_text
      24: vision_footnote