-
Notifications
You must be signed in to change notification settings - Fork 561
Expand file tree
/
Copy pathconfig.yaml
More file actions
310 lines (282 loc) · 9.34 KB
/
config.yaml
File metadata and controls
310 lines (282 loc) · 9.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
# GLM-OCR Configuration
#
# This file contains all configuration options for the GLM-OCR SDK.
# Default values are shown. Uncomment and modify as needed.

# Server settings (for glmocr.server)
server:
  host: "0.0.0.0"
  port: 5002
  debug: false

# Logging settings
logging:
  # Log level: DEBUG, INFO, WARNING, ERROR
  # DEBUG enables profiling output with timing information
  level: INFO
  # format: "[%(levelname)s] %(name)s: %(message)s"  # custom format (optional)

# Pipeline settings
pipeline:
  # ============================================================================
  # MaaS Mode (Zhipu Cloud API)
  # ============================================================================
  # When enabled, the SDK forwards requests directly to Zhipu's cloud API
  # without local processing. The cloud service handles layout detection,
  # OCR, and result formatting internally.
  #
  # Use this mode when:
  #   - You don't have a GPU or don't want to self-host
  #   - You want the simplest setup (just need an API key)
  #   - You want to use the exact same service as Zhipu's cloud offering
  #
  # Note: When maas.enabled=true, the ocr_api and layout settings below are ignored.
  maas:
    enabled: true  # Set to true to use MaaS mode
    api_url: https://open.bigmodel.cn/api/paas/v4/layout_parsing
    model: glm-ocr
    api_key: null  # Required! Get from https://open.bigmodel.cn
    verify_ssl: true
    connect_timeout: 30
    request_timeout: 300
    retry_max_attempts: 2
    retry_backoff_base_seconds: 0.5
    retry_backoff_max_seconds: 8.0
    retry_jitter_ratio: 0.2
    retry_status_codes: [429, 500, 502, 503, 504]
    connection_pool_size: 16

  # ============================================================================
  # Self-hosted Mode (vLLM / SGLang)
  # ============================================================================
  # When maas.enabled=false, the SDK uses the settings below to connect to
  # a self-hosted vLLM or SGLang service running the GLM-OCR model.
  #
  # Use this mode when:
  #   - You have GPU resources and want full control
  #   - You need offline/air-gapped operation
  #   - You want to customize the pipeline (layout detection, prompts, etc.)

  # OCR API client configuration (for self-hosted vLLM/SGLang/Ollama)
  ocr_api:
    # Basic connection
    api_host: 127.0.0.1
    api_port: 8080
    # Model name included in API requests.
    # Required for mlx_vlm.server (e.g. "mlx-community/GLM-OCR-bf16").
    # Set to `glm-ocr` to match `--served-model-name` when using vLLM/SGLang.
    # For Ollama, set to your model name (e.g. "glm-ocr:latest").
    model: glm-ocr
    # URL construction: {api_scheme}://{api_host}:{api_port}{api_path}
    # Or set api_url directly to override
    api_scheme: null  # null = auto (https if port 443, else http)
    api_path: /v1/chat/completions
    # Note: If using Ollama and encountering 502 errors with vision requests,
    # try switching to "ollama_generate" mode with api_path: /api/generate
    api_url: null  # full URL override (optional)
    # API mode: "openai" (default) or "ollama_generate"
    #   - "openai": Use OpenAI-compatible /v1/chat/completions endpoint (vLLM/SGLang/Ollama)
    #   - "ollama_generate": Use Ollama's native /api/generate endpoint
    api_mode: openai
    # Authentication (for MaaS providers like Zhipu, OpenAI, etc.)
    api_key: null  # or set ZHIPU_API_KEY env var
    headers: {}  # additional HTTP headers
    # SSL/TLS
    verify_ssl: false
    # Timeouts (seconds)
    connect_timeout: 30
    request_timeout: 120
    # Retry settings (helps with transient 429/5xx and overloaded OCR servers)
    retry_max_attempts: 2
    retry_backoff_base_seconds: 0.5
    retry_backoff_max_seconds: 8.0
    retry_jitter_ratio: 0.2
    retry_status_codes: [429, 500, 502, 503, 504]
    # HTTP connection pool size (default 128). Set >= max_workers to avoid
    # "Connection pool is full" when running concurrent requests.
    connection_pool_size: 128

  # Maximum parallel workers for region recognition.
  # Lower values reduce 503 errors on busy OCR servers.
  max_workers: 32
  # Queue sizes
  page_maxsize: 100
  region_maxsize: 2000

  # Page loader: handles image/PDF loading and API request building
  page_loader:
    # Generation parameters
    max_tokens: 8192
    temperature: 0.0
    top_p: 0.00001
    top_k: 1
    repetition_penalty: 1.1
    # Image processing
    t_patch_size: 2
    patch_expand_factor: 1
    image_expect_length: 6144
    image_format: JPEG  # JPEG, PNG, WEBP
    min_pixels: 12544  # 112 * 112
    max_pixels: 71372800  # 14 * 14 * 4 * 1280
    # Task-specific prompts
    task_prompt_mapping:
      text: "Text Recognition:"
      table: "Table Recognition:"
      formula: "Formula Recognition:"
    # PDF processing
    pdf_dpi: 200
    pdf_max_pages: null  # null = no limit
    pdf_verbose: false

  # Result formatter: post-processing and output formatting
  result_formatter:
    # Output format: json, markdown, or both
    output_format: both
    # Post-process switches
    enable_merge_formula_numbers: true
    enable_merge_text_blocks: true
    enable_format_bullet_points: true
    # Label to visualization category mapping (for layout visualization)
    label_visualization_mapping:
      table:
        - table
      formula:
        - display_formula
        - inline_formula
      image:
        - chart
        - image
      text:
        - abstract
        - algorithm
        - content
        - doc_title
        - figure_title
        - paragraph_title
        - reference_content
        - text
        - vertical_text
        - vision_footnote
        - seal
        - formula_number

  # Layout detection settings
  layout:
    # PP-DocLayoutV3 model directory
    # Can be a local folder or a Hugging Face model id
    # (Use *_safetensors for Transformers; PaddlePaddle/PP-DocLayoutV3 is a PaddleOCR export)
    model_dir: PaddlePaddle/PP-DocLayoutV3_safetensors
    # Detection threshold
    threshold: 0.3
    # threshold_by_class:  # per-class threshold override
    #   0: 0.5
    #   1: 0.3
    #   text: 0.5
    #   table: 0.2
    # Processing
    # batch_size: max images per model forward pass (reduce to 1 if OOM)
    batch_size: 1
    workers: 1
    cuda_visible_devices: "0"
    # Explicit device placement for the layout model.
    #   - null (default): auto-select (CUDA if available via cuda_visible_devices,
    #     otherwise CPU).
    #   - "cpu": force CPU even when a GPU is available. Useful for single-GPU
    #     setups where the GPU is reserved for the OCR model.
    #   - "cuda" or "cuda:N": use a specific GPU (overrides cuda_visible_devices).
    # Can also be set via GLMOCR_LAYOUT_DEVICE env var or --layout-device CLI flag.
    # device: null
    # img_size: null  # resize input (optional)
    # Use polygon masks for region cropping and visualization.
    # When true, regions are cropped using the polygon outline from layout
    # detection (more precise, masks out content outside the polygon),
    # recommended for documents with rotated or staggered layouts.
    # When false, regions are cropped using the bounding box only (faster, simpler),
    # recommended for regular documents without rotation.
    use_polygon: false
    # Post-processing
    layout_nms: true
    layout_unclip_ratio:
      - 1.0
      - 1.0
    # Merge mode for overlapping bboxes: "large" or "small"
    # Can be a single value or per-class dict
    layout_merge_bboxes_mode:
      0: large  # abstract
      1: large  # algorithm
      2: large  # aside_text
      3: large  # chart
      4: large  # content
      5: large  # display_formula
      6: large  # doc_title
      7: large  # figure_title
      8: large  # footer
      9: large  # footer_image
      10: large  # footnote
      11: large  # formula_number
      12: large  # header
      13: large  # header_image
      14: large  # image
      15: large  # inline_formula
      16: large  # number
      17: large  # paragraph_title
      18: small  # reference
      19: large  # reference_content
      20: large  # seal
      21: large  # table
      22: large  # text
      23: large  # vertical_text
      24: large  # vision_footnote
    # Map detected labels to OCR task types
    #   - text/table/formula: OCR with corresponding prompt
    #   - skip: keep region but don't OCR (e.g., images)
    #   - abandon: discard region entirely
    label_task_mapping:
      text:
        - abstract
        - algorithm
        - content
        - doc_title
        - figure_title
        - paragraph_title
        - reference_content
        - text
        - vertical_text
        - vision_footnote
        - seal
        - formula_number
      table:
        - table
      formula:
        - display_formula
        - inline_formula
      skip:
        - chart
        - image
      abandon:
        - header
        - footer
        - number
        - footnote
        - aside_text
        - reference
        - footer_image
        - header_image
    # Map label index to label name
    id2label:
      0: abstract
      1: algorithm
      2: aside_text
      3: chart
      4: content
      5: display_formula
      6: doc_title
      7: figure_title
      8: footer
      9: footer_image
      10: footnote
      11: formula_number
      12: header
      13: header_image
      14: image
      15: inline_formula
      16: number
      17: paragraph_title
      18: reference
      19: reference_content
      20: seal
      21: table
      22: text
      23: vertical_text
      24: vision_footnote