Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -156,30 +156,35 @@ add_executable(parse.exe "${TOPLEVEL_PREFIX_PATH}/app/parse.cpp")
# One executable per command-line tool, each built from a single source file
# under ${TOPLEVEL_PREFIX_PATH}/app/.
add_executable(parse_fonts.exe "${TOPLEVEL_PREFIX_PATH}/app/parse_fonts.cpp")
add_executable(render.exe "${TOPLEVEL_PREFIX_PATH}/app/render.cpp")
add_executable(analyse.exe "${TOPLEVEL_PREFIX_PATH}/app/analyse.cpp")
add_executable(run_scaling.exe "${TOPLEVEL_PREFIX_PATH}/app/run_scaling.cpp")
# page_images.exe is disabled: its rule is commented out here and in every
# group below, so re-enabling it means uncommenting all five lines.
# add_executable(page_images.exe "${TOPLEVEL_PREFIX_PATH}/app/page_images.cpp")

# Request C++20 for every tool.
# NOTE(review): CXX_STANDARD_REQUIRED is not set in this section, so CMake may
# fall back to an older standard if the compiler lacks C++20 -- confirm it is
# set elsewhere in the file.
set_property(TARGET parse.exe PROPERTY CXX_STANDARD 20)
set_property(TARGET parse_fonts.exe PROPERTY CXX_STANDARD 20)
set_property(TARGET render.exe PROPERTY CXX_STANDARD 20)
set_property(TARGET analyse.exe PROPERTY CXX_STANDARD 20)
set_property(TARGET run_scaling.exe PROPERTY CXX_STANDARD 20)
# set_property(TARGET page_images.exe PROPERTY CXX_STANDARD 20)

# Build each tool only after the targets named in ${DEPENDENCIES} are built.
add_dependencies(parse.exe ${DEPENDENCIES})
add_dependencies(parse_fonts.exe ${DEPENDENCIES})
add_dependencies(render.exe ${DEPENDENCIES})
add_dependencies(analyse.exe ${DEPENDENCIES})
add_dependencies(run_scaling.exe ${DEPENDENCIES})
# add_dependencies(page_images.exe ${DEPENDENCIES})

# NOTE(review): INTERFACE entries only populate INTERFACE_INCLUDE_DIRECTORIES
# (seen by consumers of the target), not the include path used to compile the
# executable itself, and ${DEPENDENCIES} is used as a list of target names in
# add_dependencies() above -- confirm these lines have the intended effect.
target_include_directories(parse.exe INTERFACE ${DEPENDENCIES})
target_include_directories(parse_fonts.exe INTERFACE ${DEPENDENCIES})
target_include_directories(render.exe INTERFACE ${DEPENDENCIES})
target_include_directories(analyse.exe INTERFACE ${DEPENDENCIES})
target_include_directories(run_scaling.exe INTERFACE ${DEPENDENCIES})
# target_include_directories(page_images.exe INTERFACE ${DEPENDENCIES})

# Link each tool against ${DEPENDENCIES} plus the libraries in ${LIB_LINK}.
target_link_libraries(parse.exe ${DEPENDENCIES} ${LIB_LINK})
target_link_libraries(parse_fonts.exe ${DEPENDENCIES} ${LIB_LINK})
target_link_libraries(render.exe ${DEPENDENCIES} ${LIB_LINK})
target_link_libraries(analyse.exe ${DEPENDENCIES} ${LIB_LINK})
target_link_libraries(run_scaling.exe ${DEPENDENCIES} ${LIB_LINK})
# target_link_libraries(page_images.exe ${DEPENDENCIES} ${LIB_LINK})

# **********************
Expand Down
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,17 +92,17 @@ Parse pages from one or more PDFs in parallel using a thread pool with backpress

```python
from docling_parse.pdf_parser import (
DecodeConfig,
DoclingThreadedPdfParser,
ThreadedPdfParserConfig,
)
from docling_parse.pdf_parsers import DecodePageConfig # type: ignore[import]

parser_config = ThreadedPdfParserConfig(
loglevel="fatal",
threads=4, # worker threads
max_concurrent_results=32 # cap buffered results to limit memory
)
decode_config = DecodePageConfig()
decode_config = DecodeConfig()

parser = DoclingThreadedPdfParser(
parser_config=parser_config,
Expand Down Expand Up @@ -191,6 +191,12 @@ The latter will only work after a clean `git clone`. If you are developing and u
rm -rf .venv; uv venv; uv pip install --force-reinstall --no-deps -e ".[perf-tools]"
```

or

```sh
BUILD_THREADS=12 uv pip install --force-reinstall --no-deps -e ".[perf]"
```

To test the package, run:

```sh
Expand Down
64 changes: 55 additions & 9 deletions app/pybind_parse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,6 @@ PYBIND11_MODULE(pdf_parsers, m) {
max_num_bitmaps (int): Maximum number of bitmaps to keep (-1 means no cap) [default=-1].
keep_glyphs (bool): If true, keep GLYPH<...> fallback strings in output; if false, replace them with a space [default=false].
keep_qpdf_warnings (bool): If true, QPDF warnings are emitted; if false, they are suppressed [default=false].
materialize_bitmap_bytes (bool): If true (default), bitmap byte data is extracted and embedded in BitmapResource objects. If false, only bitmap geometry (rectangles) is preserved and image bytes are skipped. Consumed by the Python layer only; has no effect in C++ [default=true].
)")
.def(pybind11::init<>())
.def_readwrite("page_boundary", &pdflib::decode_config::page_boundary)
Expand All @@ -297,7 +296,6 @@ PYBIND11_MODULE(pdf_parsers, m) {
.def_readwrite("release_native_memory_every_n_pages", &pdflib::decode_config::release_native_memory_every_n_pages)
.def_readwrite("keep_glyphs", &pdflib::decode_config::keep_glyphs)
.def_readwrite("keep_qpdf_warnings", &pdflib::decode_config::keep_qpdf_warnings)
.def_readwrite("materialize_bitmap_bytes", &pdflib::decode_config::materialize_bitmap_bytes)
.def("__copy__", [](const pdflib::decode_config& self) { return self; })
.def("__deepcopy__", [](const pdflib::decode_config& self, pybind11::dict) { return self; });

Expand Down Expand Up @@ -827,8 +825,29 @@ PYBIND11_MODULE(pdf_parsers, m) {

// ============= Threaded PDF Parser =============

// _PageDecodeTimings - read-only view of docling::page_decode_timings.
// Exposes one attribute per decode stage plus total_s; nothing is writable
// from Python and no constructor is bound, so instances can only be obtained
// from C++ (e.g. via a result object's `timings` attribute).
// NOTE(review): the `_s` suffix presumably means seconds -- confirm against
// the struct definition.
pybind11::class_<docling::page_decode_timings>(m, "_PageDecodeTimings",
R"(
Top-level timing breakdown for a threaded page decode task.
)")
.def_readonly("make_page_decoder_s", &docling::page_decode_timings::make_page_decoder_s)
.def_readonly("decode_page_s", &docling::page_decode_timings::decode_page_s)
.def_readonly("create_word_cells_s", &docling::page_decode_timings::create_word_cells_s)
.def_readonly("create_line_cells_s", &docling::page_decode_timings::create_line_cells_s)
.def_readonly("total_s", &docling::page_decode_timings::total_s);

// _PageRenderTimings - read-only view of docling::page_render_timings.
// docling::page_decode_timings is declared as the pybind11 base, so the Python
// class inherits every _PageDecodeTimings attribute and only render_page_s is
// bound here. pybind11 requires the base to be registered first, so this
// statement must stay after the _PageDecodeTimings binding.
pybind11::class_<docling::page_render_timings, docling::page_decode_timings>(m, "_PageRenderTimings",
R"(
Top-level timing breakdown for a threaded page render task.
)")
.def_readonly("render_page_s", &docling::page_render_timings::render_page_s);

// _PageTaskResult - common base for threaded page task results.
// Binds the read-only fields doc_key, page_number and success once, so that
// result classes declaring docling::page_task_result as their pybind11 base
// inherit them instead of re-binding each field. It must be registered before
// any class that names it as a base.
pybind11::class_<docling::page_task_result>(m, "_PageTaskResult")
.def_readonly("doc_key", &docling::page_task_result::doc_key)
.def_readonly("page_number", &docling::page_task_result::page_number)
.def_readonly("success", &docling::page_task_result::success);

// _PageDecodeResult - internal result of a threaded page decode task
pybind11::class_<docling::page_decode_result>(m, "_PageDecodeResult",
pybind11::class_<docling::page_decode_result, docling::page_task_result>(m, "_PageDecodeResult",
R"(
Internal result of a threaded page decoding task.

Expand All @@ -837,9 +856,7 @@ PYBIND11_MODULE(pdf_parsers, m) {
page_number (int): The page number (0-indexed).
success (bool): Whether the decoding succeeded.
)")
.def_readonly("doc_key", &docling::page_decode_result::doc_key)
.def_readonly("page_number", &docling::page_decode_result::page_number)
.def_readonly("success", &docling::page_decode_result::success)
.def_readonly("timings", &docling::page_decode_result::timings)
.def("get", [](docling::page_decode_result& self)
-> std::pair<std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>>,
std::unordered_map<std::string, double>> {
Expand Down Expand Up @@ -1035,16 +1052,45 @@ PYBIND11_MODULE(pdf_parsers, m) {
.def_readwrite("canvas_height", &pdflib::render_config::canvas_height);

// _PageRenderResult - internal result of a threaded page render task
pybind11::class_<docling::page_render_result, docling::page_decode_result>(m, "_PageRenderResult",
pybind11::class_<docling::page_render_result, docling::page_task_result>(m, "_PageRenderResult",
R"(
Internal result of a threaded page rendering task.

Shares doc_key, page_number and success with _PageDecodeResult through the common _PageTaskResult base, and adds rendered image data.

Attributes:
doc_key (str): The document key this page belongs to.
page_number (int): The page number (0-indexed).
success (bool): Whether the rendering succeeded.
timings: Top-level timing breakdown for decode and render stages.
image_data: Raw RGBA bytes of the rendered page (height x width x 4, row-major).
image_shape: Shape of the image as [height, width, channels].
)")
.def_readonly("timings", &docling::page_render_result::timings)
.def("get", [](docling::page_render_result& self)
-> std::pair<std::shared_ptr<pdflib::pdf_decoder<pdflib::PAGE>>,
std::unordered_map<std::string, double>> {
if(!self.success)
{
throw std::runtime_error("Cannot get result from failed task: " + self.error_message);
}
auto timings_map = self.page_decoder->get_timings().to_sum_map();
return std::make_pair(self.page_decoder, timings_map);
},
R"(
Get the page decoder and decoder-internal timing information.

Returns:
Tuple[PdfPageDecoder, Dict[str, float]]: The page decoder and timing data.

Raises:
RuntimeError: If the task was not successful.)")
.def("error", [](docling::page_render_result& self) -> std::string {
return self.error_message;
},
R"(
Get the error message if the task failed.

Returns:
str: The error message.)")
.def_readonly("image_shape", &docling::page_render_result::image_shape)
.def("get_image", [](docling::page_render_result& self)
-> pybind11::bytes {
Expand Down
Loading
Loading