From 569df97343a193fae585bf55e5f062e5b143c1d5 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Fri, 29 May 2026 10:41:21 +0200 Subject: [PATCH 1/9] feat: improve speed of pages/sec and scalability Signed-off-by: Peter Staar --- CMakeLists.txt | 5 + app/pybind_parse.cpp | 62 +- app/run_scaling.cpp | 1023 ++++++++++++++++++++++++ docling_parse/pdf_parser.py | 36 + perf/run_scaling.py | 344 ++++++-- perf/run_scaling_visualization.py | 124 +++ src/parse/pdf_decoders/document.h | 35 +- src/parse/pdf_states/text.h | 5 + src/pybind/docling_threaded_base.h | 10 +- src/pybind/docling_threaded_parser.h | 20 + src/pybind/docling_threaded_renderer.h | 32 +- src/pybind/docling_threaded_results.h | 55 ++ tests/test_threaded_parse.py | 2 + tests/test_threaded_render.py | 2 + 14 files changed, 1672 insertions(+), 83 deletions(-) create mode 100644 app/run_scaling.cpp create mode 100644 perf/run_scaling_visualization.py create mode 100644 src/pybind/docling_threaded_results.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 94455edf..fba612d4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -156,30 +156,35 @@ add_executable(parse.exe "${TOPLEVEL_PREFIX_PATH}/app/parse.cpp") add_executable(parse_fonts.exe "${TOPLEVEL_PREFIX_PATH}/app/parse_fonts.cpp") add_executable(render.exe "${TOPLEVEL_PREFIX_PATH}/app/render.cpp") add_executable(analyse.exe "${TOPLEVEL_PREFIX_PATH}/app/analyse.cpp") +add_executable(run_scaling.exe "${TOPLEVEL_PREFIX_PATH}/app/run_scaling.cpp") # add_executable(page_images.exe "${TOPLEVEL_PREFIX_PATH}/app/page_images.cpp") set_property(TARGET parse.exe PROPERTY CXX_STANDARD 20) set_property(TARGET parse_fonts.exe PROPERTY CXX_STANDARD 20) set_property(TARGET render.exe PROPERTY CXX_STANDARD 20) set_property(TARGET analyse.exe PROPERTY CXX_STANDARD 20) +set_property(TARGET run_scaling.exe PROPERTY CXX_STANDARD 20) # set_property(TARGET page_images.exe PROPERTY CXX_STANDARD 20) add_dependencies(parse.exe ${DEPENDENCIES}) add_dependencies(parse_fonts.exe ${DEPENDENCIES}) add_dependencies(render.exe ${DEPENDENCIES}) add_dependencies(analyse.exe ${DEPENDENCIES}) +add_dependencies(run_scaling.exe ${DEPENDENCIES}) # add_dependencies(page_images.exe ${DEPENDENCIES}) target_include_directories(parse.exe INTERFACE ${DEPENDENCIES}) target_include_directories(parse_fonts.exe INTERFACE ${DEPENDENCIES}) target_include_directories(render.exe INTERFACE ${DEPENDENCIES}) target_include_directories(analyse.exe INTERFACE ${DEPENDENCIES}) +target_include_directories(run_scaling.exe INTERFACE ${DEPENDENCIES}) # target_include_directories(page_images.exe INTERFACE ${DEPENDENCIES}) target_link_libraries(parse.exe ${DEPENDENCIES} ${LIB_LINK}) target_link_libraries(parse_fonts.exe ${DEPENDENCIES} ${LIB_LINK}) target_link_libraries(render.exe ${DEPENDENCIES} ${LIB_LINK}) target_link_libraries(analyse.exe ${DEPENDENCIES} ${LIB_LINK}) +target_link_libraries(run_scaling.exe ${DEPENDENCIES} ${LIB_LINK}) # target_link_libraries(page_images.exe ${DEPENDENCIES} ${LIB_LINK}) # ********************** diff --git a/app/pybind_parse.cpp b/app/pybind_parse.cpp index 38e0aaa8..99a4c5bd 100644 --- a/app/pybind_parse.cpp +++ b/app/pybind_parse.cpp @@ -827,8 +827,29 @@ PYBIND11_MODULE(pdf_parsers, m) { // ============= Threaded PDF Parser ============= + pybind11::class_(m, "_PageDecodeTimings", + R"( + Top-level timing breakdown for a threaded page decode task. + )") + .def_readonly("make_page_decoder_s", &docling::page_decode_timings::make_page_decoder_s) + .def_readonly("decode_page_s", &docling::page_decode_timings::decode_page_s) + .def_readonly("create_word_cells_s", &docling::page_decode_timings::create_word_cells_s) + .def_readonly("create_line_cells_s", &docling::page_decode_timings::create_line_cells_s) + .def_readonly("total_s", &docling::page_decode_timings::total_s); + + pybind11::class_(m, "_PageRenderTimings", + R"( + Top-level timing breakdown for a threaded page render task. + )") + .def_readonly("render_page_s", &docling::page_render_timings::render_page_s); + + pybind11::class_(m, "_PageTaskResult") + .def_readonly("doc_key", &docling::page_task_result::doc_key) + .def_readonly("page_number", &docling::page_task_result::page_number) + .def_readonly("success", &docling::page_task_result::success); + // _PageDecodeResult - internal result of a threaded page decode task - pybind11::class_(m, "_PageDecodeResult", + pybind11::class_(m, "_PageDecodeResult", R"( Internal result of a threaded page decoding task. @@ -837,9 +858,7 @@ PYBIND11_MODULE(pdf_parsers, m) { page_number (int): The page number (0-indexed). success (bool): Whether the decoding succeeded. )") - .def_readonly("doc_key", &docling::page_decode_result::doc_key) - .def_readonly("page_number", &docling::page_decode_result::page_number) - .def_readonly("success", &docling::page_decode_result::success) + .def_readonly("timings", &docling::page_decode_result::timings) .def("get", [](docling::page_decode_result& self) -> std::pair>, std::unordered_map> { @@ -1035,16 +1054,45 @@ PYBIND11_MODULE(pdf_parsers, m) { .def_readwrite("canvas_height", &pdflib::render_config::canvas_height); // _PageRenderResult - internal result of a threaded page render task - pybind11::class_(m, "_PageRenderResult", + pybind11::class_(m, "_PageRenderResult", R"( Internal result of a threaded page rendering task. - Inherits all attributes of _PageDecodeResult and adds rendered image data. - Attributes: + doc_key (str): The document key this page belongs to. + page_number (int): The page number (0-indexed). + success (bool): Whether the rendering succeeded. + timings: Top-level timing breakdown for decode and render stages. image_data: Raw RGBA bytes of the rendered page (height x width x 4, row-major). image_shape: Shape of the image as [height, width, channels]. )") + .def_readonly("timings", &docling::page_render_result::timings) + .def("get", [](docling::page_render_result& self) + -> std::pair>, + std::unordered_map> { + if(!self.success) + { + throw std::runtime_error("Cannot get result from failed task: " + self.error_message); + } + auto timings_map = self.page_decoder->get_timings().to_sum_map(); + return std::make_pair(self.page_decoder, timings_map); + }, + R"( + Get the page decoder and decoder-internal timing information. + + Returns: + Tuple[PdfPageDecoder, Dict[str, float]]: The page decoder and timing data. + + Raises: + RuntimeError: If the task was not successful.)") + .def("error", [](docling::page_render_result& self) -> std::string { + return self.error_message; + }, + R"( + Get the error message if the task failed. + + Returns: + str: The error message.)") .def_readonly("image_shape", &docling::page_render_result::image_shape) .def("get_image", [](docling::page_render_result& self) -> pybind11::bytes { diff --git a/app/run_scaling.cpp b/app/run_scaling.cpp new file mode 100644 index 00000000..347fdfd2 --- /dev/null +++ b/app/run_scaling.cpp @@ -0,0 +1,1023 @@ +//-*-C++-*- + +#include "parse.h" +#include "render.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace +{ + using clock_type = std::chrono::steady_clock; + using doc_decoder_type = pdflib::pdf_decoder; + using page_decoder_type = pdflib::pdf_decoder; + using doc_decoder_ptr = std::shared_ptr; + + enum class run_mode + { + parse, + render, + both, + }; + + struct scheduled_doc + { + std::filesystem::path path; + int total_pages = 0; + std::vector pages; + }; + + struct page_task + { + std::size_t doc_index = 0; + int page_number = 0; + }; + + struct page_timings + { + double total_s = 0.0; + double make_page_decoder_s = 0.0; + double decode_page_s = 0.0; + double create_word_cells_s = 0.0; + double create_line_cells_s = 0.0; + double render_page_s = 0.0; + }; + + struct page_result + { + std::string doc_key; + int page_number = 0; + bool success = false; + std::string error_message; + page_timings timings; + std::shared_ptr page_decoder; + std::shared_ptr> image_data; + }; + + struct benchmark_result + { + int threads = 0; + double wall_time_s = 0.0; + int errors = 0; + }; + + struct cli_options + { + std::filesystem::path input; + run_mode mode = run_mode::render; + bool recursive = false; + std::optional max_pages = std::nullopt; + int max_concurrent_results = 64; + std::vector threads{1, 2, 4, 8, 12, 16}; + float scale = 1.0f; + bool enable_timing = false; + std::filesystem::path timing_csv = "timing-cpp.csv"; + std::string loglevel = "fatal"; + }; + + void set_loglevel(std::string level) + { + std::transform(level.begin(), level.end(), level.begin(), + [](unsigned char c) { return std::tolower(c); }); + + if(level == "info") + { + loguru::g_stderr_verbosity = loguru::Verbosity_INFO; + } + else if(level == "warning" or level == "warn") + { + loguru::g_stderr_verbosity = loguru::Verbosity_WARNING; + } + else if(level == "error") + { + loguru::g_stderr_verbosity = loguru::Verbosity_ERROR; + } + else if(level == "fatal") + { + loguru::g_stderr_verbosity = loguru::Verbosity_FATAL; + } + else + { + loguru::g_stderr_verbosity = loguru::Verbosity_ERROR; + } + } + + std::string mode_to_string(run_mode mode) + { + switch(mode) + { + case run_mode::parse: return "parse"; + case run_mode::render: return "render"; + case run_mode::both: return "both"; + } + return "render"; + } + + std::vector parse_thread_counts(const std::string& raw) + { + std::vector values; + std::stringstream ss(raw); + std::string token; + + while(std::getline(ss, token, ',')) + { + if(token.empty()) + { + continue; + } + + int value = std::stoi(token); + if(value <= 0) + { + throw std::runtime_error("--threads must contain positive integers"); + } + values.push_back(value); + } + + if(values.empty()) + { + throw std::runtime_error("--threads must contain at least one value"); + } + + return values; + } + + bool parse_bool(const std::string& raw) + { + std::string value = raw; + std::transform(value.begin(), value.end(), value.begin(), + [](unsigned char c) { return std::tolower(c); }); + + if(value == "1" or value == "true" or value == "yes" or value == "on") + { + return true; + } + if(value == "0" or value == "false" or value == "no" or value == "off") + { + return false; + } + + throw std::runtime_error("expected boolean value, got: " + raw); + } + + std::string format_duration(double seconds) + { + const int total_seconds = static_cast(seconds); + const int hours = total_seconds / 3600; + const int minutes = (total_seconds % 3600) / 60; + const int secs = total_seconds % 60; + + std::ostringstream ss; + if(hours > 0) + { + ss << hours << ":" + << std::setw(2) << std::setfill('0') << minutes << ":" + << std::setw(2) << std::setfill('0') << secs; + } + else + { + ss << minutes << ":" + << std::setw(2) << std::setfill('0') << secs; + } + return ss.str(); + } + + class progress_bar + { + public: + progress_bar(std::string label, int total): + label_(std::move(label)), + total_(std::max(0, total)), + start_(clock_type::now()), + last_draw_(start_) + { + draw(0, true); + } + + void update(int current) + { + current_ = std::min(std::max(0, current), total_); + const auto now = clock_type::now(); + const double since_last_draw = + std::chrono::duration(now - last_draw_).count(); + if(current_ == total_ or since_last_draw >= 0.1) + { + draw(current_, false); + last_draw_ = now; + } + } + + void finish() + { + draw(current_, true); + std::cerr << "\n"; + } + + private: + void draw(int current, bool force) + { + if(total_ <= 0 and not force) + { + return; + } + + const int width = 40; + const double fraction = total_ > 0 + ? static_cast(current) / static_cast(total_) : 1.0; + const int filled = std::min(width, static_cast(fraction * width)); + const double elapsed = + std::chrono::duration(clock_type::now() - start_).count(); + const double rate = elapsed > 0.0 + ? static_cast(current) / elapsed : 0.0; + const double eta = (current > 0 and total_ > current) + ? elapsed * (static_cast(total_ - current) / static_cast(current)) + : 0.0; + + std::cerr << "\r" << label_ << ": ["; + for(int i = 0; i < width; ++i) + { + std::cerr << (i < filled ? '#' : '-'); + } + std::cerr << "] " + << current << "/" << total_ << " " + << std::fixed << std::setprecision(1) << (fraction * 100.0) + << "% " + << std::fixed << std::setprecision(1) << rate << "/s " + << "elapsed " << format_duration(elapsed) + << " eta " << format_duration(eta) + << " " << std::flush; + } + + private: + std::string label_; + int total_ = 0; + int current_ = 0; + clock_type::time_point start_; + clock_type::time_point last_draw_; + }; + + pdflib::decode_config default_decode_config() + { + pdflib::decode_config config; + config.page_boundary = "crop_box"; + config.do_sanitization = true; + config.keep_char_cells = true; + config.keep_shapes = false; + config.keep_bitmaps = false; + config.max_num_lines = -1; + config.max_num_bitmaps = -1; + config.create_word_cells = false; + config.create_line_cells = false; + config.enforce_same_font = true; + config.horizontal_cell_tolerance = 1.0; + config.word_space_width_factor_for_merge = 0.33; + config.line_space_width_factor_for_merge = 1.0; + config.line_space_width_factor_for_merge_with_space = 0.33; + config.do_thread_safe = true; + config.release_native_memory_every_n_pages = 0; + config.keep_glyphs = false; + config.keep_qpdf_warnings = false; + config.materialize_bitmap_bytes = false; + return config; + } + + pdflib::render_config default_render_config(float scale) + { + pdflib::render_config config; + config.render_text = true; + config.draw_text_bbox = false; + config.draw_text_basepoint = false; + config.fit_glyph_bbox_to_target = false; + config.resolve_fonts = true; + config.font_similarity_cutoff = 0.75f; + config.scale = scale; + config.canvas_width = -1; + config.canvas_height = -1; + return config; + } + + std::vector find_pdfs(const std::filesystem::path& input, + bool recursive) + { + std::vector result; + + if(std::filesystem::is_regular_file(input)) + { + if(input.extension() == ".pdf") + { + result.push_back(input); + } + return result; + } + + if(not std::filesystem::is_directory(input)) + { + return result; + } + + if(recursive) + { + for(const auto& entry : std::filesystem::recursive_directory_iterator(input)) + { + if(entry.is_regular_file() and entry.path().extension() == ".pdf") + { + result.push_back(entry.path()); + } + } + } + else + { + for(const auto& entry : std::filesystem::directory_iterator(input)) + { + if(entry.is_regular_file() and entry.path().extension() == ".pdf") + { + result.push_back(entry.path()); + } + } + } + + std::sort(result.begin(), result.end()); + return result; + } + + int count_pages(const std::filesystem::path& pdf_path) + { + pdflib::pdf_timings timings; + doc_decoder_type doc(timings); + std::string filename = pdf_path.string(); + std::optional password = std::nullopt; + + if(not doc.process_document_from_file(filename, password)) + { + return 0; + } + + return doc.get_number_of_pages(); + } + + std::vector make_schedule(const std::vector& pdfs, + std::optional max_pages, + int& total_pages) + { + std::vector schedule; + int remaining = max_pages.value_or(std::numeric_limits::max()); + total_pages = 0; + int counted = 0; + progress_bar progress("counting pages", static_cast(pdfs.size())); + + for(const auto& pdf_path : pdfs) + { + if(remaining <= 0) + { + break; + } + + const int page_count = count_pages(pdf_path); + ++counted; + progress.update(counted); + if(page_count <= 0) + { + continue; + } + + const int selected = std::min(page_count, remaining); + + scheduled_doc doc; + doc.path = pdf_path; + doc.total_pages = page_count; + doc.pages.reserve(static_cast(selected)); + for(int page = 0; page < selected; ++page) + { + doc.pages.push_back(page); + } + + schedule.push_back(std::move(doc)); + total_pages += selected; + remaining -= selected; + } + + progress.finish(); + return schedule; + } + + std::vector load_documents(const std::vector& schedule) + { + std::vector docs; + docs.reserve(schedule.size()); + + for(const auto& entry : schedule) + { + auto doc = std::make_shared(); + std::string filename = entry.path.string(); + std::optional password = std::nullopt; + + if(not doc->process_document_from_file(filename, password)) + { + throw std::runtime_error("failed to load PDF: " + filename); + } + + docs.push_back(doc); + } + + return docs; + } + + std::vector build_tasks(const std::vector& schedule) + { + std::vector tasks; + for(std::size_t doc_index = 0; doc_index < schedule.size(); ++doc_index) + { + for(int page : schedule[doc_index].pages) + { + tasks.push_back(page_task{doc_index, page}); + } + } + return tasks; + } + + std::string csv_escape(const std::string& value) + { + if(value.find_first_of(",\"\n") == std::string::npos) + { + return value; + } + + std::string escaped = "\""; + for(char c : value) + { + if(c == '"') + { + escaped += "\"\""; + } + else + { + escaped += c; + } + } + escaped += "\""; + return escaped; + } + + class timing_csv_writer + { + public: + timing_csv_writer(bool enabled, const std::filesystem::path& path): + enabled_(enabled) + { + if(not enabled_) + { + return; + } + + const bool write_header = (not std::filesystem::exists(path)) + or std::filesystem::file_size(path) == 0; + out_.open(path, std::ios::app); + if(not out_) + { + throw std::runtime_error("could not open timing csv: " + path.string()); + } + if(write_header) + { + out_ << "mode,threads,render,doc_key,page_number,success," + << "timing_total_s,timing_make_page_decoder_s,timing_decode_page_s," + << "timing_create_word_cells_s,timing_create_line_cells_s," + << "timing_render_page_s,error_message\n"; + } + } + + void write(const std::string& mode, + int threads, + bool render, + const page_result& result) + { + if(not enabled_) + { + return; + } + + out_ << mode << ',' + << threads << ',' + << (render ? "true" : "false") << ',' + << csv_escape(result.doc_key) << ',' + << (result.page_number + 1) << ',' + << (result.success ? "true" : "false") << ',' + << result.timings.total_s << ',' + << result.timings.make_page_decoder_s << ',' + << result.timings.decode_page_s << ',' + << result.timings.create_word_cells_s << ',' + << result.timings.create_line_cells_s << ',' + << result.timings.render_page_s << ',' + << csv_escape(result.error_message) + << '\n'; + } + + private: + bool enabled_; + std::ofstream out_; + }; + + class threaded_benchmark + { + public: + threaded_benchmark(const std::vector& schedule, + const std::vector& docs, + int num_threads, + int max_concurrent_results, + pdflib::decode_config decode_config, + std::optional render_config): + schedule_(schedule), + docs_(docs), + num_threads_(num_threads), + max_concurrent_results_(max_concurrent_results), + decode_config_(decode_config), + render_config_(render_config) + {} + + benchmark_result run(const std::string& mode, + bool enable_timing, + const std::filesystem::path& timing_csv) + { + tasks_ = build_tasks(schedule_); + next_task_.store(0); + tasks_remaining_.store(static_cast(tasks_.size())); + active_workers_.store(std::min(num_threads_, static_cast(tasks_.size()))); + + timing_csv_writer csv_writer(enable_timing, timing_csv); + + auto start = clock_type::now(); + + for(int i = 0; i < active_workers_.load(); ++i) + { + workers_.emplace_back(&threaded_benchmark::worker_loop, this); + } + + int errors = 0; + int completed = 0; + progress_bar progress(render_config_.has_value() ? " rendering" : " parsing", + static_cast(tasks_.size())); + while(tasks_remaining_.load() > 0) + { + page_result result = get_result(); + ++completed; + if(not result.success) + { + ++errors; + } + + csv_writer.write(mode, num_threads_, render_config_.has_value(), result); + progress.update(completed); + } + progress.finish(); + + for(auto& worker : workers_) + { + if(worker.joinable()) + { + worker.join(); + } + } + + const double elapsed = std::chrono::duration(clock_type::now() - start).count(); + return benchmark_result{num_threads_, elapsed, errors}; + } + + private: + void worker_loop() + { + while(true) + { + const std::size_t task_index = next_task_.fetch_add(1); + if(task_index >= tasks_.size()) + { + break; + } + + const page_task task = tasks_[task_index]; + page_result result; + result.doc_key = schedule_[task.doc_index].path.string(); + result.page_number = task.page_number; + + try + { + auto total_start = clock_type::now(); + + auto stage_start = clock_type::now(); + auto page_decoder = docs_[task.doc_index]->make_thread_safe_page_decoder(task.page_number); + result.timings.make_page_decoder_s = + std::chrono::duration(clock_type::now() - stage_start).count(); + + stage_start = clock_type::now(); + page_decoder->decode_page(decode_config_); + result.timings.decode_page_s = + std::chrono::duration(clock_type::now() - stage_start).count(); + + if(decode_config_.create_word_cells) + { + stage_start = clock_type::now(); + page_decoder->create_word_cells(decode_config_); + result.timings.create_word_cells_s = + std::chrono::duration(clock_type::now() - stage_start).count(); + } + + if(decode_config_.create_line_cells) + { + stage_start = clock_type::now(); + page_decoder->create_line_cells(decode_config_); + result.timings.create_line_cells_s = + std::chrono::duration(clock_type::now() - stage_start).count(); + } + + if(render_config_.has_value()) + { + stage_start = clock_type::now(); + pdflib::renderer rnd(*render_config_); + page_decoder->get_instructions().iterate_over_instructions(rnd); + result.timings.render_page_s = + std::chrono::duration(clock_type::now() - stage_start).count(); + result.image_data = rnd.get_canvas(); + } + + result.timings.total_s = + std::chrono::duration(clock_type::now() - total_start).count(); + result.success = true; + result.page_decoder = page_decoder; + } + catch(const std::exception& exc) + { + result.success = false; + result.error_message = exc.what(); + } + + push_result(std::move(result)); + } + + active_workers_.fetch_sub(1); + results_available_.notify_all(); + } + + void push_result(page_result result) + { + std::unique_lock lock(results_mutex_); + results_consumed_.wait(lock, [this]() { + return static_cast(results_.size()) < max_concurrent_results_; + }); + + results_.push(std::move(result)); + lock.unlock(); + results_available_.notify_one(); + } + + page_result get_result() + { + std::unique_lock lock(results_mutex_); + results_available_.wait(lock, [this]() { + return not results_.empty() or active_workers_.load() == 0; + }); + + if(results_.empty()) + { + page_result result; + result.success = false; + result.error_message = "no result available"; + return result; + } + + page_result result = std::move(results_.front()); + results_.pop(); + tasks_remaining_.fetch_sub(1); + lock.unlock(); + results_consumed_.notify_one(); + return result; + } + + private: + const std::vector& schedule_; + const std::vector& docs_; + int num_threads_; + int max_concurrent_results_; + pdflib::decode_config decode_config_; + std::optional render_config_; + + std::vector tasks_; + std::atomic next_task_{0}; + std::atomic tasks_remaining_{0}; + std::atomic active_workers_{0}; + + std::queue results_; + std::mutex results_mutex_; + std::condition_variable results_available_; + std::condition_variable results_consumed_; + + std::vector workers_; + }; + + void print_decode_config(const pdflib::decode_config& config) + { + std::cout << "Decode config:\n" << config.to_string() << "\n"; + } + + void print_render_config(const pdflib::render_config& config) + { + std::cout << "Render config:\n" + << std::left + << std::setw(32) << "parameter" << "value\n" + << std::string(44, '-') << "\n" + << std::setw(32) << "render_text" << config.render_text << "\n" + << std::setw(32) << "draw_text_bbox" << config.draw_text_bbox << "\n" + << std::setw(32) << "draw_text_basepoint" << config.draw_text_basepoint << "\n" + << std::setw(32) << "fit_glyph_bbox_to_target" << config.fit_glyph_bbox_to_target << "\n" + << std::setw(32) << "resolve_fonts" << config.resolve_fonts << "\n" + << std::setw(32) << "font_similarity_cutoff" << config.font_similarity_cutoff << "\n" + << std::setw(32) << "scale" << config.scale << "\n" + << std::setw(32) << "canvas_width" << config.canvas_width << "\n" + << std::setw(32) << "canvas_height" << config.canvas_height << "\n"; + } + + void print_table(const std::string& title, + const std::vector& results, + int total_pages) + { + const double t1 = results.empty() ? 0.0 : results.front().wall_time_s; + + std::cout << "\n=== " << title << " ===\n"; + std::cout << std::left + << std::setw(18) << "backend" + << std::right + << std::setw(10) << "threads" + << std::setw(18) << "wall_time (s)" + << std::setw(18) << "vs threaded(1)" + << std::setw(14) << "pages/sec" + << std::setw(12) << "ms/page" + << std::setw(10) << "errors" + << "\n"; + std::cout << std::string(100, '-') << "\n"; + + for(const auto& result : results) + { + const double speedup = result.wall_time_s > 0.0 ? t1 / result.wall_time_s : 0.0; + const double pages_per_sec = result.wall_time_s > 0.0 + ? static_cast(total_pages) / result.wall_time_s : 0.0; + const double ms_per_page = total_pages > 0 + ? 1000.0 * result.wall_time_s / static_cast(total_pages) : 0.0; + + std::ostringstream speedup_ss; + speedup_ss << std::fixed << std::setprecision(2) << speedup << "x"; + + std::cout << std::left + << std::setw(18) << "docling threaded" + << std::right + << std::setw(10) << result.threads + << std::setw(18) << std::fixed << std::setprecision(3) << result.wall_time_s + << std::setw(18) << speedup_ss.str() + << std::setw(14) << std::fixed << std::setprecision(1) << pages_per_sec + << std::setw(12) << std::fixed << std::setprecision(2) << ms_per_page + << std::setw(10) << result.errors + << "\n"; + } + } + + void initialise_fonts() + { + std::string resource_dir = resource_utils::get_resources_dir(false).string(); + nlohmann::json data = nlohmann::json::object({}); + data[pdflib::pdf_resource::RESOURCE_DIR_KEY] = resource_dir; + std::unordered_map font_timings; + pdflib::pdf_resource::initialise(data, font_timings); + } + + cli_options parse_cli(int argc, char* argv[], pdflib::decode_config& decode_config) + { + cxxopts::Options options("run_scaling", "Thread-scaling benchmark for docling-parse C++"); + options.add_options() + ("input", "Local PDF file or directory", cxxopts::value()) + ("mode", "Benchmark stage: parse, render, or both", cxxopts::value()->default_value("render")) + ("recursive,r", "Recurse into subdirectories", cxxopts::value()->default_value("false")->implicit_value("true")) + ("max-pages,l", "Maximum number of pages to process across all input PDFs", cxxopts::value()) + ("max-concurrent-results", "Max buffered results for threaded processing", cxxopts::value()->default_value("64")) + ("threads", "Comma-separated thread counts", cxxopts::value()->default_value("1,2,4,8,12,16")) + ("scale", "Render scale for render mode", cxxopts::value()->default_value("1.0")) + ("enable-timing", "Write one CSV timing row per page result", cxxopts::value()->default_value("false")->implicit_value("true")) + ("timing-csv", "CSV path used when --enable-timing is set", cxxopts::value()->default_value("timing-cpp.csv")) + ("loglevel", "Log level [fatal, error, warning, info]", cxxopts::value()->default_value("fatal")) + ("page-boundary", "Page boundary [crop_box, media_box]", cxxopts::value()) + ("do-sanitization", "Run post-parse sanitization", cxxopts::value()) + ("keep-char-cells", "Keep individual character cells", cxxopts::value()) + ("keep-shapes", "Keep shape items", cxxopts::value()) + ("keep-bitmaps", "Keep bitmap items", cxxopts::value()) + ("max-num-lines", "Cap on number of lines per page", cxxopts::value()) + ("max-num-bitmaps", "Cap on number of bitmaps per page", cxxopts::value()) + ("create-word-cells", "Build word-level cells", cxxopts::value()) + ("create-line-cells", "Build line-level cells", cxxopts::value()) + ("enforce-same-font", "Require same font within a word/line cell", cxxopts::value()) + ("horizontal-cell-tolerance", "Horizontal merge tolerance", cxxopts::value()) + ("word-space-factor", "Space-width factor for word merging", cxxopts::value()) + ("line-space-factor", "Space-width factor for line merging", cxxopts::value()) + ("line-space-factor-with-space", "Space-width factor for line merging with space", cxxopts::value()) + ("keep-glyphs", "Keep unmapped GLYPH<...> tokens", cxxopts::value()) + ("keep-qpdf-warnings", "Emit QPDF warnings", cxxopts::value()) + ("materialize-bitmap-bytes", "Print-only parity with Python config; C++ ignores it", cxxopts::value()) + ("h,help", "Print usage"); + + options.parse_positional({"input"}); + options.positional_help("input"); + + auto result = options.parse(argc, argv); + if(result.count("help") or not result.count("input")) + { + std::cout << options.help() << "\n"; + std::exit(result.count("help") ? 0 : 1); + } + + cli_options cli; + cli.input = result["input"].as(); + cli.recursive = result["recursive"].as(); + cli.max_concurrent_results = result["max-concurrent-results"].as(); + if(cli.max_concurrent_results <= 0) + { + throw std::runtime_error("--max-concurrent-results must be positive"); + } + cli.threads = parse_thread_counts(result["threads"].as()); + cli.scale = result["scale"].as(); + cli.enable_timing = result["enable-timing"].as(); + cli.timing_csv = result["timing-csv"].as(); + cli.loglevel = result["loglevel"].as(); + + if(result.count("max-pages")) + { + cli.max_pages = result["max-pages"].as(); + } + + std::string raw_mode = result["mode"].as(); + std::transform(raw_mode.begin(), raw_mode.end(), raw_mode.begin(), + [](unsigned char c) { return std::tolower(c); }); + if(raw_mode == "parse") + { + cli.mode = run_mode::parse; + } + else if(raw_mode == "render") + { + cli.mode = run_mode::render; + } + else if(raw_mode == "both") + { + cli.mode = run_mode::both; + } + else + { + throw std::runtime_error("--mode must be one of parse, render, both"); + } + + if(result.count("page-boundary")) { decode_config.page_boundary = result["page-boundary"].as(); } + if(result.count("do-sanitization")) { decode_config.do_sanitization = parse_bool(result["do-sanitization"].as()); } + if(result.count("keep-char-cells")) { decode_config.keep_char_cells = parse_bool(result["keep-char-cells"].as()); } + if(result.count("keep-shapes")) { decode_config.keep_shapes = parse_bool(result["keep-shapes"].as()); } + if(result.count("keep-bitmaps")) { decode_config.keep_bitmaps = parse_bool(result["keep-bitmaps"].as()); } + if(result.count("max-num-lines")) { decode_config.max_num_lines = result["max-num-lines"].as(); } + if(result.count("max-num-bitmaps")) { decode_config.max_num_bitmaps = result["max-num-bitmaps"].as(); } + if(result.count("create-word-cells")) { decode_config.create_word_cells = parse_bool(result["create-word-cells"].as()); } + if(result.count("create-line-cells")) { decode_config.create_line_cells = parse_bool(result["create-line-cells"].as()); } + if(result.count("enforce-same-font")) { decode_config.enforce_same_font = parse_bool(result["enforce-same-font"].as()); } + if(result.count("horizontal-cell-tolerance")) { decode_config.horizontal_cell_tolerance = result["horizontal-cell-tolerance"].as(); } + if(result.count("word-space-factor")) { decode_config.word_space_width_factor_for_merge = result["word-space-factor"].as(); } + if(result.count("line-space-factor")) { decode_config.line_space_width_factor_for_merge = result["line-space-factor"].as(); } + if(result.count("line-space-factor-with-space")) { decode_config.line_space_width_factor_for_merge_with_space = result["line-space-factor-with-space"].as(); } + if(result.count("keep-glyphs")) { decode_config.keep_glyphs = parse_bool(result["keep-glyphs"].as()); } + if(result.count("keep-qpdf-warnings")) { decode_config.keep_qpdf_warnings = parse_bool(result["keep-qpdf-warnings"].as()); } + if(result.count("materialize-bitmap-bytes")) { decode_config.materialize_bitmap_bytes = parse_bool(result["materialize-bitmap-bytes"].as()); } + + decode_config.do_thread_safe = true; + return cli; + } +} + +int main(int argc, char* argv[]) +{ + loguru::init(argc, argv); + + try + { + pdflib::decode_config decode_config = default_decode_config(); + cli_options cli = parse_cli(argc, argv, decode_config); + set_loglevel(cli.loglevel); + initialise_fonts(); + + auto pdfs = find_pdfs(cli.input, cli.recursive); + if(pdfs.empty()) + { + std::cerr << "No PDFs found for input: " << cli.input << "\n"; + return 2; + } + + int total_pages = 0; + auto schedule = make_schedule(pdfs, cli.max_pages, total_pages); + if(schedule.empty() or total_pages <= 0) + { + std::cerr << "No pages selected for benchmarking\n"; + return 2; + } + + std::cout << "Benchmark: " << schedule.size() + << " documents, " << total_pages << " total pages\n"; + std::cout << "Mode: " << mode_to_string(cli.mode) << "\n"; + std::cout << "Thread counts to test: "; + for(std::size_t i = 0; i < cli.threads.size(); ++i) + { + if(i > 0) { std::cout << ","; } + std::cout << cli.threads[i]; + } + std::cout << "\n"; + std::cout << "Max concurrent results: " << cli.max_concurrent_results << "\n"; + if(cli.mode == run_mode::render or cli.mode == run_mode::both) + { + std::cout << "Render scale: " << cli.scale << "\n"; + } + std::cout << "\n"; + + print_decode_config(decode_config); + auto render_config = default_render_config(cli.scale); + if(cli.mode == run_mode::render or cli.mode == run_mode::both) + { + print_render_config(render_config); + } + + std::cout << "\nLoading documents ...\n"; + auto docs = load_documents(schedule); + + std::vector modes; + if(cli.mode == run_mode::both) + { + modes = {run_mode::parse, run_mode::render}; + } + else + { + modes = {cli.mode}; + } + + for(run_mode mode : modes) + { + const bool render = (mode == run_mode::render); + const std::string title = render ? "RENDER (decode + rasterise)" : "PARSE (decode only)"; + std::cout << "\n##### " << title << " #####\n"; + + std::vector results; + for(int threads : cli.threads) + { + std::cout << "Running threaded " + << (render ? "renderer" : "parser") + << " with " << threads << " threads ...\n"; + + threaded_benchmark benchmark(schedule, + docs, + threads, + cli.max_concurrent_results, + decode_config, + render ? std::optional(render_config) + : std::nullopt); + benchmark_result result = benchmark.run(render ? "render" : "parse", + cli.enable_timing, + cli.timing_csv); + results.push_back(result); + std::cout << " threads=" << threads + << ": " << std::fixed << std::setprecision(3) + << result.wall_time_s << "s"; + if(result.errors > 0) + { + std::cout << " (" << result.errors << " errors)"; + } + std::cout << "\n"; + } + + print_table(title, results, total_pages); + } + } + catch(const cxxopts::exceptions::exception& exc) + { + std::cerr << "Error parsing options: " << exc.what() << "\n"; + return 1; + } + catch(const std::exception& exc) + { + std::cerr << "Error: " << exc.what() << "\n"; + return 1; + } + + return 0; +} diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py index 21b66d40..0cdaeaed 100644 --- a/docling_parse/pdf_parser.py +++ b/docling_parse/pdf_parser.py @@ -183,6 +183,40 @@ def decode_page_keys() -> List[str]: return get_decode_page_timing_keys() +class PageDecodeTimings(BaseModel): + """Top-level timing breakdown for a threaded page decode task.""" + + model_config = ConfigDict(validate_assignment=True) + + make_page_decoder_s: float = 0.0 + decode_page_s: float = 0.0 + create_word_cells_s: float = 0.0 + create_line_cells_s: float = 0.0 + total_s: float = 0.0 + + +class PageRenderTimings(PageDecodeTimings): + """Top-level timing breakdown for a threaded page render task.""" + + render_page_s: float = 0.0 + + +def _page_timings_from_raw(raw_timings) -> PageDecodeTimings | PageRenderTimings: + data = { + "make_page_decoder_s": raw_timings.make_page_decoder_s, + "decode_page_s": raw_timings.decode_page_s, + "create_word_cells_s": raw_timings.create_word_cells_s, + "create_line_cells_s": raw_timings.create_line_cells_s, + "total_s": raw_timings.total_s, + } + if hasattr(raw_timings, "render_page_s"): + return PageRenderTimings( + **data, + render_page_s=raw_timings.render_page_s, + ) + return PageDecodeTimings(**data) + + def _to_bounding_rectangle( bbox: tuple[float, float, float, float], ) -> BoundingRectangle: @@ -961,11 +995,13 @@ def __init__( if self.success: self._page_decoder, _ = raw_result.get() self._timings = _timings_from_decoder(self._page_decoder) + self.timings = _page_timings_from_raw(raw_result.timings) self.page_width, self.page_height = _page_size_from_decoder( self._page_decoder, boundary_type ) else: self._timings = Timings() + self.timings = _page_timings_from_raw(raw_result.timings) self.page_width = 0.0 self.page_height = 0.0 diff --git a/perf/run_scaling.py b/perf/run_scaling.py index 843e917c..723c7d32 100644 --- a/perf/run_scaling.py +++ b/perf/run_scaling.py @@ -28,19 +28,27 @@ from __future__ import annotations import argparse +import csv import sys import time +from datetime import datetime from pathlib import Path from typing import List, Tuple +from docling_core.types.doc.page import SegmentedPdfPage +from PIL import Image as PILImage from tabulate import tabulate from tqdm import tqdm - DEFAULT_HF_REPO_ID = "docling-project/performance-dataset-bo767" HF_PDF_SUBDIR = "pdf" +def _default_timing_csv_path() -> Path: + timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + return Path(f"timing-{timestamp}.csv") + + # -------- Input resolution -------- @@ -79,20 +87,60 @@ def resolve_pdf_inputs(input_str: str, recursive: bool = False) -> List[Path]: return find_pdfs(pdf_dir, recursive=True) -def count_pages(pdf_paths: List[Path]) -> int: - """Count total pages across all PDFs using DoclingPdfParser.""" +def page_counts(pdf_paths: List[Path]) -> List[Tuple[Path, int]]: + """Count pages per PDF using DoclingPdfParser.""" from docling_parse.pdf_parser import DoclingPdfParser parser = DoclingPdfParser(loglevel="fatal") - total = 0 + counts: List[Tuple[Path, int]] = [] for pdf_path in tqdm(pdf_paths, desc="counting pages", unit="doc"): try: d = parser.load(str(pdf_path), lazy=True) - total += d.number_of_pages() + counts.append((pdf_path, d.number_of_pages())) d.unload() except Exception: pass - return total + return counts + + +def apply_max_pages( + pdf_paths: List[Path], max_pages: int | None +) -> Tuple[List[Tuple[Path, List[int] | None]], int]: + """Apply an exact total-page cap across PDFs in input order. + + Returns a schedule of `(pdf_path, page_numbers)` where `page_numbers` is + `None` for all pages in a document, or an explicit 1-indexed subset for the + final truncated document. The second return value is the total scheduled + page count. + """ + counts = page_counts(pdf_paths) + if max_pages is None: + return [(pdf_path, None) for pdf_path, _ in counts], sum( + count for _, count in counts + ) + + if max_pages <= 0: + return [], 0 + + schedule: List[Tuple[Path, List[int] | None]] = [] + remaining = max_pages + total = 0 + + for pdf_path, count in counts: + if remaining <= 0: + break + if count <= remaining: + schedule.append((pdf_path, None)) + remaining -= count + total += count + else: + page_numbers = list(range(1, remaining + 1)) + schedule.append((pdf_path, page_numbers)) + total += remaining + remaining = 0 + break + + return schedule, total # -------- Decode config helper -------- @@ -102,19 +150,131 @@ def _decode_config(): from docling_parse.pdf_parsers import DecodePageConfig # type: ignore[import] c = DecodePageConfig() - c.keep_char_cells = False + c.keep_char_cells = True c.keep_shapes = False - c.keep_bitmaps = True + # c.keep_bitmaps = True + c.keep_bitmaps = False c.materialize_bitmap_bytes = False c.create_word_cells = False - c.create_line_cells = True + # c.create_line_cells = True + c.create_line_cells = False return c +def _config_rows(config, fields: List[str]) -> List[List[str]]: + return [[field, getattr(config, field)] for field in fields] + + +def _print_run_configs(*, render: bool, scale: float) -> None: + from docling_parse.pdf_parsers import RenderConfig # type: ignore[import] + + decode_config = _decode_config() + decode_fields = [ + "page_boundary", + "do_sanitization", + "keep_char_cells", + "keep_shapes", + "keep_bitmaps", + "max_num_lines", + "max_num_bitmaps", + "create_word_cells", + "create_line_cells", + "enforce_same_font", + "horizontal_cell_tolerance", + "word_space_width_factor_for_merge", + "line_space_width_factor_for_merge", + "line_space_width_factor_for_merge_with_space", + "do_thread_safe", + "release_native_memory_every_n_pages", + "keep_glyphs", + "keep_qpdf_warnings", + "materialize_bitmap_bytes", + ] + print("Decode config:") + print( + tabulate( + _config_rows(decode_config, decode_fields), + headers=["parameter", "value"], + ) + ) + print() + + print("Render config:") + if not render: + print(tabulate([["enabled", False]], headers=["parameter", "value"])) + return + + render_config = RenderConfig() + render_config.scale = scale + render_fields = [ + "render_text", + "draw_text_bbox", + "draw_text_basepoint", + "fit_glyph_bbox_to_target", + "resolve_fonts", + "font_similarity_cutoff", + "scale", + "canvas_width", + "canvas_height", + ] + print( + tabulate( + _config_rows(render_config, render_fields), + headers=["parameter", "value"], + ) + ) + + +def _timing_csv_fieldnames() -> List[str]: + return [ + "mode", + "threads", + "render", + "doc_key", + "page_number", + "success", + "timing_total_s", + "timing_make_page_decoder_s", + "timing_decode_page_s", + "timing_create_word_cells_s", + "timing_create_line_cells_s", + "timing_render_page_s", + "error_message", + ] + + +def _timing_csv_row( + *, mode: str, num_threads: int, render: bool, result +) -> dict[str, object]: + row: dict[str, object] = { + "mode": mode, + "threads": num_threads, + "render": render, + "doc_key": result.doc_key, + "page_number": result.page_number, + "success": result.success, + "error_message": result.error_message, + } + timing_keys = _timing_csv_fieldnames()[7:-1] + if result.success: + timings = result.timings + row["timing_total_s"] = timings.total_s + row["timing_make_page_decoder_s"] = timings.make_page_decoder_s + row["timing_decode_page_s"] = timings.decode_page_s + row["timing_create_word_cells_s"] = timings.create_word_cells_s + row["timing_create_line_cells_s"] = timings.create_line_cells_s + row["timing_render_page_s"] = getattr(timings, "render_page_s", 0.0) + else: + row["timing_total_s"] = 0.0 + for key in timing_keys: + row[key] = 0.0 + return row + + # -------- Baselines -------- -def run_sequential_parse(pdf_paths: List[Path]) -> float: +def run_sequential_parse(pdf_schedule: List[Tuple[Path, List[int] | None]]) -> float: """Sequential DoclingPdfParser decode (no render). Returns wall time in seconds.""" from docling_parse.pdf_parser import DoclingPdfParser @@ -124,18 +284,26 @@ def run_sequential_parse(pdf_paths: List[Path]) -> float: parser = DoclingPdfParser(loglevel="fatal") t0 = time.perf_counter() - for pdf_path in tqdm(pdf_paths, desc=" sequential parse", unit="doc", leave=False): + for pdf_path, page_numbers in tqdm( + pdf_schedule, desc=" sequential parse", unit="doc", leave=False + ): try: doc = parser.load(str(pdf_path), lazy=True) - for _, _ in doc.iterate_pages(config=config): - pass + if page_numbers is None: + for _, _ in doc.iterate_pages(config=config): + pass + else: + for page_number in page_numbers: + _ = doc.get_page(page_number, config=config) doc.unload() except Exception as e: print(f" sequential error on {pdf_path}: {e}") return time.perf_counter() - t0 -def run_pypdfium_parse(pdf_paths: List[Path], total_pages: int) -> float: +def run_pypdfium_parse( + pdf_schedule: List[Tuple[Path, List[int] | None]], total_pages: int +) -> float: """Single-threaded pypdfium2 text extraction.""" try: import pypdfium2 as pdfium # type: ignore @@ -146,7 +314,7 @@ def run_pypdfium_parse(pdf_paths: List[Path], total_pages: int) -> float: t0 = time.perf_counter() errors = 0 with tqdm(total=total_pages, desc=" pypdfium2-parse", unit="page") as pbar: - for pdf_path in pdf_paths: + for pdf_path, page_numbers in pdf_schedule: try: doc = pdfium.PdfDocument(str(pdf_path)) except Exception as e: @@ -154,7 +322,12 @@ def run_pypdfium_parse(pdf_paths: List[Path], total_pages: int) -> float: errors += 1 continue try: - for i in range(len(doc)): + pages = ( + range(len(doc)) + if page_numbers is None + else (page_number - 1 for page_number in page_numbers) + ) + for i in pages: try: page = doc[i] text_page = page.get_textpage() @@ -177,7 +350,9 @@ def run_pypdfium_parse(pdf_paths: List[Path], total_pages: int) -> float: return time.perf_counter() - t0 -def run_pypdfium_render(pdf_paths: List[Path], total_pages: int) -> float: +def run_pypdfium_render( + pdf_schedule: List[Tuple[Path, List[int] | None]], total_pages: int +) -> float: """Single-threaded pypdfium2: text extract + scale=2 render to PIL.""" try: import pypdfium2 as pdfium # type: ignore @@ -188,7 +363,7 @@ def run_pypdfium_render(pdf_paths: List[Path], total_pages: int) -> float: t0 = time.perf_counter() errors = 0 with tqdm(total=total_pages, desc=" pypdfium2-render", unit="page") as pbar: - for pdf_path in pdf_paths: + for pdf_path, page_numbers in pdf_schedule: try: doc = pdfium.PdfDocument(str(pdf_path)) except Exception as e: @@ -196,7 +371,12 @@ def run_pypdfium_render(pdf_paths: List[Path], total_pages: int) -> float: errors += 1 continue try: - for i in range(len(doc)): + pages = ( + range(len(doc)) + if page_numbers is None + else (page_number - 1 for page_number in page_numbers) + ) + for i in pages: try: page = doc[i] text_page = page.get_textpage() @@ -222,7 +402,9 @@ def run_pypdfium_render(pdf_paths: List[Path], total_pages: int) -> float: return time.perf_counter() - t0 -def run_pymupdf_parse(pdf_paths: List[Path], total_pages: int) -> float: +def run_pymupdf_parse( + pdf_schedule: List[Tuple[Path, List[int] | None]], total_pages: int +) -> float: """Single-threaded pymupdf text extraction.""" try: import fitz # PyMuPDF @@ -240,7 +422,7 @@ def run_pymupdf_parse(pdf_paths: List[Path], total_pages: int) -> float: t0 = time.perf_counter() errors = 0 with tqdm(total=total_pages, desc=" pymupdf-parse", unit="page") as pbar: - for pdf_path in pdf_paths: + for pdf_path, page_numbers in pdf_schedule: try: doc = fitz.open(str(pdf_path)) except Exception as e: @@ -248,7 +430,8 @@ def run_pymupdf_parse(pdf_paths: List[Path], total_pages: int) -> float: errors += 1 continue try: - for page in doc: + pages = doc if page_numbers is None else (doc[page_number - 1] for page_number in page_numbers) + for page in pages: try: _ = page.get_text("text") except Exception as e: @@ -265,7 +448,9 @@ def run_pymupdf_parse(pdf_paths: List[Path], total_pages: int) -> float: return time.perf_counter() - t0 -def run_pymupdf_render(pdf_paths: List[Path], total_pages: int) -> float: +def run_pymupdf_render( + pdf_schedule: List[Tuple[Path, List[int] | None]], total_pages: int +) -> float: """Single-threaded pymupdf: text extract + scale=2 render to PIL.""" try: import fitz # PyMuPDF @@ -282,7 +467,7 @@ def run_pymupdf_render(pdf_paths: List[Path], total_pages: int) -> float: t0 = time.perf_counter() errors = 0 with tqdm(total=total_pages, desc=" pymupdf-render", unit="page") as pbar: - for pdf_path in pdf_paths: + for pdf_path, page_numbers in pdf_schedule: try: doc = fitz.open(str(pdf_path)) except Exception as e: @@ -290,7 +475,8 @@ def run_pymupdf_render(pdf_paths: List[Path], total_pages: int) -> float: errors += 1 continue try: - for page in doc: + pages = doc if page_numbers is None else (doc[page_number - 1] for page_number in page_numbers) + for page in pages: try: _ = page.get_text("text") pix = page.get_pixmap(matrix=matrix) @@ -339,13 +525,15 @@ def parse_other_arg(arg: str) -> List[str]: def run_threaded( - pdf_paths: List[Path], + pdf_schedule: List[Tuple[Path, List[int] | None]], num_threads: int, max_concurrent_results: int, total_pages: int, *, render: bool, scale: float, + enable_timing: bool, + timing_csv: Path, ) -> float: """Run DoclingThreadedPdfParser; render=True enables rasterisation.""" from docling_parse.pdf_parser import ( @@ -373,26 +561,63 @@ def run_threaded( decode_config=decode_config, ) - for pdf_path in tqdm(pdf_paths, desc=" loading", unit="doc", leave=False): + for pdf_path, page_numbers in tqdm( + pdf_schedule, desc=" loading", unit="doc", leave=False + ): try: - parser.load(str(pdf_path)) + parser.load(str(pdf_path), page_numbers=page_numbers) except Exception as e: print(f" threaded load error on {pdf_path}: {e}") desc = " rendering" if render else " parsing" + mode = "render" if render else "parse" t0 = time.perf_counter() errors = 0 - with tqdm(total=total_pages, desc=desc, unit="page") as pbar: - for result in parser.iterate_results(): - if result.success: - if render: - _ = result.get_image() - #_ = result.get_page() + timing_handle = None + timing_writer = None + try: + if enable_timing: + timing_csv.parent.mkdir(parents=True, exist_ok=True) + timing_handle = timing_csv.open("a", newline="", encoding="utf-8") + timing_writer = csv.DictWriter( + timing_handle, + fieldnames=_timing_csv_fieldnames(), + ) + if timing_handle.tell() == 0: + timing_writer.writeheader() + + with tqdm(total=total_pages, desc=desc, unit="page") as pbar: + for result in parser.iterate_results(): + if result.success: + if render: + img: PILImage = result.get_image() + page: SegmentedPdfPage = result.get_page() + + """ + assert len(page.shapes)==0, "len(page.shapes)==0" + assert len(page.char_cells)==0, "len(page.char_cells)==0" + + for br in page.bitmap_resources: + assert br.image==None + """ + else: + page = result.get_page() else: - _ = result.get_page() - else: - errors += 1 - pbar.update(1) + errors += 1 + + if timing_writer is not None: + timing_writer.writerow( + _timing_csv_row( + mode=mode, + num_threads=num_threads, + render=render, + result=result, + ) + ) + pbar.update(1) + finally: + if timing_handle is not None: + timing_handle.close() t1 = time.perf_counter() if errors: print(f" threads={num_threads}: {errors} page errors") @@ -467,7 +692,7 @@ def _row(name: str, threads, t: float) -> List[str]: def _run_one_mode( - pdf_paths: List[Path], + pdf_schedule: List[Tuple[Path, List[int] | None]], thread_counts: List[int], max_concurrent_results: int, total_pages: int, @@ -475,6 +700,8 @@ def _run_one_mode( *, render: bool, scale: float, + enable_timing: bool, + timing_csv: Path, ) -> Tuple[List[Tuple[str, float]], List[Tuple[int, float]]]: baselines: List[Tuple[str, float]] = [] @@ -482,7 +709,7 @@ def _run_one_mode( # (DoclingPdfParser has no rendering path). if not render: print("Running sequential (DoclingPdfParser) ...") - t = run_sequential_parse(pdf_paths) + t = run_sequential_parse(pdf_schedule) print(f" sequential: {t:.3f}s") baselines.append(("sequential docling (1t)", t)) print() @@ -491,7 +718,7 @@ def _run_one_mode( for name in other_backends: fn = OTHER_BACKENDS[name][stage] print(f"Running {name} {stage} reference (1 thread) ...") - t = fn(pdf_paths, total_pages) + t = fn(pdf_schedule, total_pages) print(f" {name}: {t:.3f}s") baselines.append((f"{name} (1t)", t)) print() @@ -501,12 +728,14 @@ def _run_one_mode( for n in thread_counts: print(f"Running threaded {stage_label} with {n} threads ...") t = run_threaded( - pdf_paths, + pdf_schedule, num_threads=n, max_concurrent_results=max_concurrent_results, total_pages=total_pages, render=render, scale=scale, + enable_timing=enable_timing, + timing_csv=timing_csv, ) threaded_results.append((n, t)) print(f" threads={n}: {t:.3f}s") @@ -542,8 +771,8 @@ def main(argv: List[str]) -> int: help="Recurse into subdirectories (local paths only; HF downloads always recurse)", ) ap.add_argument( - "--limit", "-l", type=int, default=None, - help="Maximum number of documents to process", + "--max-pages", "-l", type=int, default=None, + help="Maximum number of pages to process across all input PDFs", ) ap.add_argument( "--max-concurrent-results", type=int, default=64, @@ -567,6 +796,18 @@ def main(argv: List[str]) -> int: 'Default: "pypdfium2". Use "" to skip.' ), ) + ap.add_argument( + "--enable-timing", + action=argparse.BooleanOptionalAction, + default=False, + help="Write one CSV timing row per page result (default: disabled)", + ) + ap.add_argument( + "--timing-csv", + type=Path, + default=_default_timing_csv_path(), + help="CSV path used when --enable-timing is set", + ) args = ap.parse_args(argv) @@ -575,15 +816,16 @@ def main(argv: List[str]) -> int: other_backends = parse_other_arg(args.other) pdfs = resolve_pdf_inputs(args.input, recursive=args.recursive) - if args.limit is not None: - pdfs = pdfs[: args.limit] if not pdfs: print(f"No PDFs found for input: {args.input}", file=sys.stderr) return 2 - total_pages = count_pages(pdfs) + pdf_schedule, total_pages = apply_max_pages(pdfs, args.max_pages) + if not pdf_schedule or total_pages <= 0: + print("No pages selected for benchmarking", file=sys.stderr) + return 2 - print(f"Benchmark: {len(pdfs)} documents, {total_pages} total pages") + print(f"Benchmark: {len(pdf_schedule)} documents, {total_pages} total pages") print(f"Mode: {args.mode}") print(f"Thread counts to test: {thread_counts}") print(f"Max concurrent results: {args.max_concurrent_results}") @@ -591,6 +833,8 @@ def main(argv: List[str]) -> int: if args.mode in ("render", "both"): print(f"Render scale: {args.scale}") print() + _print_run_configs(render=args.mode in ("render", "both"), scale=args.scale) + print() modes_to_run = ["parse", "render"] if args.mode == "both" else [args.mode] for m in modes_to_run: @@ -598,13 +842,15 @@ def main(argv: List[str]) -> int: title = "RENDER (decode + rasterise)" if render else "PARSE (decode only)" print(f"\n##### {title} #####") baselines, threaded_results = _run_one_mode( - pdfs, + pdf_schedule, thread_counts, args.max_concurrent_results, total_pages, other_backends, render=render, scale=args.scale, + enable_timing=args.enable_timing, + timing_csv=args.timing_csv, ) _print_table(title, baselines, threaded_results, total_pages) diff --git a/perf/run_scaling_visualization.py b/perf/run_scaling_visualization.py new file mode 100644 index 00000000..cd68be9d --- /dev/null +++ b/perf/run_scaling_visualization.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +""" +Visualize per-page total timings from perf/run_scaling.py CSV output. + +This script reads the timing CSV written by `perf/run_scaling.py +--enable-timing` and creates one histogram per thread count, using the +top-level `timing_total_s` column. + +Usage: + python perf/run_scaling_visualization.py timing-2026-05-28-53-19.csv + python perf/run_scaling_visualization.py timing.csv --mode render --bins 80 +""" + +from __future__ import annotations + +import argparse +import csv +import math +import time +from collections import defaultdict +from pathlib import Path +from typing import Dict, List + +import matplotlib +import numpy as np + +matplotlib.use("Agg") +import matplotlib.pyplot as plt + + +def timestamped_out_dir() -> Path: + ts = time.strftime("%Y%m%d-%H%M%S") + return Path("perf") / "results" / f"scaling_viz_{ts}" + + +def read_timings(path: Path, mode: str) -> Dict[int, List[float]]: + per_threads: Dict[int, List[float]] = defaultdict(list) + with path.open("r", newline="", encoding="utf-8") as f: + reader = csv.DictReader(f) + for row in reader: + if mode != "both" and row.get("mode") != mode: + continue + + success = str(row.get("success", "")).strip().lower() in {"1", "true"} + if not success: + continue + + try: + threads = int(row["threads"]) + total_s = float(row["timing_total_s"]) + except Exception: + continue + + if math.isfinite(total_s) and total_s >= 0.0: + per_threads[threads].append(total_s) + + return dict(sorted(per_threads.items())) + + +def plot_histograms( + per_threads: Dict[int, List[float]], out_dir: Path, bins: int, mode: str +) -> None: + out_dir.mkdir(parents=True, exist_ok=True) + + filtered: List[tuple[int, List[float]]] = [] + for threads, values in per_threads.items(): + positive_values = [value for value in values if value > 0.0] + if not positive_values: + continue + filtered.append((threads, positive_values)) + + if not filtered: + return + + global_min = min(min(values) for _, values in filtered) + global_max = max(max(values) for _, values in filtered) + if global_min == global_max: + global_min *= 0.5 + global_max *= 2.0 + + bin_edges = np.logspace(np.log10(global_min), np.log10(global_max), bins + 1) + + fig, axes = plt.subplots( + nrows=len(filtered), + ncols=1, + figsize=(9, 3.2 * len(filtered)), + sharex=True, + squeeze=False, + ) + + for ax, (threads, values) in zip(axes.flat, filtered): + ax.hist(values, bins=bin_edges, color="#1f77b4", alpha=0.85) + ax.set_xscale("log") + ax.set_ylabel("count") + ax.set_title(f"threads={threads} (n={len(values)})") + + axes[-1, 0].set_xlabel("total time / page (s)") + fig.suptitle(f"Per-page total timing histograms — mode={mode}", y=0.995) + fig.tight_layout() + fig.savefig(out_dir / "hist_stacked.png", dpi=160) + plt.close(fig) + + +def main() -> int: + ap = argparse.ArgumentParser( + description="Visualize per-page total timings from perf/run_scaling.py CSV output" + ) + ap.add_argument("timing_csv", type=Path, help="Timing CSV from perf/run_scaling.py") + ap.add_argument("--mode", choices=["parse", "render", "both"], default="both") + ap.add_argument("--bins", type=int, default=50) + ap.add_argument("--out-dir", type=Path, default=timestamped_out_dir()) + args = ap.parse_args() + + per_threads = read_timings(args.timing_csv, args.mode) + if not per_threads: + raise SystemExit("No timing rows matched the requested filters") + + plot_histograms(per_threads, args.out_dir, args.bins, args.mode) + print(f"Wrote histograms to {args.out_dir}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/parse/pdf_decoders/document.h b/src/parse/pdf_decoders/document.h index 28a26399..9072bfb2 100644 --- a/src/parse/pdf_decoders/document.h +++ b/src/parse/pdf_decoders/document.h @@ -64,7 +64,7 @@ namespace pdflib private: - std::shared_ptr get_thread_safe_page_buffer(int page_ind); + std::pair > get_thread_safe_page_buffer(int page_ind); void ensure_annots_loaded(); @@ -266,14 +266,26 @@ namespace pdflib return true; } - std::shared_ptr pdf_decoder::get_thread_safe_page_buffer(int page_ind) + std::pair > pdf_decoder::get_thread_safe_page_buffer(int page_ind) { + std::pair > result(-1, nullptr); + + /* + if(not qpdf_document.anyWarnings()) + { + result.first = page_ind; + result.second = buffer; + + return result; + } + */ + // Thread-safe decoding uses standalone one-page PDF buffers. // Page extraction and serialization are intentionally serialized, and // the mutex is expected to remain held across QPDFWriter::write(). std::lock_guard lock(thread_safe_buffer_mutex); - - std::shared_ptr result = nullptr; + + //std::shared_ptr result = nullptr; QPDFObjectHandle qpdf_page = qpdf_pages.at(page_ind); @@ -300,7 +312,8 @@ namespace pdflib auto out = writer.getBufferSharedPointer(); - result = std::make_shared(reinterpret_cast(out->getBuffer()), + result.first = 0; + result.second = std::make_shared(reinterpret_cast(out->getBuffer()), out->getSize()); LOG_S(INFO) << "writing a pdf-page buffer in " << page_timer.get_time() << " [sec]"; @@ -312,11 +325,17 @@ namespace pdflib pdf_decoder::page_decoder_ptr pdf_decoder::make_thread_safe_page_decoder(int page_number) { - std::shared_ptr page_buffer = get_thread_safe_page_buffer(page_number); + std::pair > result = get_thread_safe_page_buffer(page_number); + + int orig_page_number = page_number; + int curr_page_number = result.first; + + std::shared_ptr page_buffer = result.second; + return std::make_shared>(page_buffer, password, - page_number, - 0); + orig_page_number, + curr_page_number); } void pdf_decoder::decode_document(const decode_config& config) diff --git a/src/parse/pdf_states/text.h b/src/parse/pdf_states/text.h index e4af9e7a..29c82d43 100644 --- a/src/parse/pdf_states/text.h +++ b/src/parse/pdf_states/text.h @@ -495,6 +495,11 @@ namespace pdflib const bool need_base_text_cells = config.keep_char_cells or config.create_word_cells or config.create_line_cells; + if(not need_base_text_cells) + { + return; + } + LOG_S(INFO) << __FUNCTION__ << " with text='" << text << "', width=" << width << " from base-font: " << font.get_base_font() << ", font-key: " << font.get_key(); bool left_to_right = (not utils::string::is_right_to_left(text)); diff --git a/src/pybind/docling_threaded_base.h b/src/pybind/docling_threaded_base.h index 26c40f37..29318ba9 100644 --- a/src/pybind/docling_threaded_base.h +++ b/src/pybind/docling_threaded_base.h @@ -22,20 +22,12 @@ #include #include +#include #include namespace docling { - struct page_decode_result - { - std::string doc_key; - int page_number; - bool success; - std::string error_message; - std::shared_ptr> page_decoder; - }; - // --------------------------------------------------------------------------- // docling_threaded_base // diff --git a/src/pybind/docling_threaded_parser.h b/src/pybind/docling_threaded_parser.h index 462bcbc2..3eaed16b 100644 --- a/src/pybind/docling_threaded_parser.h +++ b/src/pybind/docling_threaded_parser.h @@ -3,6 +3,8 @@ #ifndef PYBIND_THREADED_PDF_PARSER_H #define PYBIND_THREADED_PDF_PARSER_H +#include + #include namespace docling @@ -27,6 +29,8 @@ namespace docling inline void docling_threaded_parser::worker_loop() { + using clock_type = std::chrono::steady_clock; + while(true) { std::pair task; @@ -61,20 +65,36 @@ namespace docling { auto& doc_decoder = itr->second; + auto total_start = clock_type::now(); + + auto stage_start = clock_type::now(); auto page_decoder = doc_decoder->make_thread_safe_page_decoder(page_number); + result.timings.make_page_decoder_s + = std::chrono::duration(clock_type::now() - stage_start).count(); + stage_start = clock_type::now(); page_decoder->decode_page(config); + result.timings.decode_page_s + = std::chrono::duration(clock_type::now() - stage_start).count(); if(config.create_word_cells) { + stage_start = clock_type::now(); page_decoder->create_word_cells(config); + result.timings.create_word_cells_s + = std::chrono::duration(clock_type::now() - stage_start).count(); } if(config.create_line_cells) { + stage_start = clock_type::now(); page_decoder->create_line_cells(config); + result.timings.create_line_cells_s + = std::chrono::duration(clock_type::now() - stage_start).count(); } + result.timings.total_s + = std::chrono::duration(clock_type::now() - total_start).count(); result.success = true; result.page_decoder = page_decoder; } diff --git a/src/pybind/docling_threaded_renderer.h b/src/pybind/docling_threaded_renderer.h index 70051193..952f7d59 100644 --- a/src/pybind/docling_threaded_renderer.h +++ b/src/pybind/docling_threaded_renderer.h @@ -3,7 +3,7 @@ #ifndef PYBIND_THREADED_PDF_RENDERER_H #define PYBIND_THREADED_PDF_RENDERER_H -#include +#include #include #include @@ -12,15 +12,6 @@ namespace docling { - struct page_render_result : page_decode_result - { - // RGBA pixel data laid out as {height, width, 4} row-major top-to-bottom. - // Suitable for direct consumption by PIL: - // Image.frombuffer("RGBA", (w, h), data, "raw", "RGBA", 0, 1) - std::shared_ptr> image_data; - std::array image_shape{0, 0, 4}; // {height, width, channels} - }; - class docling_threaded_renderer : public docling_threaded_base { @@ -47,6 +38,8 @@ namespace docling inline void docling_threaded_renderer::worker_loop() { + using clock_type = std::chrono::steady_clock; + while(true) { std::pair task; @@ -81,23 +74,42 @@ namespace docling { auto& doc_decoder = itr->second; + auto total_start = clock_type::now(); + + auto stage_start = clock_type::now(); auto page_decoder = doc_decoder->make_thread_safe_page_decoder(page_number); + result.timings.make_page_decoder_s + = std::chrono::duration(clock_type::now() - stage_start).count(); + stage_start = clock_type::now(); page_decoder->decode_page(config); + result.timings.decode_page_s + = std::chrono::duration(clock_type::now() - stage_start).count(); if(config.create_word_cells) { + stage_start = clock_type::now(); page_decoder->create_word_cells(config); + result.timings.create_word_cells_s + = std::chrono::duration(clock_type::now() - stage_start).count(); } if(config.create_line_cells) { + stage_start = clock_type::now(); page_decoder->create_line_cells(config); + result.timings.create_line_cells_s + = std::chrono::duration(clock_type::now() - stage_start).count(); } + stage_start = clock_type::now(); pdflib::renderer rnd(render_cfg); page_decoder->get_instructions().iterate_over_instructions(rnd); + result.timings.render_page_s + = std::chrono::duration(clock_type::now() - stage_start).count(); + result.timings.total_s + = std::chrono::duration(clock_type::now() - total_start).count(); result.success = true; result.page_decoder = page_decoder; result.image_data = rnd.get_canvas(); diff --git a/src/pybind/docling_threaded_results.h b/src/pybind/docling_threaded_results.h new file mode 100644 index 00000000..2583acf6 --- /dev/null +++ b/src/pybind/docling_threaded_results.h @@ -0,0 +1,55 @@ +//-*-C++-*- + +#ifndef PYBIND_DOCLING_THREADED_RESULTS_H +#define PYBIND_DOCLING_THREADED_RESULTS_H + +#include +#include +#include +#include + +#include + +namespace docling +{ + struct page_decode_timings + { + double make_page_decoder_s = 0.0; + double decode_page_s = 0.0; + double create_word_cells_s = 0.0; + double create_line_cells_s = 0.0; + double total_s = 0.0; + }; + + struct page_render_timings : page_decode_timings + { + double render_page_s = 0.0; + }; + + struct page_task_result + { + std::string doc_key; + int page_number = 0; + bool success = false; + std::string error_message; + std::shared_ptr> page_decoder; + }; + + struct page_decode_result : page_task_result + { + page_decode_timings timings; + }; + + struct page_render_result : page_task_result + { + page_render_timings timings; + + // RGBA pixel data laid out as {height, width, 4} row-major top-to-bottom. + // Suitable for direct consumption by PIL: + // Image.frombuffer("RGBA", (w, h), data, "raw", "RGBA", 0, 1) + std::shared_ptr> image_data; + std::array image_shape{0, 0, 4}; // {height, width, channels} + }; +} + +#endif diff --git a/tests/test_threaded_parse.py b/tests/test_threaded_parse.py index cdac5c92..befb9c20 100644 --- a/tests/test_threaded_parse.py +++ b/tests/test_threaded_parse.py @@ -123,6 +123,8 @@ def test_threaded_single_document(): assert result.page_width > 0 assert result.page_height > 0 assert result.get_timings().total() > 0 + assert result.timings.total_s > 0 + assert result.timings.decode_page_s > 0 count += 1 assert count == parser.page_count(key) diff --git a/tests/test_threaded_render.py b/tests/test_threaded_render.py index 3aa77ad5..eb18fdac 100644 --- a/tests/test_threaded_render.py +++ b/tests/test_threaded_render.py @@ -115,6 +115,8 @@ def test_render_single_document(): assert image.width > 0 assert image.height > 0 assert result.get_page().dimension.rect is not None + assert result.timings.total_s > 0 + assert result.timings.render_page_s >= 0 count += 1 From b93e8508ea719867c07dd2533b1b80d13cdd28df Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Fri, 29 May 2026 12:58:28 +0200 Subject: [PATCH 2/9] updated the renderer with a page specific BLContext Signed-off-by: Peter Staar --- app/run_scaling.cpp | 8 +-- src/render/blend2d_renderer.h | 102 +++++++++++++++++++++++----------- 2 files changed, 75 insertions(+), 35 deletions(-) diff --git a/app/run_scaling.cpp b/app/run_scaling.cpp index 347fdfd2..e4caf34f 100644 --- a/app/run_scaling.cpp +++ b/app/run_scaling.cpp @@ -247,8 +247,8 @@ namespace std::chrono::duration(clock_type::now() - start_).count(); const double rate = elapsed > 0.0 ? static_cast(current) / elapsed : 0.0; - const double eta = (current > 0 and total_ > current) - ? elapsed * (static_cast(total_ - current) / static_cast(current)) + const double total = (current > 0 and total_ > current) + ? elapsed * (static_cast(total_) / static_cast(current)) : 0.0; std::cerr << "\r" << label_ << ": ["; @@ -261,8 +261,8 @@ namespace << std::fixed << std::setprecision(1) << (fraction * 100.0) << "% " << std::fixed << std::setprecision(1) << rate << "/s " - << "elapsed " << format_duration(elapsed) - << " eta " << format_duration(eta) + << "elapsed: " << format_duration(elapsed) << " [sec]" + << " total: " << format_duration(total) << " [sec]" << " " << std::flush; } diff --git a/src/render/blend2d_renderer.h b/src/render/blend2d_renderer.h index b65f0ffa..ad2a1cee 100644 --- a/src/render/blend2d_renderer.h +++ b/src/render/blend2d_renderer.h @@ -66,6 +66,8 @@ namespace pdflib render_config config_; mutable BLImage image_; // internal canvas (PRGB32 format) + mutable BLContext context_; + mutable bool context_active_ = false; std::array shape_; // {height, width, 4} double scale_x_ = 1.0; // pdf-to-canvas scale along x double scale_y_ = 1.0; // pdf-to-canvas scale along y @@ -82,6 +84,9 @@ namespace pdflib // Cache: cache_key → loaded BLFontFace. std::unordered_map font_cache_; + BLContext& page_context(); + void finish_page_context() const; + // Convert PDF coordinates (origin at crop_bbox bottom-left, y-up) to // canvas coordinates (origin top-left, y-down), applying scale. double canvas_x(double pdf_x) const { return (pdf_x - origin_x_) * scale_x_; } @@ -443,12 +448,55 @@ namespace pdflib : config_(config), shape_({0, 0, 4}) {} + inline BLContext& renderer::page_context() + { + if (context_active_) + { + return context_; + } + + if (shape_[0] == 0 or shape_[1] == 0) + { + throw std::runtime_error("renderer::page_context: canvas is empty"); + } + + const BLResult err = context_.begin(image_); + if (err != BL_SUCCESS) + { + throw std::runtime_error( + "renderer::page_context: failed to begin Blend2D context " + "(BLResult=" + std::to_string(err) + ")"); + } + + context_active_ = true; + return context_; + } + + inline void renderer::finish_page_context() const + { + if (not context_active_) + { + return; + } + + const BLResult err = context_.end(); + context_active_ = false; + if (err != BL_SUCCESS) + { + throw std::runtime_error( + "renderer::finish_page_context: failed to end Blend2D context " + "(BLResult=" + std::to_string(err) + ")"); + } + } + // --------------------------------------------------------------------------- // set_size // --------------------------------------------------------------------------- inline void renderer::set_size(size_instruction& instr) { + finish_page_context(); + const auto& bbox = instr.crop_bbox; const int pdf_w = bbox[2] - bbox[0]; const int pdf_h = bbox[3] - bbox[1]; @@ -473,11 +521,19 @@ namespace pdflib image_.create(width, height, BL_FORMAT_PRGB32); // Initialise canvas to opaque white. - BLContext ctx(image_); - ctx.set_comp_op(BL_COMP_OP_SRC_COPY); - ctx.set_fill_style(BLRgba32(0xFFFFFFFFu)); - ctx.fill_all(); - ctx.end(); + const BLResult ctx_res = context_.begin(image_); + if (ctx_res != BL_SUCCESS) + { + throw std::runtime_error( + "renderer::set_size: failed to begin Blend2D context " + "(BLResult=" + std::to_string(ctx_res) + ")"); + } + context_active_ = true; + + context_.set_comp_op(BL_COMP_OP_SRC_COPY); + context_.set_fill_style(BLRgba32(0xFFFFFFFFu)); + context_.fill_all(); + context_.set_comp_op(BL_COMP_OP_SRC_OVER); } // --------------------------------------------------------------------------- @@ -627,7 +683,7 @@ namespace pdflib bbox_path.line_to(x3, y3); bbox_path.close(); - BLContext ctx(image_); + BLContext& ctx = page_context(); LOG_S(INFO) << "writing text: `" << instr.get_text() << "`, size: " << size << ""; if (face.is_valid() and size > 0.5) @@ -655,8 +711,6 @@ namespace pdflib ctx.set_stroke_width(0.5); ctx.stroke_path(bbox_path); } - - ctx.end(); } // --------------------------------------------------------------------------- @@ -723,7 +777,7 @@ namespace pdflib << " font_name=`" << instr.get_font_name() << "`" << " base_font=`" << instr.get_base_font() << "`"; - BLContext ctx(image_); + BLContext& ctx = page_context(); if (face.is_valid() and size > 0.5) { @@ -800,8 +854,6 @@ namespace pdflib ctx.set_stroke_width(0.5); ctx.stroke_path(bbox_path); } - - ctx.end(); } // --------------------------------------------------------------------------- @@ -867,9 +919,7 @@ namespace pdflib << " font_name=`" << instr.get_font_name() << "`" << " base_font=`" << instr.get_base_font() << "`"; - LOG_S(INFO) << "render_text: before BLContext construction"; - BLContext ctx(image_); - LOG_S(INFO) << "render_text: after BLContext construction"; + BLContext& ctx = page_context(); auto draw_bbox_fallback = [&]() { @@ -901,7 +951,6 @@ namespace pdflib << " font_name=`" << instr.get_font_name() << "`" << " base_font=`" << instr.get_base_font() << "`"; draw_bbox_fallback(); - ctx.end(); return; } @@ -931,7 +980,6 @@ namespace pdflib LOG_S(WARNING) << "render_text: apply_transform failed" << " (BLResult=" << transform_res << ")"; draw_bbox_fallback(); - ctx.end(); return; } LOG_S(INFO) << "render_text: before set_fill_style"; @@ -951,7 +999,6 @@ namespace pdflib << " base_font=`" << instr.get_base_font() << "`"; ctx.restore(); draw_bbox_fallback(); - ctx.end(); return; } const auto* placement_data = gb.placement_data(); @@ -960,7 +1007,6 @@ namespace pdflib LOG_S(WARNING) << "render_text: glyph placement data is null, using fallback"; ctx.restore(); draw_bbox_fallback(); - ctx.end(); return; } BLPoint draw_origin(0.0, 0.0); @@ -1067,7 +1113,6 @@ namespace pdflib << " (BLResult=" << translate_res << ")"; ctx.restore(); draw_bbox_fallback(); - ctx.end(); return; } const BLResult scale_res = ctx.scale(bbox_fit_scale); @@ -1077,7 +1122,6 @@ namespace pdflib << " (BLResult=" << scale_res << ")"; ctx.restore(); draw_bbox_fallback(); - ctx.end(); return; } draw_origin.reset(0.0, 0.0); @@ -1094,7 +1138,6 @@ namespace pdflib << " (BLResult=" << text_res << ")" << " text=`" << instr.get_text() << "`"; draw_bbox_fallback(); - ctx.end(); return; } @@ -1122,8 +1165,6 @@ namespace pdflib ctx.set_stroke_width(0.5); ctx.stroke_path(bbox_path); } - - ctx.end(); } // --------------------------------------------------------------------------- @@ -1169,7 +1210,7 @@ namespace pdflib const auto fmt = instr.get_pixel_format(); const auto fill_rgb = instr.get_rgb_filling(); - BLContext ctx(image_); + BLContext& ctx = page_context(); const bool axis_aligned = is_axis_aligned(q); int quarter_turns = -1; const bool right_angle = is_right_angle_rotation(q, quarter_turns); @@ -1198,7 +1239,6 @@ namespace pdflib << " has_data=" << (instr.has_data() ? "true" : "false") << " — drawing semi-transparent yellow placeholder"; render_bitmap_placeholder(ctx, q, axis_aligned); - ctx.end(); return; } @@ -1211,7 +1251,6 @@ namespace pdflib << ") for shape " << sh << "x" << sw << "x" << sc << " — drawing placeholder"; render_bitmap_placeholder(ctx, q, axis_aligned); - ctx.end(); return; } @@ -1336,7 +1375,6 @@ namespace pdflib << ", quarter_turns=" << quarter_turns << ")"; render_bitmap_affine(ctx, src_img, q, sw, sh); } - ctx.end(); } // --------------------------------------------------------------------------- @@ -1359,13 +1397,12 @@ namespace pdflib path.line_to(canvas_x(instr.get_r_x3()), canvas_y(instr.get_r_y3())); path.close(); - BLContext ctx(image_); + BLContext& ctx = page_context(); ctx.set_fill_style(BLRgba32(0x660099FFu)); // A=40%, light blue ctx.fill_path(path); ctx.set_stroke_style(BLRgba32(0xFF0099FFu)); // A=100%, blue border ctx.set_stroke_width(1); ctx.stroke_path(path); - ctx.end(); } // --------------------------------------------------------------------------- @@ -1403,11 +1440,10 @@ namespace pdflib (static_cast(rgb[1]) << 8) | static_cast(rgb[2]); - BLContext ctx(image_); + BLContext& ctx = page_context(); ctx.set_stroke_style(BLRgba32(stroke_color)); ctx.set_stroke_width(1); ctx.stroke_path(path); - ctx.end(); } // --------------------------------------------------------------------------- @@ -1428,6 +1464,8 @@ namespace pdflib return std::make_shared>(); } + finish_page_context(); + BLImageData img_data; image_.get_data(&img_data); @@ -1469,6 +1507,8 @@ namespace pdflib throw std::runtime_error("renderer::save: canvas is empty"); } + finish_page_context(); + const BLResult err = image_.write_to_file(path.c_str()); if (err != BL_SUCCESS) { From 063a6d2c69f3e310c293e43ba1eaae9480886b30 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Fri, 29 May 2026 17:46:03 +0200 Subject: [PATCH 3/9] cleaned up the configs (compute C++ and page materialization) Signed-off-by: Peter Staar --- app/pybind_parse.cpp | 2 - app/run_scaling.cpp | 3 - docling_parse/pdf_parser.py | 136 ++++++++++++++++---- perf/run_scaling.py | 237 +++++++++++++++++++++++++++++++---- src/parse/config.h | 8 +- tests/test_parse.py | 71 ++++++----- tests/test_threaded_parse.py | 21 ++-- 7 files changed, 381 insertions(+), 97 deletions(-) diff --git a/app/pybind_parse.cpp b/app/pybind_parse.cpp index 99a4c5bd..a1bffcfc 100644 --- a/app/pybind_parse.cpp +++ b/app/pybind_parse.cpp @@ -276,7 +276,6 @@ PYBIND11_MODULE(pdf_parsers, m) { max_num_bitmaps (int): Maximum number of bitmaps to keep (-1 means no cap) [default=-1]. keep_glyphs (bool): If true, keep GLYPH<...> fallback strings in output; if false, replace them with a space [default=false]. keep_qpdf_warnings (bool): If true, QPDF warnings are emitted; if false, they are suppressed [default=false]. - materialize_bitmap_bytes (bool): If true (default), bitmap byte data is extracted and embedded in BitmapResource objects. If false, only bitmap geometry (rectangles) is preserved and image bytes are skipped. Consumed by the Python layer only; has no effect in C++ [default=true]. )") .def(pybind11::init<>()) .def_readwrite("page_boundary", &pdflib::decode_config::page_boundary) @@ -297,7 +296,6 @@ PYBIND11_MODULE(pdf_parsers, m) { .def_readwrite("release_native_memory_every_n_pages", &pdflib::decode_config::release_native_memory_every_n_pages) .def_readwrite("keep_glyphs", &pdflib::decode_config::keep_glyphs) .def_readwrite("keep_qpdf_warnings", &pdflib::decode_config::keep_qpdf_warnings) - .def_readwrite("materialize_bitmap_bytes", &pdflib::decode_config::materialize_bitmap_bytes) .def("__copy__", [](const pdflib::decode_config& self) { return self; }) .def("__deepcopy__", [](const pdflib::decode_config& self, pybind11::dict) { return self; }); diff --git a/app/run_scaling.cpp b/app/run_scaling.cpp index e4caf34f..6468a3cd 100644 --- a/app/run_scaling.cpp +++ b/app/run_scaling.cpp @@ -295,7 +295,6 @@ namespace config.release_native_memory_every_n_pages = 0; config.keep_glyphs = false; config.keep_qpdf_warnings = false; - config.materialize_bitmap_bytes = false; return config; } @@ -833,7 +832,6 @@ namespace ("line-space-factor-with-space", "Space-width factor for line merging with space", cxxopts::value()) ("keep-glyphs", "Keep unmapped GLYPH<...> tokens", cxxopts::value()) ("keep-qpdf-warnings", "Emit QPDF warnings", cxxopts::value()) - ("materialize-bitmap-bytes", "Print-only parity with Python config; C++ ignores it", cxxopts::value()) ("h,help", "Print usage"); options.parse_positional({"input"}); @@ -901,7 +899,6 @@ namespace if(result.count("line-space-factor-with-space")) { decode_config.line_space_width_factor_for_merge_with_space = result["line-space-factor-with-space"].as(); } if(result.count("keep-glyphs")) { decode_config.keep_glyphs = parse_bool(result["keep-glyphs"].as()); } if(result.count("keep-qpdf-warnings")) { decode_config.keep_qpdf_warnings = parse_bool(result["keep-qpdf-warnings"].as()); } - if(result.count("materialize-bitmap-bytes")) { decode_config.materialize_bitmap_bytes = parse_bool(result["materialize-bitmap-bytes"].as()); } decode_config.do_thread_safe = true; return cli; diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py index 0cdaeaed..9cc5a66b 100644 --- a/docling_parse/pdf_parser.py +++ b/docling_parse/pdf_parser.py @@ -201,6 +201,35 @@ class PageRenderTimings(PageDecodeTimings): render_page_s: float = 0.0 +class PageMaterializationConfig(BaseModel): + """Controls which native page data is materialized into SegmentedPdfPage.""" + + model_config = ConfigDict(validate_assignment=True) + + materialize_char_cells: bool = True + materialize_word_cells: bool = True + materialize_line_cells: bool = True + materialize_shapes: bool = True + materialize_bitmaps: bool = True + materialize_bitmap_bytes: bool = True + + @classmethod + def from_decode_config( + cls, decode_config: DecodePageConfig + ) -> "PageMaterializationConfig": + return cls() + + def cache_key(self) -> tuple[bool, bool, bool, bool, bool, bool]: + return ( + self.materialize_char_cells, + self.materialize_word_cells, + self.materialize_line_cells, + self.materialize_shapes, + self.materialize_bitmaps, + self.materialize_bitmap_bytes, + ) + + def _page_timings_from_raw(raw_timings) -> PageDecodeTimings | PageRenderTimings: data = { "make_page_decoder_s": raw_timings.make_page_decoder_s, @@ -473,10 +502,17 @@ def _to_bitmap_resources_from_decoder( def segmented_page_from_decoder( page_decoder: PdfPageDecoder, boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX, - materialize_bitmap_bytes: bool = True, + materialization_config: PageMaterializationConfig | None = None, ) -> SegmentedPdfPage: """Convert a C++ PdfPageDecoder to a SegmentedPdfPage.""" - char_cells = _to_cells_from_decoder(page_decoder.get_char_cells()) + if materialization_config is None: + materialization_config = PageMaterializationConfig() + + char_cells = ( + _to_cells_from_decoder(page_decoder.get_char_cells()) + if materialization_config.materialize_char_cells + else [] + ) segmented_page = SegmentedPdfPage( dimension=_to_page_geometry_from_decoder( @@ -486,22 +522,30 @@ def segmented_page_from_decoder( word_cells=[], textline_cells=[], has_chars=len(char_cells) > 0, - bitmap_resources=_to_bitmap_resources_from_decoder( - page_decoder.get_page_images(), - materialize_bitmap_bytes=materialize_bitmap_bytes, + bitmap_resources=( + _to_bitmap_resources_from_decoder( + page_decoder.get_page_images(), + materialize_bitmap_bytes=materialization_config.materialize_bitmap_bytes, + ) + if materialization_config.materialize_bitmaps + else [] + ), + shapes=( + _to_shapes_from_decoder(page_decoder.get_page_shapes()) + if materialization_config.materialize_shapes + else [] ), - shapes=_to_shapes_from_decoder(page_decoder.get_page_shapes()), widgets=_to_widgets_from_decoder(page_decoder.get_page_widgets()), hyperlinks=_to_hyperlinks_from_decoder(page_decoder.get_page_hyperlinks()), ) - if page_decoder.has_word_cells(): + if materialization_config.materialize_word_cells and page_decoder.has_word_cells(): segmented_page.word_cells = _to_cells_from_decoder( page_decoder.get_word_cells() ) segmented_page.has_words = len(segmented_page.word_cells) > 0 - if page_decoder.has_line_cells(): + if materialization_config.materialize_line_cells and page_decoder.has_line_cells(): segmented_page.textline_cells = _to_cells_from_decoder( page_decoder.get_line_cells() ) @@ -535,7 +579,9 @@ def __init__( self._parser: pdf_parser = parser self._key = key self._boundary_type = boundary_type - self._pages: Dict[tuple[int, bool], SegmentedPdfPage] = {} + self._pages: Dict[ + tuple[int, tuple[bool, bool, bool, bool, bool, bool]], SegmentedPdfPage + ] = {} self._toc: PdfTableOfContents | None = None self._meta: PdfMetaData | None = None self._annotations: PdfAnnotations | None = None @@ -618,6 +664,7 @@ def iterate_pages( self, *, config: DecodePageConfig | None = None, + materialization_config: PageMaterializationConfig | None = None, ) -> Iterator[Tuple[int, SegmentedPdfPage]]: if config is None: config = self._default_config() @@ -627,6 +674,7 @@ def iterate_pages( self.get_page( page_no + 1, config=config, + materialization_config=materialization_config, ), ) @@ -692,17 +740,23 @@ def get_page( page_no: int, *, config: DecodePageConfig | None = None, + materialization_config: PageMaterializationConfig | None = None, ) -> SegmentedPdfPage: """Get page using typed API (zero-copy from C++).""" if config is None: config = self._default_config() - return self._get_page_typed(page_no, config=config) + if materialization_config is None: + materialization_config = PageMaterializationConfig.from_decode_config(config) + return self._get_page_typed( + page_no, config=config, materialization_config=materialization_config + ) def get_page_with_timings( self, page_no: int, *, config: DecodePageConfig | None = None, + materialization_config: PageMaterializationConfig | None = None, ) -> Tuple[SegmentedPdfPage, Timings]: """Get page along with timing information. @@ -720,6 +774,8 @@ def get_page_with_timings( """ if config is None: config = self._default_config() + if materialization_config is None: + materialization_config = PageMaterializationConfig.from_decode_config(config) if not (1 <= page_no <= self.number_of_pages()): raise ValueError( @@ -727,13 +783,16 @@ def get_page_with_timings( f"(min:1, max:{self.number_of_pages()})" ) - return self._get_page_with_timings_typed(page_no, config=config) + return self._get_page_with_timings_typed( + page_no, config=config, materialization_config=materialization_config + ) def _get_page_with_timings_typed( self, page_no: int, *, config: DecodePageConfig, + materialization_config: PageMaterializationConfig, ) -> Tuple[SegmentedPdfPage, Timings]: """Get page with timings using typed API.""" page_decoder = self._parser.get_page_decoder( @@ -747,7 +806,7 @@ def _get_page_with_timings_typed( segmented_page = self._to_segmented_page_from_decoder( page_decoder=page_decoder, - materialize_bitmap_bytes=config.materialize_bitmap_bytes, + materialization_config=materialization_config, ) # Get timings from the page decoder @@ -757,11 +816,21 @@ def _get_page_with_timings_typed( return segmented_page, timings - def load_all_pages(self, config: DecodePageConfig | None = None): + def load_all_pages( + self, + config: DecodePageConfig | None = None, + materialization_config: PageMaterializationConfig | None = None, + ): if config is None: config = self._default_config() + if materialization_config is None: + materialization_config = PageMaterializationConfig.from_decode_config(config) for page_no in range(1, self.number_of_pages() + 1): - self.get_page(page_no, config=config) + self.get_page( + page_no, + config=config, + materialization_config=materialization_config, + ) def _to_page_geometry_from_decoder(self, page_dim) -> PdfPageGeometry: """Convert typed PdfPageDimension to PdfPageGeometry.""" @@ -799,13 +868,13 @@ def _to_bitmap_resources_from_decoder( def _to_segmented_page_from_decoder( self, page_decoder, - materialize_bitmap_bytes: bool = True, + materialization_config: PageMaterializationConfig | None = None, ) -> SegmentedPdfPage: """Convert typed PdfPageDecoder to SegmentedPdfPage (zero-copy path).""" return segmented_page_from_decoder( page_decoder=page_decoder, boundary_type=self._boundary_type, - materialize_bitmap_bytes=materialize_bitmap_bytes, + materialization_config=materialization_config, ) def _get_page_typed( @@ -813,6 +882,7 @@ def _get_page_typed( page_no: int, *, config: DecodePageConfig, + materialization_config: PageMaterializationConfig, ) -> SegmentedPdfPage: """Get page using typed API (zero-copy from C++, faster than get_page). @@ -826,7 +896,7 @@ def _get_page_typed( Returns: SegmentedPdfPage with the parsed page data. """ - cache_key = (page_no, config.materialize_bitmap_bytes) + cache_key = (page_no, materialization_config.cache_key()) if cache_key in self._pages: return self._pages[cache_key] @@ -842,7 +912,7 @@ def _get_page_typed( self._pages[cache_key] = self._to_segmented_page_from_decoder( page_decoder=page_decoder, - materialize_bitmap_bytes=config.materialize_bitmap_bytes, + materialization_config=materialization_config, ) return self._pages[cache_key] @@ -967,6 +1037,7 @@ class ThreadedPdfParserConfig(BaseModel): max_concurrent_results: int = 32 boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX render_config: RenderConfig | None = None + page_materialization_config: PageMaterializationConfig | None = None class PageParseResult: @@ -979,12 +1050,16 @@ def __init__( boundary_type: PdfPageBoundaryType, render_config: RenderConfig | None, decode_config: DecodePageConfig, + materialization_config: PageMaterializationConfig, ): self._raw = raw_result self._boundary_type = boundary_type self._render_config = render_config self._decode_config = decode_config - self._page: SegmentedPdfPage | None = None + self._materialization_config = materialization_config + self._pages: Dict[ + tuple[bool, bool, bool, bool, bool, bool], SegmentedPdfPage + ] = {} self._page_decoder: PdfPageDecoder | None = None self._default_image: PILImage.Image | None = None @@ -1025,15 +1100,22 @@ def _require_page_decoder(self) -> PdfPageDecoder: assert self._page_decoder is not None return self._page_decoder - def get_page(self) -> SegmentedPdfPage: + def get_page( + self, + materialization_config: PageMaterializationConfig | None = None, + ) -> SegmentedPdfPage: """Return the parsed page, converting lazily on first access.""" - if self._page is None: - self._page = segmented_page_from_decoder( + if materialization_config is None: + materialization_config = self._materialization_config + + cache_key = materialization_config.cache_key() + if cache_key not in self._pages: + self._pages[cache_key] = segmented_page_from_decoder( page_decoder=self._require_page_decoder(), boundary_type=self._boundary_type, - materialize_bitmap_bytes=self._decode_config.materialize_bitmap_bytes, + materialization_config=materialization_config, ) - return self._page + return self._pages[cache_key] def get_timings(self) -> Timings: """Return structured timing data for this page parse.""" @@ -1229,6 +1311,11 @@ def __init__( else DecodePageConfig() ) self._decode_config.page_boundary = parser_config.boundary_type.value + self._materialization_config = ( + parser_config.page_materialization_config + if parser_config.page_materialization_config is not None + else PageMaterializationConfig.from_decode_config(self._decode_config) + ) self._page_counts: Dict[str, int] = {} self._scheduled_page_counts: Dict[str, int] = {} @@ -1354,4 +1441,5 @@ def get_task(self) -> "PageParseResult": boundary_type=self._parser_config.boundary_type, render_config=self._parser_config.render_config, decode_config=self._decode_config, + materialization_config=self._materialization_config, ) diff --git a/perf/run_scaling.py b/perf/run_scaling.py index 723c7d32..99ea534e 100644 --- a/perf/run_scaling.py +++ b/perf/run_scaling.py @@ -23,6 +23,9 @@ python perf/run_scaling.py # HF default, render mode, pypdfium2 python perf/run_scaling.py ./pdfs --mode parse python perf/run_scaling.py --mode both --other "pypdfium2;pymupdf" + python perf/run_scaling.py ./pdfs --mode render --keep-char-cells=true \ + --create-word-cells=true --create-line-cells=true \ + --keep-shapes=true --keep-bitmaps=true """ from __future__ import annotations @@ -146,29 +149,101 @@ def apply_max_pages( # -------- Decode config helper -------- -def _decode_config(): +def _str_to_bool(value: str | bool) -> bool: + if isinstance(value, bool): + return value + normalized = value.strip().lower() + if normalized in {"1", "true", "t", "yes", "y", "on"}: + return True + if normalized in {"0", "false", "f", "no", "n", "off"}: + return False + raise argparse.ArgumentTypeError( + f"expected a boolean value, got {value!r}; use true or false" + ) + + +def _add_bool_value_arg( + ap: argparse.ArgumentParser, + name: str, + *, + default: bool, + help: str, +) -> None: + ap.add_argument( + f"--{name}", + type=_str_to_bool, + default=default, + metavar="{true,false}", + help=f"{help} (default: {str(default).lower()})", + ) + + +def _decode_options_from_args(args: argparse.Namespace) -> dict[str, bool]: + return { + "keep_char_cells": args.keep_char_cells, + "keep_shapes": args.keep_shapes, + "keep_bitmaps": args.keep_bitmaps, + "create_word_cells": args.create_word_cells, + "create_line_cells": args.create_line_cells, + } + + +def _materialization_options_from_args(args: argparse.Namespace) -> dict[str, bool]: + return { + "materialize_char_cells": args.materialize_char_cells, + "materialize_word_cells": args.materialize_word_cells, + "materialize_line_cells": args.materialize_line_cells, + "materialize_shapes": args.materialize_shapes, + "materialize_bitmaps": args.materialize_bitmaps, + "materialize_bitmap_bytes": args.materialize_bitmap_bytes, + } + + +def _materializes_page_data(materialization_options: dict[str, bool]) -> bool: + return any( + materialization_options[name] + for name in ( + "materialize_char_cells", + "materialize_word_cells", + "materialize_line_cells", + "materialize_shapes", + "materialize_bitmaps", + ) + ) + + +def _decode_config(decode_options: dict[str, bool]): from docling_parse.pdf_parsers import DecodePageConfig # type: ignore[import] c = DecodePageConfig() - c.keep_char_cells = True - c.keep_shapes = False - # c.keep_bitmaps = True - c.keep_bitmaps = False - c.materialize_bitmap_bytes = False - c.create_word_cells = False - # c.create_line_cells = True - c.create_line_cells = False + c.keep_char_cells = decode_options["keep_char_cells"] + c.keep_shapes = decode_options["keep_shapes"] + c.keep_bitmaps = decode_options["keep_bitmaps"] + c.create_word_cells = decode_options["create_word_cells"] + c.create_line_cells = decode_options["create_line_cells"] return c +def _materialization_config(materialization_options: dict[str, bool]): + from docling_parse.pdf_parser import PageMaterializationConfig + + return PageMaterializationConfig(**materialization_options) + + def _config_rows(config, fields: List[str]) -> List[List[str]]: return [[field, getattr(config, field)] for field in fields] -def _print_run_configs(*, render: bool, scale: float) -> None: +def _print_run_configs( + *, + render: bool, + scale: float, + decode_options: dict[str, bool], + materialization_options: dict[str, bool], +) -> None: from docling_parse.pdf_parsers import RenderConfig # type: ignore[import] - decode_config = _decode_config() + decode_config = _decode_config(decode_options) decode_fields = [ "page_boundary", "do_sanitization", @@ -188,7 +263,6 @@ def _print_run_configs(*, render: bool, scale: float) -> None: "release_native_memory_every_n_pages", "keep_glyphs", "keep_qpdf_warnings", - "materialize_bitmap_bytes", ] print("Decode config:") print( @@ -199,6 +273,24 @@ def _print_run_configs(*, render: bool, scale: float) -> None: ) print() + materialization_config = _materialization_config(materialization_options) + materialization_fields = [ + "materialize_char_cells", + "materialize_word_cells", + "materialize_line_cells", + "materialize_shapes", + "materialize_bitmaps", + "materialize_bitmap_bytes", + ] + print("Materialization config:") + print( + tabulate( + _config_rows(materialization_config, materialization_fields), + headers=["parameter", "value"], + ) + ) + print() + print("Render config:") if not render: print(tabulate([["enabled", False]], headers=["parameter", "value"])) @@ -274,12 +366,17 @@ def _timing_csv_row( # -------- Baselines -------- -def run_sequential_parse(pdf_schedule: List[Tuple[Path, List[int] | None]]) -> float: +def run_sequential_parse( + pdf_schedule: List[Tuple[Path, List[int] | None]], + decode_options: dict[str, bool], + materialization_options: dict[str, bool], +) -> float: """Sequential DoclingPdfParser decode (no render). Returns wall time in seconds.""" from docling_parse.pdf_parser import DoclingPdfParser - config = _decode_config() + config = _decode_config(decode_options) config.do_thread_safe = False # no need for isolated QPDF per page + materialization_config = _materialization_config(materialization_options) parser = DoclingPdfParser(loglevel="fatal") @@ -290,11 +387,18 @@ def run_sequential_parse(pdf_schedule: List[Tuple[Path, List[int] | None]]) -> f try: doc = parser.load(str(pdf_path), lazy=True) if page_numbers is None: - for _, _ in doc.iterate_pages(config=config): + for _, _ in doc.iterate_pages( + config=config, + materialization_config=materialization_config, + ): pass else: for page_number in page_numbers: - _ = doc.get_page(page_number, config=config) + _ = doc.get_page( + page_number, + config=config, + materialization_config=materialization_config, + ) doc.unload() except Exception as e: print(f" sequential error on {pdf_path}: {e}") @@ -532,17 +636,22 @@ def run_threaded( *, render: bool, scale: float, + decode_options: dict[str, bool], + materialization_options: dict[str, bool], enable_timing: bool, timing_csv: Path, ) -> float: """Run DoclingThreadedPdfParser; render=True enables rasterisation.""" from docling_parse.pdf_parser import ( DoclingThreadedPdfParser, + PageMaterializationConfig, ThreadedPdfParserConfig, ) from docling_parse.pdf_parsers import RenderConfig # type: ignore[import] - decode_config = _decode_config() + decode_config = _decode_config(decode_options) + materialization_config = PageMaterializationConfig(**materialization_options) + materialize_page = _materializes_page_data(materialization_options) render_config = None if render: @@ -554,6 +663,7 @@ def run_threaded( threads=num_threads, max_concurrent_results=max_concurrent_results, render_config=render_config, + page_materialization_config=materialization_config, ) parser = DoclingThreadedPdfParser( @@ -591,7 +701,8 @@ def run_threaded( if result.success: if render: img: PILImage = result.get_image() - page: SegmentedPdfPage = result.get_page() + if materialize_page: + page: SegmentedPdfPage = result.get_page() """ assert len(page.shapes)==0, "len(page.shapes)==0" @@ -601,7 +712,8 @@ def run_threaded( assert br.image==None """ else: - page = result.get_page() + if materialize_page: + page = result.get_page() else: errors += 1 @@ -700,6 +812,8 @@ def _run_one_mode( *, render: bool, scale: float, + decode_options: dict[str, bool], + materialization_options: dict[str, bool], enable_timing: bool, timing_csv: Path, ) -> Tuple[List[Tuple[str, float]], List[Tuple[int, float]]]: @@ -709,7 +823,11 @@ def _run_one_mode( # (DoclingPdfParser has no rendering path). if not render: print("Running sequential (DoclingPdfParser) ...") - t = run_sequential_parse(pdf_schedule) + t = run_sequential_parse( + pdf_schedule, + decode_options, + materialization_options, + ) print(f" sequential: {t:.3f}s") baselines.append(("sequential docling (1t)", t)) print() @@ -734,6 +852,8 @@ def _run_one_mode( total_pages=total_pages, render=render, scale=scale, + decode_options=decode_options, + materialization_options=materialization_options, enable_timing=enable_timing, timing_csv=timing_csv, ) @@ -786,6 +906,72 @@ def main(argv: List[str]) -> int: "--scale", type=float, default=1.0, help="Render scale for rendering (default: 1.0; render/both modes only)", ) + _add_bool_value_arg( + ap, + "keep-char-cells", + default=True, + help="Populate character cells and emit text render instructions", + ) + _add_bool_value_arg( + ap, + "create-word-cells", + default=False, + help="Create word cells during decoding", + ) + _add_bool_value_arg( + ap, + "create-line-cells", + default=False, + help="Create line cells during decoding", + ) + _add_bool_value_arg( + ap, + "keep-shapes", + default=False, + help="Keep vector shape cells", + ) + _add_bool_value_arg( + ap, + "keep-bitmaps", + default=False, + help="Keep bitmap resources/cells", + ) + _add_bool_value_arg( + ap, + "materialize-char-cells", + default=False, + help="Materialize character cells into SegmentedPdfPage", + ) + _add_bool_value_arg( + ap, + "materialize-word-cells", + default=False, + help="Materialize word cells into SegmentedPdfPage", + ) + _add_bool_value_arg( + ap, + "materialize-line-cells", + default=False, + help="Materialize line cells into SegmentedPdfPage", + ) + _add_bool_value_arg( + ap, + "materialize-shapes", + default=False, + help="Materialize vector shapes into SegmentedPdfPage", + ) + _add_bool_value_arg( + ap, + "materialize-bitmaps", + default=False, + help="Materialize bitmap locations into SegmentedPdfPage", + ) + _add_bool_value_arg( + ap, + "materialize-bitmap-bytes", + default=False, + help="Materialize bitmap image bytes when bitmap locations are materialized", + ) ap.add_argument( "--other", type=str, @@ -814,6 +1000,8 @@ def main(argv: List[str]) -> int: # Validate CLI args before doing any I/O (HF download, page counting). thread_counts = [int(x.strip()) for x in args.threads.split(",")] other_backends = parse_other_arg(args.other) + decode_options = _decode_options_from_args(args) + materialization_options = _materialization_options_from_args(args) pdfs = resolve_pdf_inputs(args.input, recursive=args.recursive) if not pdfs: @@ -833,7 +1021,12 @@ def main(argv: List[str]) -> int: if args.mode in ("render", "both"): print(f"Render scale: {args.scale}") print() - _print_run_configs(render=args.mode in ("render", "both"), scale=args.scale) + _print_run_configs( + render=args.mode in ("render", "both"), + scale=args.scale, + decode_options=decode_options, + materialization_options=materialization_options, + ) print() modes_to_run = ["parse", "render"] if args.mode == "both" else [args.mode] @@ -849,6 +1042,8 @@ def main(argv: List[str]) -> int: other_backends, render=render, scale=args.scale, + decode_options=decode_options, + materialization_options=materialization_options, enable_timing=args.enable_timing, timing_csv=args.timing_csv, ) diff --git a/src/parse/config.h b/src/parse/config.h index b7a576d2..b010a86e 100644 --- a/src/parse/config.h +++ b/src/parse/config.h @@ -43,9 +43,6 @@ namespace pdflib bool keep_glyphs = false; bool keep_qpdf_warnings = false; - // consumed by Python layer only; C++ ignores this field - bool materialize_bitmap_bytes = true; - nlohmann::json to_json() const; void from_json(const nlohmann::json& j); @@ -86,7 +83,6 @@ namespace pdflib j["keep_glyphs"] = keep_glyphs; j["keep_qpdf_warnings"] = keep_qpdf_warnings; - j["materialize_bitmap_bytes"] = materialize_bitmap_bytes; return j; } @@ -120,7 +116,6 @@ namespace pdflib if(j.count("keep_glyphs")) { keep_glyphs = j["keep_glyphs"]; } if(j.count("keep_qpdf_warnings")) { keep_qpdf_warnings = j["keep_qpdf_warnings"]; } - if(j.count("materialize_bitmap_bytes")) { materialize_bitmap_bytes = j["materialize_bitmap_bytes"]; } } bool decode_config::load(const std::string& filename) @@ -174,8 +169,7 @@ namespace pdflib << std::setw(48) << "populate_json_objects" << (populate_json_objects ? "true" : "false") << "\n" << std::setw(48) << "release_native_memory_every_n_pages" << release_native_memory_every_n_pages << "\n" << std::setw(48) << "keep_glyphs" << (keep_glyphs ? "true" : "false") << "\n" - << std::setw(48) << "keep_qpdf_warnings" << (keep_qpdf_warnings ? "true" : "false") << "\n" - << std::setw(48) << "materialize_bitmap_bytes" << (materialize_bitmap_bytes ? "true" : "false") << "\n"; + << std::setw(48) << "keep_qpdf_warnings" << (keep_qpdf_warnings ? "true" : "false") << "\n"; return ss.str(); } diff --git a/tests/test_parse.py b/tests/test_parse.py index 6aab041c..3b66d1ce 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -20,7 +20,12 @@ ) from pydantic import TypeAdapter -from docling_parse.pdf_parser import DecodePageConfig, DoclingPdfParser, PdfDocument +from docling_parse.pdf_parser import ( + DecodePageConfig, + DoclingPdfParser, + PageMaterializationConfig, + PdfDocument, +) GENERATE = False @@ -1032,18 +1037,20 @@ def _make_bitmap_config() -> DecodePageConfig: def test_bitmap_no_materialization_preserves_geometry(): - """bitmap_resources count and rects match regardless of materialize_bitmap_bytes.""" + """bitmap_resources count and rects match regardless of bitmap bytes.""" parser = DoclingPdfParser(loglevel="fatal") pdf_doc = parser.load(path_or_stream=BITMAP_PDF, lazy=True) - config_full = _make_bitmap_config() - config_full.materialize_bitmap_bytes = True - - config_geo = _make_bitmap_config() - config_geo.materialize_bitmap_bytes = False + config = _make_bitmap_config() + materialize_full = PageMaterializationConfig(materialize_bitmap_bytes=True) + materialize_geo = PageMaterializationConfig(materialize_bitmap_bytes=False) - page_full = pdf_doc.get_page(1, config=config_full) - page_geo = pdf_doc.get_page(1, config=config_geo) + page_full = pdf_doc.get_page( + 1, config=config, materialization_config=materialize_full + ) + page_geo = pdf_doc.get_page( + 1, config=config, materialization_config=materialize_geo + ) assert len(page_full.bitmap_resources) == len(page_geo.bitmap_resources), ( "bitmap count must match between full and geometry-only modes" @@ -1068,24 +1075,24 @@ def test_bitmap_no_materialization_preserves_geometry(): def test_bitmap_no_materialization_has_no_image(): - """materialize_bitmap_bytes=False produces placeholder resources with image=None.""" + """materialize_bitmap_bytes=False produces placeholders with image=None.""" from docling_core.types.doc.base import ImageRefMode parser = DoclingPdfParser(loglevel="fatal") pdf_doc = parser.load(path_or_stream=BITMAP_PDF, lazy=True) config = _make_bitmap_config() - config.materialize_bitmap_bytes = False + materialize_geo = PageMaterializationConfig(materialize_bitmap_bytes=False) - page = pdf_doc.get_page(1, config=config) + page = pdf_doc.get_page(1, config=config, materialization_config=materialize_geo) assert len(page.bitmap_resources) > 0, "test PDF must contain bitmaps" for bm in page.bitmap_resources: assert bm.image is None, ( - "image must be None when materialize_bitmap_bytes=False" + "image must be None when bitmap bytes are not materialized" ) assert bm.mode == ImageRefMode.PLACEHOLDER, ( - "mode must be PLACEHOLDER when materialize_bitmap_bytes=False" + "mode must be PLACEHOLDER when bitmap bytes are not materialized" ) @@ -1096,21 +1103,23 @@ def test_bitmap_materialization_cache_false_then_true(): parser = DoclingPdfParser(loglevel="fatal") pdf_doc = parser.load(path_or_stream=BITMAP_PDF, lazy=True) - config_geo = _make_bitmap_config() - config_geo.materialize_bitmap_bytes = False - - config_full = _make_bitmap_config() - config_full.materialize_bitmap_bytes = True + config = _make_bitmap_config() + materialize_geo = PageMaterializationConfig(materialize_bitmap_bytes=False) + materialize_full = PageMaterializationConfig(materialize_bitmap_bytes=True) - page_geo = pdf_doc.get_page(1, config=config_geo) - page_full = pdf_doc.get_page(1, config=config_full) + page_geo = pdf_doc.get_page( + 1, config=config, materialization_config=materialize_geo + ) + page_full = pdf_doc.get_page( + 1, config=config, materialization_config=materialize_full + ) for bm in page_geo.bitmap_resources: assert bm.image is None assert bm.mode == ImageRefMode.PLACEHOLDER assert any(bm.image is not None for bm in page_full.bitmap_resources), ( - "at least one bitmap should be embedded when materialize_bitmap_bytes=True" + "at least one bitmap should be embedded when bitmap bytes are materialized" ) pdf_doc.unload() @@ -1123,17 +1132,19 @@ def test_bitmap_materialization_cache_true_then_false(): parser = DoclingPdfParser(loglevel="fatal") pdf_doc = parser.load(path_or_stream=BITMAP_PDF, lazy=True) - config_full = _make_bitmap_config() - config_full.materialize_bitmap_bytes = True - - config_geo = _make_bitmap_config() - config_geo.materialize_bitmap_bytes = False + config = _make_bitmap_config() + materialize_full = PageMaterializationConfig(materialize_bitmap_bytes=True) + materialize_geo = PageMaterializationConfig(materialize_bitmap_bytes=False) - page_full = pdf_doc.get_page(1, config=config_full) - page_geo = pdf_doc.get_page(1, config=config_geo) + page_full = pdf_doc.get_page( + 1, config=config, materialization_config=materialize_full + ) + page_geo = pdf_doc.get_page( + 1, config=config, materialization_config=materialize_geo + ) assert any(bm.image is not None for bm in page_full.bitmap_resources), ( - "at least one bitmap should be embedded when materialize_bitmap_bytes=True" + "at least one bitmap should be embedded when bitmap bytes are materialized" ) for bm in page_geo.bitmap_resources: diff --git a/tests/test_threaded_parse.py b/tests/test_threaded_parse.py index befb9c20..0fb98e98 100644 --- a/tests/test_threaded_parse.py +++ b/tests/test_threaded_parse.py @@ -12,6 +12,7 @@ DecodePageConfig, DoclingPdfParser, DoclingThreadedPdfParser, + PageMaterializationConfig, ThreadedPdfParserConfig, ) from tests.test_parse import ( @@ -327,26 +328,26 @@ def test_threaded_bitmap_no_materialization_preserves_geometry(): """Threaded path: geometry matches between full and placeholder-only modes.""" from docling_core.types.doc.base import ImageRefMode - config_full = _make_bitmap_config() - config_full.materialize_bitmap_bytes = True + config = _make_bitmap_config() + materialize_full = PageMaterializationConfig(materialize_bitmap_bytes=True) + materialize_geo = PageMaterializationConfig(materialize_bitmap_bytes=False) - config_geo = _make_bitmap_config() - config_geo.materialize_bitmap_bytes = False - - def _get_page1(decode_config: DecodePageConfig) -> "SegmentedPdfPage": + def _get_page1( + materialization_config: PageMaterializationConfig, + ) -> "SegmentedPdfPage": parser = DoclingThreadedPdfParser( parser_config=ThreadedPdfParserConfig(loglevel="fatal", threads=2), - decode_config=decode_config, + decode_config=config, ) parser.load(BITMAP_PDF) return next( - r.get_page() + r.get_page(materialization_config) for r in parser.iterate_results() if r.success and r.page_number == 1 ) - page_full = _get_page1(config_full) - page_geo = _get_page1(config_geo) + page_full = _get_page1(materialize_full) + page_geo = _get_page1(materialize_geo) assert len(page_full.bitmap_resources) > 0, "test PDF must contain bitmaps" assert len(page_full.bitmap_resources) == len(page_geo.bitmap_resources) From fef9d8d58e52e4d04e36ec84397c94c61c0d48d0 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Fri, 29 May 2026 17:54:28 +0200 Subject: [PATCH 4/9] ran pre-commit Signed-off-by: Peter Staar --- docling_parse/pdf_parser.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py index 9cc5a66b..e7bcc9d4 100644 --- a/docling_parse/pdf_parser.py +++ b/docling_parse/pdf_parser.py @@ -746,7 +746,9 @@ def get_page( if config is None: config = self._default_config() if materialization_config is None: - materialization_config = PageMaterializationConfig.from_decode_config(config) + materialization_config = PageMaterializationConfig.from_decode_config( + config + ) return self._get_page_typed( page_no, config=config, materialization_config=materialization_config ) @@ -775,7 +777,9 @@ def get_page_with_timings( if config is None: config = self._default_config() if materialization_config is None: - materialization_config = PageMaterializationConfig.from_decode_config(config) + materialization_config = PageMaterializationConfig.from_decode_config( + config + ) if not (1 <= page_no <= self.number_of_pages()): raise ValueError( @@ -824,7 +828,9 @@ def load_all_pages( if config is None: config = self._default_config() if materialization_config is None: - materialization_config = PageMaterializationConfig.from_decode_config(config) + materialization_config = PageMaterializationConfig.from_decode_config( + config + ) for page_no in range(1, self.number_of_pages() + 1): self.get_page( page_no, From 418b860cd80139d2282f046272e98679e809498d Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Sun, 31 May 2026 15:30:07 +0200 Subject: [PATCH 5/9] refactored the font resolver Signed-off-by: Peter Staar --- app/run_scaling.cpp | 58 +++- src/pybind/docling_threaded_renderer.h | 28 +- src/render/blend2d_font_resolver.h | 429 +++++++++++++++++++++++++ src/render/blend2d_renderer.h | 339 +++---------------- 4 files changed, 533 insertions(+), 321 deletions(-) create mode 100644 src/render/blend2d_font_resolver.h diff --git a/app/run_scaling.cpp b/app/run_scaling.cpp index 6468a3cd..025578b7 100644 --- a/app/run_scaling.cpp +++ b/app/run_scaling.cpp @@ -24,6 +24,8 @@ #include #include +#include + namespace { using clock_type = std::chrono::steady_clock; @@ -232,6 +234,16 @@ namespace } private: + static int terminal_width() + { + struct winsize size {}; + if(ioctl(2, TIOCGWINSZ, &size) == 0 and size.ws_col > 0) + { + return static_cast(size.ws_col); + } + return 100; + } + void draw(int current, bool force) { if(total_ <= 0 and not force) @@ -239,10 +251,8 @@ namespace return; } - const int width = 40; const double fraction = total_ > 0 ? static_cast(current) / static_cast(total_) : 1.0; - const int filled = std::min(width, static_cast(fraction * width)); const double elapsed = std::chrono::duration(clock_type::now() - start_).count(); const double rate = elapsed > 0.0 @@ -251,19 +261,32 @@ namespace ? elapsed * (static_cast(total_) / static_cast(current)) : 0.0; - std::cerr << "\r" << label_ << ": ["; + std::ostringstream suffix; + suffix << "] " + << current << "/" << total_ << " " + << std::fixed << std::setprecision(1) << (fraction * 100.0) + << "% " + << std::fixed << std::setprecision(1) << rate << "/s " + << "elapsed: " << format_duration(elapsed) << " [sec]" + << " total: " << format_duration(total) << " [sec]"; + const std::string suffix_text = suffix.str(); + + const std::string prefix = label_ + ": ["; + const int available = terminal_width() + - static_cast(prefix.size()) + - static_cast(suffix_text.size()); + const int width = std::max(0, std::min(40, available)); + const int filled = std::min(width, static_cast(fraction * width)); + + std::ostringstream line; + line << prefix; for(int i = 0; i < width; ++i) { - std::cerr << (i < filled ? '#' : '-'); + line << (i < filled ? '#' : '-'); } - std::cerr << "] " - << current << "/" << total_ << " " - << std::fixed << std::setprecision(1) << (fraction * 100.0) - << "% " - << std::fixed << std::setprecision(1) << rate << "/s " - << "elapsed: " << format_duration(elapsed) << " [sec]" - << " total: " << format_duration(total) << " [sec]" - << " " << std::flush; + line << suffix_text; + + std::cerr << "\r\033[K" << line.str() << std::flush; } private: @@ -548,7 +571,13 @@ namespace max_concurrent_results_(max_concurrent_results), decode_config_(decode_config), render_config_(render_config) - {} + { + if(render_config_.has_value()) + { + font_resolver_ = std::make_shared(); + font_resolver_->warm(); + } + } benchmark_result run(const std::string& mode, bool enable_timing, @@ -647,7 +676,7 @@ namespace if(render_config_.has_value()) { stage_start = clock_type::now(); - pdflib::renderer rnd(*render_config_); + pdflib::renderer rnd(*render_config_, font_resolver_); page_decoder->get_instructions().iterate_over_instructions(rnd); result.timings.render_page_s = std::chrono::duration(clock_type::now() - stage_start).count(); @@ -714,6 +743,7 @@ namespace int max_concurrent_results_; pdflib::decode_config decode_config_; std::optional render_config_; + std::shared_ptr font_resolver_; std::vector tasks_; std::atomic next_task_{0}; diff --git a/src/pybind/docling_threaded_renderer.h b/src/pybind/docling_threaded_renderer.h index 952f7d59..a0fcb19d 100644 --- a/src/pybind/docling_threaded_renderer.h +++ b/src/pybind/docling_threaded_renderer.h @@ -21,21 +21,33 @@ namespace docling int num_threads, int max_concurrent_results, pdflib::decode_config decode_config, - pdflib::render_config render_config): - docling_threaded_base(loglevel, - num_threads, - max_concurrent_results, - decode_config), - render_cfg(render_config) - {} + pdflib::render_config render_config); void worker_loop(); private: pdflib::render_config render_cfg; + + // Shared across workers; pages keep only their tiny local alias cache. + std::shared_ptr font_resolver_; }; + inline docling_threaded_renderer::docling_threaded_renderer(std::string loglevel, + int num_threads, + int max_concurrent_results, + pdflib::decode_config decode_config, + pdflib::render_config render_config): + docling_threaded_base(loglevel, + num_threads, + max_concurrent_results, + decode_config), + render_cfg(render_config), + font_resolver_(std::make_shared()) + { + font_resolver_->warm(); + } + inline void docling_threaded_renderer::worker_loop() { using clock_type = std::chrono::steady_clock; @@ -103,7 +115,7 @@ namespace docling } stage_start = clock_type::now(); - pdflib::renderer rnd(render_cfg); + pdflib::renderer rnd(render_cfg, font_resolver_); page_decoder->get_instructions().iterate_over_instructions(rnd); result.timings.render_page_s = std::chrono::duration(clock_type::now() - stage_start).count(); diff --git a/src/render/blend2d_font_resolver.h b/src/render/blend2d_font_resolver.h new file mode 100644 index 00000000..488d7bf2 --- /dev/null +++ b/src/render/blend2d_font_resolver.h @@ -0,0 +1,429 @@ +//-*-C++-*- + +#ifndef PDF_BLEND2D_FONT_RESOLVER_H +#define PDF_BLEND2D_FONT_RESOLVER_H + +#include + +#ifndef LOGURU_WITH_STREAMS +#define LOGURU_WITH_STREAMS 1 +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace pdflib +{ + class blend2d_font_resolver + { + public: + blend2d_font_resolver(); + + static std::shared_ptr default_resolver(); + + void warm(); + + BLFontFace resolve_font_face(const std::string& font_name, + const std::string& base_font, + bool resolve_fonts, + float font_similarity_cutoff); + + private: + + struct match_cache_key + { + std::string normalized_query; + int cutoff_x10000 = 0; + + bool operator==(const match_cache_key& other) const; + }; + + struct match_cache_key_hash + { + std::size_t operator()(const match_cache_key& key) const; + }; + + static std::string normalize_font_name(const std::string& name); + static std::vector split_tokens(const std::string& s); + static bool is_style_token(const std::string& tok); + static std::vector significant_tokens(const std::vector& toks); + static bool vectors_equal(const std::vector& lhs, + const std::vector& rhs); + static int quantized_cutoff(float cutoff); + static std::optional fallback_font_path(); + + void build_font_index(); + std::optional resolve_font_path(const std::string& cache_key, + float font_similarity_cutoff); + std::optional fuzzy_find_font(const std::string& norm_query, + float font_similarity_cutoff) const; + BLFontFace load_font_face(const std::string& path); + + std::once_flag index_once_; + std::unordered_map font_index_; + + mutable std::shared_mutex match_cache_mutex_; + std::unordered_map, + match_cache_key_hash> match_cache_; + + mutable std::shared_mutex face_cache_mutex_; + std::unordered_map face_cache_; + }; + + inline blend2d_font_resolver::blend2d_font_resolver() = default; + + inline std::shared_ptr blend2d_font_resolver::default_resolver() + { + static std::shared_ptr resolver = []() + { + auto shared = std::make_shared(); + shared->warm(); + return shared; + }(); + + return resolver; + } + + inline void blend2d_font_resolver::warm() + { + std::call_once(index_once_, [this]() { build_font_index(); }); + } + + inline BLFontFace blend2d_font_resolver::resolve_font_face( + const std::string& font_name, + const std::string& base_font, + bool resolve_fonts, + float font_similarity_cutoff) + { + const std::string& cache_key = (not font_name.empty() and font_name != "null") + ? font_name : base_font; + + std::optional font_path; + if (resolve_fonts) + { + font_path = resolve_font_path(cache_key, font_similarity_cutoff); + } + + if (not font_path.has_value() or font_path->empty()) + { + font_path = fallback_font_path(); + } + + if (not font_path.has_value() or font_path->empty()) + { + return {}; + } + + return load_font_face(*font_path); + } + + inline bool blend2d_font_resolver::match_cache_key::operator==( + const match_cache_key& other) const + { + return normalized_query == other.normalized_query and + cutoff_x10000 == other.cutoff_x10000; + } + + inline std::size_t blend2d_font_resolver::match_cache_key_hash::operator()( + const match_cache_key& key) const + { + const std::size_t h1 = std::hash{}(key.normalized_query); + const std::size_t h2 = std::hash{}(key.cutoff_x10000); + return h1 ^ (h2 + 0x9e3779b97f4a7c15ULL + (h1 << 6) + (h1 >> 2)); + } + + inline std::string blend2d_font_resolver::normalize_font_name(const std::string& name) + { + std::string s = name; + if (not s.empty() and s[0] == '/') { s = s.substr(1); } + + if (s.size() > 7 and s[6] == '+' and + std::all_of(s.begin(), s.begin() + 6, + [](char c){ return std::isupper(static_cast(c)); })) + { + s = s.substr(7); + } + + std::replace(s.begin(), s.end(), '-', ' '); + + std::string expanded; + for (size_t i = 0; i < s.size(); ++i) + { + if (i > 0 + and std::isupper(static_cast(s[i])) + and std::islower(static_cast(s[i - 1]))) + { + expanded += ' '; + } + expanded += static_cast( + std::tolower(static_cast(s[i]))); + } + + for (const auto& suf : {" psmt", " ps", " mt"}) + { + const std::string sfx(suf); + if (expanded.size() >= sfx.size() and + expanded.compare(expanded.size() - sfx.size(), + sfx.size(), sfx) == 0) + { + expanded.resize(expanded.size() - sfx.size()); + break; + } + } + + while (not expanded.empty() and expanded.back() == ' ') + { + expanded.pop_back(); + } + + return expanded; + } + + inline std::vector blend2d_font_resolver::split_tokens(const std::string& s) + { + std::vector toks; + std::istringstream iss(s); + std::string tok; + while (iss >> tok) { toks.push_back(tok); } + return toks; + } + + inline bool blend2d_font_resolver::is_style_token(const std::string& tok) + { + static const std::array kStyleTokens = { + "regular", "normal", "roman", "book", "medium", + "bold", "italic", "oblique", "light", "thin", "black" + }; + return std::find(kStyleTokens.begin(), kStyleTokens.end(), tok) != kStyleTokens.end(); + } + + inline std::vector blend2d_font_resolver::significant_tokens( + const std::vector& toks) + { + std::vector out; + for (const auto& tok : toks) + { + if (not is_style_token(tok)) + { + out.push_back(tok); + } + } + return out; + } + + inline bool blend2d_font_resolver::vectors_equal(const std::vector& lhs, + const std::vector& rhs) + { + if (lhs.size() != rhs.size()) { return false; } + for (size_t i = 0; i < lhs.size(); ++i) + { + if (lhs[i] != rhs[i]) + { + return false; + } + } + return true; + } + + inline int blend2d_font_resolver::quantized_cutoff(float cutoff) + { + return static_cast(std::lround(cutoff * 10000.0f)); + } + + inline std::optional blend2d_font_resolver::fallback_font_path() + { + namespace fs = std::filesystem; + for (const auto& fallback : { + "/System/Library/Fonts/Helvetica.ttc", + "/System/Library/Fonts/Arial.ttf", + "/Library/Fonts/Arial.ttf", + }) + { + if (fs::exists(fallback)) + { + return std::string(fallback); + } + } + + return std::nullopt; + } + + inline void blend2d_font_resolver::build_font_index() + { + namespace fs = std::filesystem; + const std::vector font_dirs = { + "/System/Library/Fonts", + "/System/Library/Fonts/Supplemental", + "/Library/Fonts", + }; + + LOG_S(INFO) << "blend2d font resolver: scanning font directories"; + for (const auto& dir : font_dirs) + { + LOG_S(INFO) << "blend2d font resolver: font directory: " << dir; + } + + for (const auto& dir : font_dirs) + { + if (not fs::is_directory(dir)) + { + LOG_S(INFO) << "blend2d font resolver: skipping missing font directory: " << dir; + continue; + } + + for (const auto& entry : fs::directory_iterator(dir)) + { + const auto& p = entry.path(); + const std::string ext = p.extension().string(); + if (ext != ".ttf" and ext != ".otf" and ext != ".ttc") { continue; } + + const std::string norm = normalize_font_name(p.stem().string()); + if (font_index_.find(norm) == font_index_.end()) + { + font_index_[norm] = p.string(); + LOG_S(INFO) << "blend2d font resolver: indexed font: " + << norm << " -> " << p.string(); + } + } + } + + LOG_S(INFO) << "blend2d font resolver: indexed " + << font_index_.size() << " fonts"; + } + + inline std::optional blend2d_font_resolver::resolve_font_path( + const std::string& cache_key, + float font_similarity_cutoff) + { + warm(); + + const match_cache_key match_key{ + normalize_font_name(cache_key), + quantized_cutoff(font_similarity_cutoff) + }; + + { + std::shared_lock lock(match_cache_mutex_); + auto itr = match_cache_.find(match_key); + if (itr != match_cache_.end()) + { + return itr->second; + } + } + + std::optional found_path; + + auto exact = font_index_.find(match_key.normalized_query); + if (exact != font_index_.end()) + { + found_path = exact->second; + } + else + { + found_path = fuzzy_find_font(match_key.normalized_query, + font_similarity_cutoff); + } + + { + std::unique_lock lock(match_cache_mutex_); + auto [itr, inserted] = match_cache_.emplace(match_key, found_path); + return itr->second; + } + } + + inline std::optional blend2d_font_resolver::fuzzy_find_font( + const std::string& norm_query, + float font_similarity_cutoff) const + { + const auto q_toks = split_tokens(norm_query); + if (q_toks.empty()) { return std::nullopt; } + + const auto q_sig_toks = significant_tokens(q_toks); + if (q_sig_toks.empty()) { return std::nullopt; } + + std::optional best_path; + float best_jaccard = 0.0f; + int best_size_delta = INT_MAX; + + for (const auto& [norm_name, path] : font_index_) + { + const auto c_toks = split_tokens(norm_name); + const auto c_sig_toks = significant_tokens(c_toks); + if (not vectors_equal(c_sig_toks, q_sig_toks)) { continue; } + + int score = 0; + const auto max_tokens = std::min(q_toks.size(), c_toks.size()); + for (size_t i = 0; i < max_tokens; ++i) + { + if (q_toks[i] == c_toks[i]) + { + ++score; + } + } + + if (score == 0) { continue; } + + const float jaccard = static_cast(score) / + static_cast(q_toks.size() + c_toks.size() - score); + if (jaccard < font_similarity_cutoff) { continue; } + + const int delta = std::abs(static_cast(c_toks.size()) - + static_cast(q_toks.size())); + if (jaccard > best_jaccard or + (jaccard == best_jaccard and delta < best_size_delta)) + { + best_jaccard = jaccard; + best_size_delta = delta; + best_path = path; + } + } + + return best_path; + } + + inline BLFontFace blend2d_font_resolver::load_font_face(const std::string& path) + { + { + std::shared_lock lock(face_cache_mutex_); + auto itr = face_cache_.find(path); + if (itr != face_cache_.end()) + { + return itr->second; + } + } + + BLFontData data; + BLFontFace face; + const BLResult data_res = data.create_from_file( + path.c_str(), + static_cast(BL_FILE_READ_MMAP_ENABLED | + BL_FILE_READ_MMAP_AVOID_SMALL)); + if (data_res == BL_SUCCESS) + { + face.create_from_data(data, 0); + } + + { + std::unique_lock lock(face_cache_mutex_); + auto [itr, inserted] = face_cache_.emplace(path, face); + return itr->second; + } + } +} + +#endif diff --git a/src/render/blend2d_renderer.h b/src/render/blend2d_renderer.h index ad2a1cee..60c54bc9 100644 --- a/src/render/blend2d_renderer.h +++ b/src/render/blend2d_renderer.h @@ -5,22 +5,20 @@ #include #include +#include #include #include #include #include -#include #include #include -#include #include #include #include -#include #include -#include +#include namespace pdflib { @@ -31,6 +29,8 @@ namespace pdflib renderer(); explicit renderer(render_config config); + explicit renderer(render_config config, + std::shared_ptr font_resolver); void set_size(size_instruction& instr); void render_text(text_instruction& instr); @@ -74,15 +74,8 @@ namespace pdflib double origin_x_ = 0.0; // crop_bbox x origin (pdf units) double origin_y_ = 0.0; // crop_bbox y origin (pdf units, y-up) - // Lazily-built map from normalized font stem (e.g. "times new roman bold") - // to its absolute file path. - std::unordered_map font_index_; - - // Cache: normalized PDF font name → best-matched font file path. - std::unordered_map match_cache_; - - // Cache: cache_key → loaded BLFontFace. - std::unordered_map font_cache_; + std::shared_ptr font_resolver_; + std::unordered_map local_font_cache_; BLContext& page_context(); void finish_page_context() const; @@ -95,58 +88,6 @@ namespace pdflib return static_cast(shape_[0]) - (pdf_y - origin_y_) * scale_y_; } - // Normalize a font name for fuzzy comparison: - // 1. strip leading '/' - // 2. strip subset prefix like "ABCDEF+" - // 3. replace '-' with ' ' - // 4. insert spaces at camelCase boundaries - // 5. lowercase - // 6. strip PDF/PS suffixes (psmt, ps, mt) - static std::string normalize_font_name(const std::string& name) - { - std::string s = name; - if (not s.empty() and s[0] == '/') { s = s.substr(1); } - // strip 6-letter uppercase subset prefix, e.g. "ABCDEF+" - if (s.size() > 7 and s[6] == '+' and - std::all_of(s.begin(), s.begin() + 6, - [](char c){ return std::isupper(static_cast(c)); })) - { - s = s.substr(7); - } - std::replace(s.begin(), s.end(), '-', ' '); - // split camelCase: insert space before uppercase that follows lowercase - std::string expanded; - for (size_t i = 0; i < s.size(); ++i) - { - if (i > 0 - and std::isupper(static_cast(s[i])) - and std::islower(static_cast(s[i - 1]))) - { - expanded += ' '; - } - expanded += static_cast( - std::tolower(static_cast(s[i]))); - } - // strip known PS/PDF suffixes at the end - for (const auto& suf : {" psmt", " ps", " mt"}) - { - const std::string sfx(suf); - if (expanded.size() >= sfx.size() and - expanded.compare(expanded.size() - sfx.size(), - sfx.size(), sfx) == 0) - { - expanded.resize(expanded.size() - sfx.size()); - break; - } - } - // trim trailing spaces - while (not expanded.empty() and expanded.back() == ' ') - { - expanded.pop_back(); - } - return expanded; - } - static bool should_fit_glyph_bbox_to_target(const std::string& text) { if (text.empty()) { return false; } @@ -181,141 +122,10 @@ namespace pdflib return false; } - // Build font_index_ by scanning standard system font directories. - void build_font_index() - { - if (not font_index_.empty()) { return; } - namespace fs = std::filesystem; - const std::vector font_dirs = { - "/System/Library/Fonts", - "/System/Library/Fonts/Supplemental", - "/Library/Fonts", - }; - for (const auto& dir : font_dirs) - { - if (not fs::is_directory(dir)) { continue; } - for (const auto& entry : fs::directory_iterator(dir)) - { - const auto& p = entry.path(); - const std::string ext = p.extension().string(); - if (ext != ".ttf" and ext != ".otf" and ext != ".ttc") { continue; } - const std::string stem = p.stem().string(); - const std::string norm = normalize_font_name(stem); - // first entry wins (earlier dirs take priority) - if (font_index_.find(norm) == font_index_.end()) - { - font_index_[norm] = p.string(); - } - } - } - LOG_S(INFO) << "blend2d: font index built with " - << font_index_.size() << " entries"; - } - - // Find the best-matching font file path for the given normalized query. - // Uses token overlap (Jaccard-style): score = |query_tokens ∩ cand_tokens|. - // Returns empty string if nothing scores > 0. - std::string fuzzy_find_font(const std::string& norm_query) - { - auto is_style_token = [](const std::string& tok) -> bool - { - static const std::array kStyleTokens = { - "regular", "normal", "roman", "book", "medium", - "bold", "italic", "oblique", "light", "thin", "black" - }; - return std::find(kStyleTokens.begin(), kStyleTokens.end(), tok) != kStyleTokens.end(); - }; - auto significant_tokens = [&](const std::vector& toks) -> std::vector - { - std::vector out; - for (const auto& tok : toks) - { - if (is_style_token(tok)) - { - continue; - } - out.push_back(tok); - } - return out; - }; - auto vectors_equal = [](const std::vector& lhs, - const std::vector& rhs) -> bool - { - if (lhs.size() != rhs.size()) { return false; } - for (size_t i = 0; i < lhs.size(); ++i) - { - if (lhs[i] != rhs[i]) - { - return false; - } - } - return true; - }; - - auto split_tokens = [](const std::string& s) -> std::vector - { - std::vector toks; - std::istringstream iss(s); - std::string tok; - while (iss >> tok) { toks.push_back(tok); } - return toks; - }; - - const auto q_toks = split_tokens(norm_query); - if (q_toks.empty()) { return {}; } - const auto q_sig_toks = significant_tokens(q_toks); - if (q_sig_toks.empty()) { return {}; } - - // Only accept candidates with the exact same significant family/script - // tokens. This rejects cross-script variants like: - // "noto sans" -> "noto sans mongolian" - // "noto sans jp" -> "noto sans mongolian" - - // Minimum Jaccard similarity required to accept a fuzzy match. - // A raw intersection score of 1 on "regular" alone yields J ≈ 0.14 - // (1 shared token out of 7 in the union), which is too low and causes - // wrong fonts (e.g. NotoSansMongolian for ShinMGoPr6N) to be selected. - const float kMinJaccard = config_.font_similarity_cutoff; - - std::string best_path; - float best_jaccard = 0.0f; - int best_size_delta = INT_MAX; - - for (const auto& [norm_name, path] : font_index_) - { - const auto c_toks = split_tokens(norm_name); - const auto c_sig_toks = significant_tokens(c_toks); - if (not vectors_equal(c_sig_toks, q_sig_toks)) { continue; } - int score = 0; - const auto max_tokens = std::min(q_toks.size(), c_toks.size()); - for (size_t i = 0; i < max_tokens; ++i) - { - if (q_toks[i] == c_toks[i]) - { - ++score; - } - } - if (score == 0) { continue; } - const float jaccard = static_cast(score) / - static_cast(q_toks.size() + c_toks.size() - score); - if (jaccard < kMinJaccard) { continue; } - const int delta = std::abs(static_cast(c_toks.size()) - - static_cast(q_toks.size())); - if (jaccard > best_jaccard or (jaccard == best_jaccard and delta < best_size_delta)) - { - best_jaccard = jaccard; - best_size_delta = delta; - best_path = path; - } - } - - return best_path; - } - // Return a BLFontFace for the given PDF font names, falling back to a // system font if none can be resolved. Results are cached. - BLFontFace& resolve_font_face(const std::string& font_name, - const std::string& base_font); + BLFontFace resolve_font_face(const std::string& font_name, + const std::string& base_font); static bool nearly_equal(double a, double b, double eps = 1e-6) { @@ -441,11 +251,22 @@ namespace pdflib // --------------------------------------------------------------------------- inline renderer::renderer() - : shape_({0, 0, 4}) + : shape_({0, 0, 4}), + font_resolver_(blend2d_font_resolver::default_resolver()) {} inline renderer::renderer(render_config config) - : config_(config), shape_({0, 0, 4}) + : config_(config), + shape_({0, 0, 4}), + font_resolver_(blend2d_font_resolver::default_resolver()) + {} + + inline renderer::renderer(render_config config, + std::shared_ptr font_resolver) + : config_(config), + shape_({0, 0, 4}), + font_resolver_(font_resolver ? std::move(font_resolver) + : blend2d_font_resolver::default_resolver()) {} inline BLContext& renderer::page_context() @@ -539,109 +360,29 @@ namespace pdflib // --------------------------------------------------------------------------- // resolve_font_face // - // Resolves the best-matching system font file for the given PDF font names - // and returns a cached BLFontFace. - // - // When config_.resolve_fonts is true the full lookup pipeline runs: - // 1. Build the font index on first call (scan system font dirs). - // 2. Normalize the PDF font name and check the match_cache_. - // 3. Try an exact match in the font_index_. - // 4. Fall back to token-overlap fuzzy matching. - // When config_.resolve_fonts is false the hardcoded fallback is used directly. + // Resolves a BLFontFace through the shared resolver and keeps only a small + // per-page alias cache in the renderer hot path. // --------------------------------------------------------------------------- - inline BLFontFace& renderer::resolve_font_face( + inline BLFontFace renderer::resolve_font_face( const std::string& font_name, const std::string& base_font) { - // BLFontFace cache key: prefer font_name, fall back to base_font. const std::string& cache_key = (not font_name.empty() and font_name != "null") ? font_name : base_font; - auto it = font_cache_.find(cache_key); - if (it != font_cache_.end()) + auto itr = local_font_cache_.find(cache_key); + if (itr != local_font_cache_.end()) { - return it->second; + return itr->second; } - namespace fs = std::filesystem; - std::string found_path; - - if (config_.resolve_fonts) - { - build_font_index(); - - // Normalize each candidate name and check match_cache_ first. - const std::string norm_query = normalize_font_name(cache_key); - - auto mc = match_cache_.find(norm_query); - if (mc != match_cache_.end()) - { - found_path = mc->second; - } - else - { - // 1. Exact match in the index. - auto ei = font_index_.find(norm_query); - if (ei != font_index_.end()) - { - found_path = ei->second; - LOG_S(INFO) << "blend2d: exact font match '" - << norm_query << "' → '" << found_path << "'"; - } - else - { - // 2. Fuzzy token-overlap match. - found_path = fuzzy_find_font(norm_query); - if (not found_path.empty()) - { - LOG_S(INFO) << "blend2d: fuzzy font match '" - << norm_query << "' → '" << found_path << "'"; - } - } - match_cache_[norm_query] = found_path; - } - } - - // Hard-coded fallback (always present on macOS). - if (found_path.empty()) - { - for (const auto& fallback : { - "/System/Library/Fonts/Helvetica.ttc", - "/System/Library/Fonts/Arial.ttf", - "/Library/Fonts/Arial.ttf", - }) - { - if (fs::exists(fallback)) - { - found_path = fallback; - break; - } - } - if (config_.resolve_fonts) - { - LOG_S(WARNING) << "blend2d: no font match for '" - << cache_key << "', using fallback '" << found_path << "'"; - } - } - - BLFontFace face; - if (not found_path.empty()) - { - const BLResult res = face.create_from_file(found_path.c_str()); - if (res != BL_SUCCESS) - { - LOG_S(WARNING) << "blend2d: failed to load '" << found_path - << "' (BLResult=" << res << ")"; - } - } - else - { - LOG_S(WARNING) << "blend2d: no font file found for '" << cache_key << "'"; - } - - font_cache_.emplace(cache_key, std::move(face)); - return font_cache_.at(cache_key); + BLFontFace face = font_resolver_->resolve_font_face(cache_key, + base_font, + config_.resolve_fonts, + config_.font_similarity_cutoff); + auto [inserted_itr, inserted] = local_font_cache_.emplace(cache_key, face); + return inserted_itr->second; } // --------------------------------------------------------------------------- @@ -670,8 +411,8 @@ namespace pdflib const double quad_h = std::abs(y3-y0); const double size = ((quad_h > 0.5) ? quad_h : instr.get_font_size()) * scale_y_; - BLFontFace& face = resolve_font_face(instr.get_font_name(), - instr.get_base_font()); + BLFontFace face = resolve_font_face(instr.get_font_name(), + instr.get_base_font()); LOG_S(INFO) << "face: " << face.is_valid(); @@ -771,8 +512,8 @@ namespace pdflib << " cell_span=" << cell_span << " em_size=" << em_size << " size=" << size; - BLFontFace& face = resolve_font_face(instr.get_font_name(), - instr.get_base_font()); + BLFontFace face = resolve_font_face(instr.get_font_name(), + instr.get_base_font()); LOG_S(INFO) << "face valid=" << face.is_valid() << " font_name=`" << instr.get_font_name() << "`" << " base_font=`" << instr.get_base_font() << "`"; @@ -913,8 +654,8 @@ namespace pdflib << " cell_span=" << cell_span << " em_size=" << em_size << " size=" << size; - BLFontFace& face = resolve_font_face(instr.get_font_name(), - instr.get_base_font()); + BLFontFace face = resolve_font_face(instr.get_font_name(), + instr.get_base_font()); LOG_S(INFO) << "face valid=" << face.is_valid() << " font_name=`" << instr.get_font_name() << "`" << " base_font=`" << instr.get_base_font() << "`"; @@ -1432,7 +1173,7 @@ namespace pdflib path.close(); } */ - + const auto& rgb = instr.get_rgb_stroking(); const uint32_t stroke_color = (0xFFu << 24) | From cb21775fdff07282f8264e08114ba4fb983b0a7d Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Mon, 1 Jun 2026 05:14:05 +0200 Subject: [PATCH 6/9] make it windows compatible Signed-off-by: Peter Staar --- app/run_scaling.cpp | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/app/run_scaling.cpp b/app/run_scaling.cpp index 025578b7..94f2eacf 100644 --- a/app/run_scaling.cpp +++ b/app/run_scaling.cpp @@ -24,8 +24,6 @@ #include #include -#include - namespace { using clock_type = std::chrono::steady_clock; @@ -234,16 +232,6 @@ namespace } private: - static int terminal_width() - { - struct winsize size {}; - if(ioctl(2, TIOCGWINSZ, &size) == 0 and size.ws_col > 0) - { - return static_cast(size.ws_col); - } - return 100; - } - void draw(int current, bool force) { if(total_ <= 0 and not force) @@ -272,7 +260,7 @@ namespace const std::string suffix_text = suffix.str(); const std::string prefix = label_ + ": ["; - const int available = terminal_width() + const int available = 100 - static_cast(prefix.size()) - static_cast(suffix_text.size()); const int width = std::max(0, std::min(40, available)); From b756c165193539e6372f42c9e9bbf5ccf285619e Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Mon, 1 Jun 2026 05:54:51 +0200 Subject: [PATCH 7/9] updated the default package dependencies Signed-off-by: Peter Staar --- README.md | 6 ++++++ pyproject.toml | 6 +++--- uv.lock | 16 +++++++++++----- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index aa7d101e..327bf60d 100644 --- a/README.md +++ b/README.md @@ -191,6 +191,12 @@ The latter will only work after a clean `git clone`. If you are developing and u rm -rf .venv; uv venv; uv pip install --force-reinstall --no-deps -e ".[perf-tools]" ``` +or + +```sh +BUILD_THREADS=12 uv pip install --force-reinstall --no-deps -e ".[perf]" +``` + To test the package, run: ```sh diff --git a/pyproject.toml b/pyproject.toml index b4d6d054..1af0afa5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,11 +29,9 @@ authors = [ ] requires-python = ">=3.10" dependencies = [ - "tabulate>=0.9.0,<1.0.0", "pillow>=10.0.0,<13.0.0", "pydantic>=2.0.0", "docling-core>=2.65.2", - # "docling-core @ git+https://github.com/docling-project/docling-core.git@720a14846ac6fe9f53031305af04d50b6d8479e4", "pywin32>=305; sys_platform == 'win32'", ] [project.urls] @@ -72,13 +70,15 @@ dev = [ "boto>=2.49.0,<3.0.0", "boto3>=1.35.67,<2.0.0", "huggingface-hub>=1.11.0", + "tabulate>=0.9.0,<1.0.0", ] -perf-test = [ +perf = [ "matplotlib>=3.10.8", "pdfplumber>=0.11.7", "pymupdf>=1.26.4", "pypdfium2>=4.30.0", "huggingface-hub>=1.11.0", + "tabulate>=0.9.0,<1.0.0", ] diff --git a/uv.lock b/uv.lock index a256ac74..12973c3f 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.12'", @@ -734,7 +734,6 @@ dependencies = [ { name = "pillow" }, { name = "pydantic" }, { name = "pywin32", marker = "sys_platform == 'win32'" }, - { name = "tabulate" }, ] [package.dev-dependencies] @@ -756,14 +755,16 @@ dev = [ { name = "pytest" }, { name = "python-semantic-release" }, { name = "ruff" }, + { name = "tabulate" }, { name = "tqdm" }, ] -perf-test = [ +perf = [ { name = "huggingface-hub" }, { name = "matplotlib" }, { name = "pdfplumber" }, { name = "pymupdf" }, { name = "pypdfium2" }, + { name = "tabulate" }, ] [package.metadata] @@ -772,7 +773,6 @@ requires-dist = [ { name = "pillow", specifier = ">=10.0.0,<13.0.0" }, { name = "pydantic", specifier = ">=2.0.0" }, { name = "pywin32", marker = "sys_platform == 'win32'", specifier = ">=305" }, - { name = "tabulate", specifier = ">=0.9.0,<1.0.0" }, ] [package.metadata.requires-dev] @@ -794,14 +794,16 @@ dev = [ { name = "pytest", specifier = ">=9.0.3,<10.0.0" }, { name = "python-semantic-release", specifier = ">=7.32.2,<8.0.0" }, { name = "ruff", specifier = ">=0.11,<1.0" }, + { name = "tabulate", specifier = ">=0.9.0,<1.0.0" }, { name = "tqdm", specifier = ">=4.67.0,<5.0.0" }, ] -perf-test = [ +perf = [ { name = "huggingface-hub", specifier = ">=1.11.0" }, { name = "matplotlib", specifier = ">=3.10.8" }, { name = "pdfplumber", specifier = ">=0.11.7" }, { name = "pymupdf", specifier = ">=1.26.4" }, { name = "pypdfium2", specifier = ">=4.30.0" }, + { name = "tabulate", specifier = ">=0.9.0,<1.0.0" }, ] [[package]] @@ -1832,6 +1834,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b1/a7/8c4f86c78ec03db954d05fd9c57a114cc3a172a2d3e4a8b949cd5ff89471/patchelf-0.17.2.4-py3-none-macosx_10_9_universal2.whl", hash = "sha256:343bb1b94e959f9070ca9607453b04390e36bbaa33c88640b989cefad0aa049e", size = 184436, upload-time = "2025-07-23T21:16:20.578Z" }, { url = "https://files.pythonhosted.org/packages/7e/19/f7821ef31aab01fa7dc8ebe697ece88ec4f7a0fdd3155dab2dfee4b00e5c/patchelf-0.17.2.4-py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:d9b35ebfada70c02679ad036407d9724ffe1255122ba4ac5e4be5868618a5689", size = 482846, upload-time = "2025-07-23T21:16:23.73Z" }, { url = "https://files.pythonhosted.org/packages/d1/50/107fea848ecfd851d473b079cab79107487d72c4c3cdb25b9d2603a24ca2/patchelf-0.17.2.4-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:2931a1b5b85f3549661898af7bf746afbda7903c7c9a967cfc998a3563f84fad", size = 477811, upload-time = "2025-07-23T21:16:25.145Z" }, + { url = "https://files.pythonhosted.org/packages/89/a9/a9a2103e159fd65bffbc21ecc5c8c36e44eb34fe53b4ef85fb6d08c2a635/patchelf-0.17.2.4-py3-none-manylinux2014_armv7l.manylinux_2_17_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:ae44cb3c857d50f54b99e5697aa978726ada33a8a6129d4b8b7ffd28b996652d", size = 431226, upload-time = "2025-07-23T21:16:26.765Z" }, + { url = "https://files.pythonhosted.org/packages/87/93/897d612f6df7cfd987bdf668425127efeff8d8e4ad8bfbab1c69d2a0d861/patchelf-0.17.2.4-py3-none-manylinux2014_ppc64le.manylinux_2_17_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:680a266a70f60a7a4f4c448482c5bdba80cc8e6bb155a49dcc24238ba49927b0", size = 540276, upload-time = "2025-07-23T21:16:27.983Z" }, + { url = "https://files.pythonhosted.org/packages/5d/b8/2b92d11533482bac9ee989081d6880845287751b5f528adbd6bb27667fbd/patchelf-0.17.2.4-py3-none-manylinux2014_s390x.manylinux_2_17_s390x.musllinux_1_1_s390x.whl", hash = "sha256:d842b51f0401460f3b1f3a3a67d2c266a8f515a5adfbfa6e7b656cb3ac2ed8bc", size = 596632, upload-time = "2025-07-23T21:16:29.253Z" }, + { url = "https://files.pythonhosted.org/packages/14/e2/975d4bdb418f942b53e6187b95bd9e0d5e0488b7bc214685a1e43e2c2751/patchelf-0.17.2.4-py3-none-manylinux_2_31_riscv64.musllinux_1_1_riscv64.whl", hash = "sha256:7076d9e127230982e20a81a6e2358d3343004667ba510d9f822d4fdee29b0d71", size = 508281, upload-time = "2025-07-23T21:16:30.865Z" }, ] [[package]] From 55b572623787a465ce7382454a9f3c1dfe573f2d Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Mon, 1 Jun 2026 06:40:23 +0200 Subject: [PATCH 8/9] upgraded idna Signed-off-by: Peter Staar --- uv.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/uv.lock b/uv.lock index 12973c3f..27f27a55 100644 --- a/uv.lock +++ b/uv.lock @@ -1044,11 +1044,11 @@ wheels = [ [[package]] name = "idna" -version = "3.11" +version = "3.17" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } +sdist = { url = "https://files.pythonhosted.org/packages/b9/28/99c51f664567218d824af024c0251650fb27e4ca066df188dab0769c5b91/idna-3.17.tar.gz", hash = "sha256:5eb0cb53bc467c12eadcf6de83163ad8527cec9416f44b9b61b19caedad2b87f", size = 196048, upload-time = "2026-05-28T14:32:38.55Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, + { url = "https://files.pythonhosted.org/packages/de/a7/f76514cc40ad6234098ecdebda08732d75964776c51a42845b7da10649e2/idna-3.17-py3-none-any.whl", hash = "sha256:466e48829084efe2548012b855df21540b96f2e20e51bd124c851536556a592c", size = 65316, upload-time = "2026-05-28T14:32:37.035Z" }, ] [[package]] From 3709c8a5ebb9e19d64d0b6e43fc355fb28da005d Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Fri, 19 Jun 2026 16:12:26 +0200 Subject: [PATCH 9/9] page content config redesign Replace the split decode/materialization selection API with PageContentConfig and PageItemLevel. The wrapper now exposes DecodeConfig only for decode tuning and compiles it with PageContentConfig into the native DecodePageConfig internally. Sequential documents keep content_config overridable per page and re-decode when a richer request needs native content that was previously skipped. Threaded parsing uses one fixed batch decode mask, supports per-result COMPUTE -> MATERIALIZE upgrades, and rejects requests for entities skipped during batch decode. Also copy configs at construction to avoid mutable caller config drift, refresh README/visualize/perf callers, remove stale materialization API usage, and replace timing attribute probing with explicit pybind timing type handling. Signed-off-by: Christoph Auer --- README.md | 4 +- docling_parse/pdf_parser.py | 503 +++++++++++++++++----------------- docling_parse/visualize.py | 22 +- perf/run_analysis.py | 11 +- perf/run_perf.py | 68 +++-- perf/run_scaling.py | 154 ++++++----- tests/test_parse.py | 127 ++++++--- tests/test_threaded_parse.py | 105 +++++-- tests/test_threaded_render.py | 15 +- 9 files changed, 571 insertions(+), 438 deletions(-) diff --git a/README.md b/README.md index 327bf60d..735a74f0 100644 --- a/README.md +++ b/README.md @@ -92,17 +92,17 @@ Parse pages from one or more PDFs in parallel using a thread pool with backpress ```python from docling_parse.pdf_parser import ( + DecodeConfig, DoclingThreadedPdfParser, ThreadedPdfParserConfig, ) -from docling_parse.pdf_parsers import DecodePageConfig # type: ignore[import] parser_config = ThreadedPdfParserConfig( loglevel="fatal", threads=4, # worker threads max_concurrent_results=32 # cap buffered results to limit memory ) -decode_config = DecodePageConfig() +decode_config = DecodeConfig() parser = DoclingThreadedPdfParser( parser_config=parser_config, diff --git a/docling_parse/pdf_parser.py b/docling_parse/pdf_parser.py index e7bcc9d4..77f03c59 100644 --- a/docling_parse/pdf_parser.py +++ b/docling_parse/pdf_parser.py @@ -1,9 +1,9 @@ """Parser for PDF files""" -import copy import hashlib import logging import math +from enum import IntEnum from io import BytesIO from pathlib import Path from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union @@ -60,9 +60,11 @@ TIMING_PREFIX_DECODE_PAGE, TIMING_PREFIX_DECODE_XOBJECT, TIMING_PREFIX_DECODING_PAGE, - DecodePageConfig, # type: ignore[import] + DecodePageConfig as _DecodePageConfig, # type: ignore[import] # internal C++ struct; not public PdfPageDecoder, # type: ignore[import] RenderConfig, # type: ignore[import] + _PageDecodeTimings, # type: ignore[import] + _PageRenderTimings, # type: ignore[import] _threaded_pdf_parser, # type: ignore[import] _threaded_pdf_renderer, # type: ignore[import] get_decode_page_timing_keys, @@ -201,36 +203,125 @@ class PageRenderTimings(PageDecodeTimings): render_page_s: float = 0.0 -class PageMaterializationConfig(BaseModel): - """Controls which native page data is materialized into SegmentedPdfPage.""" +class PageItemLevel(IntEnum): + """How far a page entity travels. Ordered: SKIP < COMPUTE < MATERIALIZE.""" + + SKIP = 0 # not computed in C++, absent from SegmentedPdfPage + COMPUTE = 1 # computed/retained in C++, NOT surfaced in SegmentedPdfPage + MATERIALIZE = 2 # computed in C++ AND surfaced in SegmentedPdfPage + + +class PageContentConfig(BaseModel): + """Per-entity selection: skipped, computed-in-C++-only, or materialized. + + Field name is the entity; the value is the lifecycle level. Set per call + on get_page (with a document-level default), so a page can be escalated to + a richer level on demand (e.g. surface word cells once a table is found). + """ model_config = ConfigDict(validate_assignment=True) - materialize_char_cells: bool = True - materialize_word_cells: bool = True - materialize_line_cells: bool = True - materialize_shapes: bool = True - materialize_bitmaps: bool = True - materialize_bitmap_bytes: bool = True + char_cells: PageItemLevel = PageItemLevel.MATERIALIZE + word_cells: PageItemLevel = PageItemLevel.MATERIALIZE + line_cells: PageItemLevel = PageItemLevel.MATERIALIZE + shapes: PageItemLevel = PageItemLevel.MATERIALIZE + bitmaps: PageItemLevel = PageItemLevel.MATERIALIZE - @classmethod - def from_decode_config( - cls, decode_config: DecodePageConfig - ) -> "PageMaterializationConfig": - return cls() + include_bitmap_bytes: bool = True # only effective when bitmaps == MATERIALIZE - def cache_key(self) -> tuple[bool, bool, bool, bool, bool, bool]: + def cache_key(self) -> tuple[int, int, int, int, int, bool]: return ( - self.materialize_char_cells, - self.materialize_word_cells, - self.materialize_line_cells, - self.materialize_shapes, - self.materialize_bitmaps, - self.materialize_bitmap_bytes, + int(self.char_cells), + int(self.word_cells), + int(self.line_cells), + int(self.shapes), + int(self.bitmaps), + self.include_bitmap_bytes, ) -def _page_timings_from_raw(raw_timings) -> PageDecodeTimings | PageRenderTimings: +class DecodeConfig(BaseModel): + """How to compute page content (tuning, not selection). + + Fixed when the document is opened; the C++ page-decoder cache freezes these + at first decode, so they cannot vary per page. Entity selection lives on + PageContentConfig instead. + """ + + model_config = ConfigDict(validate_assignment=True) + + do_sanitization: bool = True + enforce_same_font: bool = True + horizontal_cell_tolerance: float = 1.0 + word_space_width_factor_for_merge: float = 0.33 + line_space_width_factor_for_merge: float = 1.0 + line_space_width_factor_for_merge_with_space: float = 0.33 + max_num_lines: int = -1 + max_num_bitmaps: int = -1 + do_thread_safe: bool = True + release_native_memory_every_n_pages: int = 0 + keep_glyphs: bool = False + keep_qpdf_warnings: bool = False + + +# C++ decode-time switch mask: (keep_char, create_word, create_line, keep_shapes, keep_bitmaps) +_DecodeMask = tuple[bool, bool, bool, bool, bool] + + +def _decode_mask(content_config: PageContentConfig) -> _DecodeMask: + """The C++ decode-time switches a content_config needs (level >= COMPUTE).""" + c = PageItemLevel.COMPUTE + return ( + content_config.char_cells >= c, + content_config.word_cells >= c, + content_config.line_cells >= c, + content_config.shapes >= c, + content_config.bitmaps >= c, + ) + + +def _compile_decode_config( + decode_config: DecodeConfig, + page_boundary: str, + decode_mask: _DecodeMask, +) -> "_DecodePageConfig": + """Compile the public (DecodeConfig, content mask) into the C++ decode config.""" + cpp = _DecodePageConfig() + cpp.page_boundary = page_boundary + cpp.do_sanitization = decode_config.do_sanitization + cpp.enforce_same_font = decode_config.enforce_same_font + cpp.horizontal_cell_tolerance = decode_config.horizontal_cell_tolerance + cpp.word_space_width_factor_for_merge = ( + decode_config.word_space_width_factor_for_merge + ) + cpp.line_space_width_factor_for_merge = ( + decode_config.line_space_width_factor_for_merge + ) + cpp.line_space_width_factor_for_merge_with_space = ( + decode_config.line_space_width_factor_for_merge_with_space + ) + cpp.max_num_lines = decode_config.max_num_lines + cpp.max_num_bitmaps = decode_config.max_num_bitmaps + cpp.do_thread_safe = decode_config.do_thread_safe + cpp.release_native_memory_every_n_pages = ( + decode_config.release_native_memory_every_n_pages + ) + cpp.keep_glyphs = decode_config.keep_glyphs + cpp.keep_qpdf_warnings = decode_config.keep_qpdf_warnings + ( + cpp.keep_char_cells, + cpp.create_word_cells, + cpp.create_line_cells, + cpp.keep_shapes, + cpp.keep_bitmaps, + ) = decode_mask + return cpp + + +def _page_timings_from_raw( + raw_timings: "_PageDecodeTimings | _PageRenderTimings", +) -> PageDecodeTimings | PageRenderTimings: + """Copy native threaded timing objects into the public Pydantic timing models.""" data = { "make_page_decoder_s": raw_timings.make_page_decoder_s, "decode_page_s": raw_timings.decode_page_s, @@ -238,7 +329,7 @@ def _page_timings_from_raw(raw_timings) -> PageDecodeTimings | PageRenderTimings "create_line_cells_s": raw_timings.create_line_cells_s, "total_s": raw_timings.total_s, } - if hasattr(raw_timings, "render_page_s"): + if isinstance(raw_timings, _PageRenderTimings): return PageRenderTimings( **data, render_page_s=raw_timings.render_page_s, @@ -432,7 +523,7 @@ def _to_hyperlinks_from_decoder(hyperlinks_container) -> List[PdfHyperlink]: def _to_bitmap_resources_from_decoder( images_container, - materialize_bitmap_bytes: bool = True, + include_bitmap_bytes: bool = True, ) -> List[BitmapResource]: result: List[BitmapResource] = [] @@ -440,7 +531,7 @@ def _to_bitmap_resources_from_decoder( image_ref = None mode = ImageRefMode.PLACEHOLDER - if materialize_bitmap_bytes: + if include_bitmap_bytes: try: image_bytes = image.get_image_as_bytes() @@ -502,15 +593,16 @@ def _to_bitmap_resources_from_decoder( def segmented_page_from_decoder( page_decoder: PdfPageDecoder, boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX, - materialization_config: PageMaterializationConfig | None = None, + content_config: PageContentConfig | None = None, ) -> SegmentedPdfPage: """Convert a C++ PdfPageDecoder to a SegmentedPdfPage.""" - if materialization_config is None: - materialization_config = PageMaterializationConfig() + if content_config is None: + content_config = PageContentConfig() + MAT = PageItemLevel.MATERIALIZE char_cells = ( _to_cells_from_decoder(page_decoder.get_char_cells()) - if materialization_config.materialize_char_cells + if content_config.char_cells == MAT else [] ) @@ -525,27 +617,27 @@ def segmented_page_from_decoder( bitmap_resources=( _to_bitmap_resources_from_decoder( page_decoder.get_page_images(), - materialize_bitmap_bytes=materialization_config.materialize_bitmap_bytes, + include_bitmap_bytes=content_config.include_bitmap_bytes, ) - if materialization_config.materialize_bitmaps + if content_config.bitmaps == MAT else [] ), shapes=( _to_shapes_from_decoder(page_decoder.get_page_shapes()) - if materialization_config.materialize_shapes + if content_config.shapes == MAT else [] ), widgets=_to_widgets_from_decoder(page_decoder.get_page_widgets()), hyperlinks=_to_hyperlinks_from_decoder(page_decoder.get_page_hyperlinks()), ) - if materialization_config.materialize_word_cells and page_decoder.has_word_cells(): + if content_config.word_cells == MAT and page_decoder.has_word_cells(): segmented_page.word_cells = _to_cells_from_decoder( page_decoder.get_word_cells() ) segmented_page.has_words = len(segmented_page.word_cells) > 0 - if materialization_config.materialize_line_cells and page_decoder.has_line_cells(): + if content_config.line_cells == MAT and page_decoder.has_line_cells(): segmented_page.textline_cells = _to_cells_from_decoder( page_decoder.get_line_cells() ) @@ -575,28 +667,62 @@ def __init__( parser: "pdf_parser", key: str, boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX, + decode_config: DecodeConfig | None = None, + content_config: PageContentConfig | None = None, ): self._parser: pdf_parser = parser self._key = key self._boundary_type = boundary_type + self._decode_config = (decode_config or DecodeConfig()).model_copy() + self._content_config = (content_config or PageContentConfig()).model_copy() self._pages: Dict[ - tuple[int, tuple[bool, bool, bool, bool, bool, bool]], SegmentedPdfPage + tuple[int, tuple[int, int, int, int, int, bool]], SegmentedPdfPage ] = {} + # Per page: the C++ decode-time mask the cached page-decoder satisfies. + self._decoded_masks: Dict[int, _DecodeMask] = {} self._toc: PdfTableOfContents | None = None self._meta: PdfMetaData | None = None self._annotations: PdfAnnotations | None = None - def _default_config(self) -> DecodePageConfig: - config = DecodePageConfig() - config.page_boundary = self._boundary_type.value - config.do_sanitization = False - return config + def _ensure_page_decoder( + self, page_no: int, content_config: PageContentConfig + ) -> PdfPageDecoder: + """Return a C++ page-decoder that satisfies content_config. + + The C++ page-decoder cache freezes config at first decode, so if the + cached decoder lacks cells this request needs, evict it and re-decode + with the union of old and new levels (capability grows monotonically). + """ + if not (1 <= page_no <= self.number_of_pages()): + raise ValueError( + f"incorrect page_no: {page_no} for key={self._key} " + f"(min:1, max:{self.number_of_pages()})" + ) + page = page_no - 1 + needed = _decode_mask(content_config) + have = self._decoded_masks.get(page_no) + + if have is not None and any(n and not h for n, h in zip(needed, have)): + needed = tuple(n or h for n, h in zip(needed, have)) # type: ignore[assignment] + self._parser.unload_document_page(key=self._key, page=page) + have = None + + mask = needed if have is None else have + cpp = _compile_decode_config( + self._decode_config, self._boundary_type.value, mask + ) + decoder = self._parser.get_page_decoder(key=self._key, page=page, config=cpp) + if decoder is None: + raise ValueError(f"Failed to decode page {page_no}") + self._decoded_masks[page_no] = mask + return decoder def is_loaded(self) -> bool: return self._parser.is_loaded(key=self._key) def unload(self) -> bool: self._pages.clear() + self._decoded_masks.clear() if self.is_loaded(): return self._parser.unload_document(self._key) @@ -616,6 +742,7 @@ def unload_pages(self, page_range: tuple[int, int]): self._parser.unload_document_page(key=self._key, page=page_num) for k in cache_keys: del self._pages[k] + self._decoded_masks.pop(page_no, None) def number_of_pages(self) -> int: if self.is_loaded(): @@ -663,19 +790,12 @@ def get_table_of_contents(self) -> PdfTableOfContents | None: def iterate_pages( self, *, - config: DecodePageConfig | None = None, - materialization_config: PageMaterializationConfig | None = None, + content_config: PageContentConfig | None = None, ) -> Iterator[Tuple[int, SegmentedPdfPage]]: - if config is None: - config = self._default_config() for page_no in range(self.number_of_pages()): yield ( page_no + 1, - self.get_page( - page_no + 1, - config=config, - materialization_config=materialization_config, - ), + self.get_page(page_no + 1, content_config=content_config), ) def _to_table_of_contents(self, toc: dict) -> List[PdfTableOfContents]: @@ -739,192 +859,53 @@ def get_page( self, page_no: int, *, - config: DecodePageConfig | None = None, - materialization_config: PageMaterializationConfig | None = None, + content_config: PageContentConfig | None = None, ) -> SegmentedPdfPage: - """Get page using typed API (zero-copy from C++).""" - if config is None: - config = self._default_config() - if materialization_config is None: - materialization_config = PageMaterializationConfig.from_decode_config( - config - ) - return self._get_page_typed( - page_no, config=config, materialization_config=materialization_config - ) + """Get a page as SegmentedPdfPage (zero-copy from C++). - def get_page_with_timings( - self, - page_no: int, - *, - config: DecodePageConfig | None = None, - materialization_config: PageMaterializationConfig | None = None, - ) -> Tuple[SegmentedPdfPage, Timings]: - """Get page along with timing information. - - Similar to get_page() but also returns timing data from the parsing process. - Useful for performance analysis and benchmarking. - - Note: This method does NOT use the page cache to ensure fresh timing data. - - Args: - page_no: Page number (1-indexed). - config: Page decoding configuration. If None, uses default config. - - Returns: - Tuple of (SegmentedPdfPage, Timings) with the parsed page data and timing info. + content_config overrides the document default for this page; requesting + a richer level than was previously decoded re-decodes the page. """ - if config is None: - config = self._default_config() - if materialization_config is None: - materialization_config = PageMaterializationConfig.from_decode_config( - config - ) - - if not (1 <= page_no <= self.number_of_pages()): - raise ValueError( - f"incorrect page_no: {page_no} for key={self._key} " - f"(min:1, max:{self.number_of_pages()})" - ) + cc = content_config or self._content_config + cache_key = (page_no, cc.cache_key()) + if cache_key in self._pages: + return self._pages[cache_key] - return self._get_page_with_timings_typed( - page_no, config=config, materialization_config=materialization_config - ) + decoder = self._ensure_page_decoder(page_no, cc) + page = segmented_page_from_decoder(decoder, self._boundary_type, cc) + self._pages[cache_key] = page + return page - def _get_page_with_timings_typed( + def get_page_with_timings( self, page_no: int, *, - config: DecodePageConfig, - materialization_config: PageMaterializationConfig, + content_config: PageContentConfig | None = None, ) -> Tuple[SegmentedPdfPage, Timings]: - """Get page with timings using typed API.""" - page_decoder = self._parser.get_page_decoder( - key=self._key, - page=page_no - 1, - config=config, - ) + """Get a page along with timing information. - if page_decoder is None: - raise ValueError(f"Failed to decode page {page_no}") - - segmented_page = self._to_segmented_page_from_decoder( - page_decoder=page_decoder, - materialization_config=materialization_config, + Forces a fresh decode (evicting any cached native decoder for the page) + so the returned timings reflect actual decode work, and does not store + the result in the page cache. + """ + cc = content_config or self._content_config + if 1 <= page_no <= self.number_of_pages(): + self._parser.unload_document_page(key=self._key, page=page_no - 1) + self._decoded_masks.pop(page_no, None) + decoder = self._ensure_page_decoder(page_no, cc) + segmented_page = segmented_page_from_decoder(decoder, self._boundary_type, cc) + timings = Timings( + data=dict(decoder.get_timings()), + raw_data=dict(decoder.get_timings_raw()), ) - - # Get timings from the page decoder - timings_dict = page_decoder.get_timings() - raw_timings_dict = page_decoder.get_timings_raw() - timings = Timings(data=dict(timings_dict), raw_data=dict(raw_timings_dict)) - return segmented_page, timings def load_all_pages( self, - config: DecodePageConfig | None = None, - materialization_config: PageMaterializationConfig | None = None, + content_config: PageContentConfig | None = None, ): - if config is None: - config = self._default_config() - if materialization_config is None: - materialization_config = PageMaterializationConfig.from_decode_config( - config - ) for page_no in range(1, self.number_of_pages() + 1): - self.get_page( - page_no, - config=config, - materialization_config=materialization_config, - ) - - def _to_page_geometry_from_decoder(self, page_dim) -> PdfPageGeometry: - """Convert typed PdfPageDimension to PdfPageGeometry.""" - return _to_page_geometry_from_decoder(page_dim, self._boundary_type) - - def _to_cells_from_decoder( - self, cells_container - ) -> List[Union[PdfTextCell, TextCell]]: - """Convert typed PdfCells container to list of PdfTextCell objects.""" - return _to_cells_from_decoder(cells_container) - - def _to_shapes_from_decoder(self, shapes_container) -> List[PdfShape]: - """Convert typed PdfShapes container to list of PdfShape objects.""" - return _to_shapes_from_decoder(shapes_container) - - def _to_widgets_from_decoder(self, widgets_container) -> List[PdfWidget]: - """Convert typed PdfWidgets container to list of PdfWidget objects.""" - return _to_widgets_from_decoder(widgets_container) - - def _to_hyperlinks_from_decoder(self, hyperlinks_container) -> List[PdfHyperlink]: - """Convert typed PdfHyperlinks container to list of PdfHyperlink objects.""" - return _to_hyperlinks_from_decoder(hyperlinks_container) - - def _to_bitmap_resources_from_decoder( - self, - images_container, - materialize_bitmap_bytes: bool = True, - ) -> List[BitmapResource]: - """Convert typed PdfImages container to list of BitmapResource objects.""" - return _to_bitmap_resources_from_decoder( - images_container, - materialize_bitmap_bytes=materialize_bitmap_bytes, - ) - - def _to_segmented_page_from_decoder( - self, - page_decoder, - materialization_config: PageMaterializationConfig | None = None, - ) -> SegmentedPdfPage: - """Convert typed PdfPageDecoder to SegmentedPdfPage (zero-copy path).""" - return segmented_page_from_decoder( - page_decoder=page_decoder, - boundary_type=self._boundary_type, - materialization_config=materialization_config, - ) - - def _get_page_typed( - self, - page_no: int, - *, - config: DecodePageConfig, - materialization_config: PageMaterializationConfig, - ) -> SegmentedPdfPage: - """Get page using typed API (zero-copy from C++, faster than get_page). - - This method uses direct typed bindings to C++ objects, avoiding JSON - serialization/deserialization overhead. Use this for better performance. - - Args: - page_no: Page number (1-indexed). - config: Page decoding configuration. - - Returns: - SegmentedPdfPage with the parsed page data. - """ - cache_key = (page_no, materialization_config.cache_key()) - if cache_key in self._pages: - return self._pages[cache_key] - - if 1 <= page_no <= self.number_of_pages(): - page_decoder = self._parser.get_page_decoder( - key=self._key, - page=page_no - 1, - config=config, - ) - - if page_decoder is None: - raise ValueError(f"Failed to decode page {page_no}") - - self._pages[cache_key] = self._to_segmented_page_from_decoder( - page_decoder=page_decoder, - materialization_config=materialization_config, - ) - return self._pages[cache_key] - - raise ValueError( - f"incorrect page_no: {page_no} for key={self._key} (min:1, max:{self.number_of_pages()})" - ) + self.get_page(page_no, content_config=content_config) class DoclingPdfParser: @@ -962,6 +943,8 @@ def load( lazy: bool = True, boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX, password: str | None = None, + decode_config: DecodeConfig | None = None, + content_config: PageContentConfig | None = None, ) -> PdfDocument: if isinstance(path_or_stream, str): @@ -986,7 +969,11 @@ def load( if success: result_doc = PdfDocument( - parser=self.parser, key=key, boundary_type=boundary_type + parser=self.parser, + key=key, + boundary_type=boundary_type, + decode_config=decode_config, + content_config=content_config, ) if not lazy: # eagerly parse the pages at init time if desired result_doc.load_all_pages() @@ -1043,7 +1030,7 @@ class ThreadedPdfParserConfig(BaseModel): max_concurrent_results: int = 32 boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX render_config: RenderConfig | None = None - page_materialization_config: PageMaterializationConfig | None = None + page_content_config: PageContentConfig | None = None class PageParseResult: @@ -1055,17 +1042,15 @@ def __init__( *, boundary_type: PdfPageBoundaryType, render_config: RenderConfig | None, - decode_config: DecodePageConfig, - materialization_config: PageMaterializationConfig, + content_config: PageContentConfig, + batch_decode_mask: _DecodeMask, ): self._raw = raw_result self._boundary_type = boundary_type self._render_config = render_config - self._decode_config = decode_config - self._materialization_config = materialization_config - self._pages: Dict[ - tuple[bool, bool, bool, bool, bool, bool], SegmentedPdfPage - ] = {} + self._content_config = content_config + self._batch_decode_mask = batch_decode_mask + self._pages: Dict[tuple[int, int, int, int, int, bool], SegmentedPdfPage] = {} self._page_decoder: PdfPageDecoder | None = None self._default_image: PILImage.Image | None = None @@ -1108,18 +1093,36 @@ def _require_page_decoder(self) -> PdfPageDecoder: def get_page( self, - materialization_config: PageMaterializationConfig | None = None, + content_config: PageContentConfig | None = None, ) -> SegmentedPdfPage: - """Return the parsed page, converting lazily on first access.""" - if materialization_config is None: - materialization_config = self._materialization_config - - cache_key = materialization_config.cache_key() + """Return the parsed page, converting lazily on first access. + + The threaded batch decodes once, so content_config here only changes + Python materialization, not the C++ decode. Any entity the batch + computed (level >= COMPUTE) may be raised to MATERIALIZE per result — + this is the intended pattern: decode the batch at COMPUTE, then surface + cells only on the pages that need them. You may also lower emit or + toggle include_bitmap_bytes. The only rejected case is requesting an + entity the batch skipped (never computed in C++): that cannot be + recovered without a re-decode and would otherwise yield empty data, so + set it on ThreadedPdfParserConfig.page_content_config instead. + """ + cc = content_config or self._content_config + if content_config is not None: + if any( + r and not b for r, b in zip(_decode_mask(cc), self._batch_decode_mask) + ): + raise ValueError( + "content_config requests an entity the batch skipped at " + "decode time; raise it on " + "ThreadedPdfParserConfig.page_content_config (to >= COMPUTE)" + ) + cache_key = cc.cache_key() if cache_key not in self._pages: self._pages[cache_key] = segmented_page_from_decoder( page_decoder=self._require_page_decoder(), boundary_type=self._boundary_type, - materialization_config=materialization_config, + content_config=cc, ) return self._pages[cache_key] @@ -1301,7 +1304,7 @@ class DoclingThreadedPdfParser: def __init__( self, parser_config: ThreadedPdfParserConfig | None = None, - decode_config: DecodePageConfig | None = None, + decode_config: DecodeConfig | None = None, ): if parser_config is None: parser_config = ThreadedPdfParserConfig() @@ -1311,16 +1314,16 @@ def __init__( parser_config.render_config = _validated_render_config( parser_config.render_config ) - self._decode_config = ( - copy.copy(decode_config) - if decode_config is not None - else DecodePageConfig() - ) - self._decode_config.page_boundary = parser_config.boundary_type.value - self._materialization_config = ( - parser_config.page_materialization_config - if parser_config.page_materialization_config is not None - else PageMaterializationConfig.from_decode_config(self._decode_config) + self._decode_config = (decode_config or DecodeConfig()).model_copy() + self._content_config = ( + parser_config.page_content_config or PageContentConfig() + ).model_copy() + self._batch_decode_mask = _decode_mask(self._content_config) + # The threaded C++ parser decodes the whole batch with one fixed config. + self._cpp_decode_config = _compile_decode_config( + self._decode_config, + parser_config.boundary_type.value, + self._batch_decode_mask, ) self._page_counts: Dict[str, int] = {} self._scheduled_page_counts: Dict[str, int] = {} @@ -1330,14 +1333,14 @@ def __init__( loglevel=parser_config.loglevel, num_threads=parser_config.threads, max_concurrent_results=parser_config.max_concurrent_results, - config=self._decode_config, + config=self._cpp_decode_config, ) else: self._parser = _threaded_pdf_renderer( loglevel=parser_config.loglevel, num_threads=parser_config.threads, max_concurrent_results=parser_config.max_concurrent_results, - decode_config=self._decode_config, + decode_config=self._cpp_decode_config, render_config=parser_config.render_config, ) @@ -1446,6 +1449,6 @@ def get_task(self) -> "PageParseResult": self._parser.get_task(), boundary_type=self._parser_config.boundary_type, render_config=self._parser_config.render_config, - decode_config=self._decode_config, - materialization_config=self._materialization_config, + content_config=self._content_config, + batch_decode_mask=self._batch_decode_mask, ) diff --git a/docling_parse/visualize.py b/docling_parse/visualize.py index 3c2a532c..7ef7bfad 100644 --- a/docling_parse/visualize.py +++ b/docling_parse/visualize.py @@ -4,10 +4,13 @@ from pathlib import Path from typing import Optional -from docling_core.types.doc.page import SegmentedPdfPage, TextCellUnit +from docling_core.types.doc.page import ( + PdfPageBoundaryType, + SegmentedPdfPage, + TextCellUnit, +) -from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument -from docling_parse.pdf_parsers import DecodePageConfig # type: ignore[import] +from docling_parse.pdf_parser import DecodeConfig, DoclingPdfParser, PdfDocument def parse_args(): @@ -151,7 +154,11 @@ def visualise_py( parser = DoclingPdfParser(loglevel=log_level) pdf_doc: PdfDocument = parser.load( - path_or_stream=pdf_path, lazy=True, password=password + path_or_stream=pdf_path, + lazy=True, + password=password, + boundary_type=PdfPageBoundaryType(page_boundary), + decode_config=DecodeConfig(enforce_same_font=enforce_same_font), ) page_nos = [page_num] @@ -161,12 +168,7 @@ def visualise_py( for page_no in page_nos: print(f"parsing {pdf_path} on page: {page_no}") - config = DecodePageConfig() - config.enforce_same_font = enforce_same_font - pdf_page: SegmentedPdfPage = pdf_doc.get_page( - page_no=page_no, - config=config, - ) + pdf_page: SegmentedPdfPage = pdf_doc.get_page(page_no=page_no) if os.path.exists(str(output_dir)): pdf_page.save_as_json( diff --git a/perf/run_analysis.py b/perf/run_analysis.py index b2ecc823..b82ee501 100644 --- a/perf/run_analysis.py +++ b/perf/run_analysis.py @@ -41,8 +41,6 @@ get_static_timing_keys, is_static_timing_key, ) -from docling_parse.pdf_parsers import DecodePageConfig # type: ignore[import] - # -------------- Data types -------------- @@ -113,15 +111,10 @@ def timestamped_out_path(prefix: str = "analysis") -> Path: def extract_timings_for_page( doc, page_number: int, - *, - config: DecodePageConfig | None = None, ) -> Timings: """Run docling-parse on the given page and return Timings object.""" try: - _, timings = doc.get_page_with_timings( - page_number, - config=config, - ) + _, timings = doc.get_page_with_timings(page_number) return timings except Exception: return Timings() @@ -212,7 +205,7 @@ def write_static_timings_csv(out_path: Path, pages: List[PageTimings]) -> None: # Get decode_page keys in order (excludes the global decode_page timer) decode_page_keys = get_decode_page_timing_keys() - header = ["filename", "page_number", "elapsed_original_sec"] + decode_page_keys + header = ["filename", "page_number", "elapsed_original_sec", *decode_page_keys] with out_path.open("w", newline="") as f: w = csv.writer(f) diff --git a/perf/run_perf.py b/perf/run_perf.py index a50e77de..c9572bd0 100644 --- a/perf/run_perf.py +++ b/perf/run_perf.py @@ -28,9 +28,9 @@ from pathlib import Path from statistics import mean, median from typing import Callable, Iterable, List, Tuple -from tqdm import tqdm -from tabulate import tabulate +from tabulate import tabulate +from tqdm import tqdm # -------- Utilities -------- @@ -94,11 +94,24 @@ def _get_docling_static_timing_keys() -> List[str]: def parse_with_docling(use_bytesio: bool = False) -> Callable[[Path], Iterable[Row]]: def _runner(pdf_path: Path) -> Iterable[Row]: from io import BytesIO - from docling_parse.pdf_parser import DoclingPdfParser - from docling_parse.pdf_parsers import DecodePageConfig # type: ignore[import] + from docling_core.types.doc.page import PdfPageBoundaryType + from docling_parse.pdf_parser import ( + DecodeConfig, + DoclingPdfParser, + PageContentConfig, + PageItemLevel, + ) + timing_keys = _get_docling_static_timing_keys() + content_config = PageContentConfig( + char_cells=PageItemLevel.SKIP, + word_cells=PageItemLevel.SKIP, + line_cells=PageItemLevel.MATERIALIZE, + shapes=PageItemLevel.SKIP, + bitmaps=PageItemLevel.SKIP, + ) rows: List[Row] = [] try: @@ -111,6 +124,8 @@ def _runner(pdf_path: Path) -> Iterable[Row]: source, lazy=True, boundary_type=PdfPageBoundaryType.CROP_BOX, + decode_config=DecodeConfig(), + content_config=content_config, ) try: n = doc.number_of_pages() @@ -124,16 +139,7 @@ def _runner(pdf_path: Path) -> Iterable[Row]: ok = True detail: dict = {} try: - perf_config = DecodePageConfig() - perf_config.keep_char_cells = False - perf_config.keep_shapes = False - perf_config.keep_bitmaps = False - perf_config.create_word_cells = False - perf_config.create_line_cells = True - _, timings_obj = doc.get_page_with_timings( - page_idx, - config=perf_config, - ) + _, timings_obj = doc.get_page_with_timings(page_idx) static_t = timings_obj.get_static_timings() for key in timing_keys: detail[key] = static_t.get(key, 0.0) @@ -178,7 +184,7 @@ def parse_with_pdfplumber(pdf_path: Path) -> Iterable[Row]: ok = False err = str(e) print(f"error: {err}") - + t1 = time.perf_counter() rows.append(Row(str(pdf_path), idx + 1, t1 - t0, ok, err)) except Exception as e: # pragma: no cover @@ -209,9 +215,9 @@ def parse_with_pypdfium2(pdf_path: Path) -> Iterable[Row]: text_page = page.get_textpage() # _ = textpage.get_text_range() # extract all page text - for l in range(text_page.count_rects()): - rect = text_page.get_rect(l) - text_piece = text_page.get_text_bounded(*rect) + for rect_idx in range(text_page.count_rects()): + rect = text_page.get_rect(rect_idx) + _ = text_page.get_text_bounded(*rect) # x0, y0, x1, y1 = rect # print(f"{rect}: {text_piece}") @@ -221,7 +227,7 @@ def parse_with_pypdfium2(pdf_path: Path) -> Iterable[Row]: ok = False err = str(e) print(f"error: {err}") - + t1 = time.perf_counter() rows.append(Row(str(pdf_path), i + 1, t1 - t0, ok, err)) finally: @@ -279,33 +285,37 @@ def parse_with_docling_threaded( def _runner(pdf_paths: List[Path]) -> Tuple[List[Row], float]: from docling_parse.pdf_parser import ( + DecodeConfig, DoclingThreadedPdfParser, + PageContentConfig, + PageItemLevel, ThreadedPdfParserConfig, ) - from docling_parse.pdf_parsers import DecodePageConfig # type: ignore[import] - decode_config = DecodePageConfig() - decode_config.keep_char_cells = False - decode_config.keep_shapes = False - decode_config.keep_bitmaps = False - decode_config.create_word_cells = False - decode_config.create_line_cells = True + content_config = PageContentConfig( + char_cells=PageItemLevel.SKIP, + word_cells=PageItemLevel.SKIP, + line_cells=PageItemLevel.MATERIALIZE, + shapes=PageItemLevel.SKIP, + bitmaps=PageItemLevel.SKIP, + ) parser_config = ThreadedPdfParserConfig( loglevel="fatal", threads=num_threads, max_concurrent_results=max_concurrent_results, + page_content_config=content_config, ) parser = DoclingThreadedPdfParser( parser_config=parser_config, - decode_config=decode_config, + decode_config=DecodeConfig(), ) for pdf_path in pdf_paths: try: parser.load(str(pdf_path)) - except Exception as e: + except Exception: pass # will surface as missing results below rows: List[Row] = [] @@ -364,7 +374,7 @@ def compute_stats(rows: List[Row]) -> dict: failed_pages = total_pages - ok_pages total_time = sum(times) stats = { - "files": len(set(r.filename for r in rows)), + "files": len({r.filename for r in rows}), "pages_total": total_pages, "pages_ok": ok_pages, "pages_failed": failed_pages, diff --git a/perf/run_scaling.py b/perf/run_scaling.py index 99ea534e..8360d6d8 100644 --- a/perf/run_scaling.py +++ b/perf/run_scaling.py @@ -38,8 +38,6 @@ from pathlib import Path from typing import List, Tuple -from docling_core.types.doc.page import SegmentedPdfPage -from PIL import Image as PILImage from tabulate import tabulate from tqdm import tqdm @@ -212,26 +210,51 @@ def _materializes_page_data(materialization_options: dict[str, bool]) -> bool: ) -def _decode_config(decode_options: dict[str, bool]): - from docling_parse.pdf_parsers import DecodePageConfig # type: ignore[import] +def _decode_config(): + from docling_parse.pdf_parser import DecodeConfig - c = DecodePageConfig() - c.keep_char_cells = decode_options["keep_char_cells"] - c.keep_shapes = decode_options["keep_shapes"] - c.keep_bitmaps = decode_options["keep_bitmaps"] - c.create_word_cells = decode_options["create_word_cells"] - c.create_line_cells = decode_options["create_line_cells"] - return c + return DecodeConfig() -def _materialization_config(materialization_options: dict[str, bool]): - from docling_parse.pdf_parser import PageMaterializationConfig +def _content_config( + decode_options: dict[str, bool], materialization_options: dict[str, bool] +): + from docling_parse.pdf_parser import PageContentConfig, PageItemLevel - return PageMaterializationConfig(**materialization_options) + def _level(keep: bool, materialize: bool) -> PageItemLevel: + if materialize: + return PageItemLevel.MATERIALIZE + if keep: + return PageItemLevel.COMPUTE + return PageItemLevel.SKIP + + return PageContentConfig( + char_cells=_level( + decode_options["keep_char_cells"], + materialization_options["materialize_char_cells"], + ), + word_cells=_level( + decode_options["create_word_cells"], + materialization_options["materialize_word_cells"], + ), + line_cells=_level( + decode_options["create_line_cells"], + materialization_options["materialize_line_cells"], + ), + shapes=_level( + decode_options["keep_shapes"], + materialization_options["materialize_shapes"], + ), + bitmaps=_level( + decode_options["keep_bitmaps"], + materialization_options["materialize_bitmaps"], + ), + include_bitmap_bytes=materialization_options["materialize_bitmap_bytes"], + ) -def _config_rows(config, fields: List[str]) -> List[List[str]]: - return [[field, getattr(config, field)] for field in fields] +def _config_rows(values: dict[str, object], fields: List[str]) -> List[List[str]]: + return [[field, values[field]] for field in fields] def _print_run_configs( @@ -243,22 +266,16 @@ def _print_run_configs( ) -> None: from docling_parse.pdf_parsers import RenderConfig # type: ignore[import] - decode_config = _decode_config(decode_options) + decode_config = _decode_config() decode_fields = [ - "page_boundary", "do_sanitization", - "keep_char_cells", - "keep_shapes", - "keep_bitmaps", - "max_num_lines", - "max_num_bitmaps", - "create_word_cells", - "create_line_cells", "enforce_same_font", "horizontal_cell_tolerance", "word_space_width_factor_for_merge", "line_space_width_factor_for_merge", "line_space_width_factor_for_merge_with_space", + "max_num_lines", + "max_num_bitmaps", "do_thread_safe", "release_native_memory_every_n_pages", "keep_glyphs", @@ -267,25 +284,25 @@ def _print_run_configs( print("Decode config:") print( tabulate( - _config_rows(decode_config, decode_fields), + _config_rows(decode_config.model_dump(), decode_fields), headers=["parameter", "value"], ) ) print() - materialization_config = _materialization_config(materialization_options) - materialization_fields = [ - "materialize_char_cells", - "materialize_word_cells", - "materialize_line_cells", - "materialize_shapes", - "materialize_bitmaps", - "materialize_bitmap_bytes", + content_config = _content_config(decode_options, materialization_options) + content_fields = [ + "char_cells", + "word_cells", + "line_cells", + "shapes", + "bitmaps", + "include_bitmap_bytes", ] - print("Materialization config:") + print("Content config:") print( tabulate( - _config_rows(materialization_config, materialization_fields), + _config_rows(content_config.model_dump(), content_fields), headers=["parameter", "value"], ) ) @@ -298,6 +315,17 @@ def _print_run_configs( render_config = RenderConfig() render_config.scale = scale + render_values = { + "render_text": render_config.render_text, + "draw_text_bbox": render_config.draw_text_bbox, + "draw_text_basepoint": render_config.draw_text_basepoint, + "fit_glyph_bbox_to_target": render_config.fit_glyph_bbox_to_target, + "resolve_fonts": render_config.resolve_fonts, + "font_similarity_cutoff": render_config.font_similarity_cutoff, + "scale": render_config.scale, + "canvas_width": render_config.canvas_width, + "canvas_height": render_config.canvas_height, + } render_fields = [ "render_text", "draw_text_bbox", @@ -311,7 +339,7 @@ def _print_run_configs( ] print( tabulate( - _config_rows(render_config, render_fields), + _config_rows(render_values, render_fields), headers=["parameter", "value"], ) ) @@ -349,13 +377,17 @@ def _timing_csv_row( } timing_keys = _timing_csv_fieldnames()[7:-1] if result.success: + from docling_parse.pdf_parser import PageRenderTimings + timings = result.timings row["timing_total_s"] = timings.total_s row["timing_make_page_decoder_s"] = timings.make_page_decoder_s row["timing_decode_page_s"] = timings.decode_page_s row["timing_create_word_cells_s"] = timings.create_word_cells_s row["timing_create_line_cells_s"] = timings.create_line_cells_s - row["timing_render_page_s"] = getattr(timings, "render_page_s", 0.0) + row["timing_render_page_s"] = ( + timings.render_page_s if isinstance(timings, PageRenderTimings) else 0.0 + ) else: row["timing_total_s"] = 0.0 for key in timing_keys: @@ -374,9 +406,9 @@ def run_sequential_parse( """Sequential DoclingPdfParser decode (no render). Returns wall time in seconds.""" from docling_parse.pdf_parser import DoclingPdfParser - config = _decode_config(decode_options) + config = _decode_config() config.do_thread_safe = False # no need for isolated QPDF per page - materialization_config = _materialization_config(materialization_options) + content_config = _content_config(decode_options, materialization_options) parser = DoclingPdfParser(loglevel="fatal") @@ -385,20 +417,18 @@ def run_sequential_parse( pdf_schedule, desc=" sequential parse", unit="doc", leave=False ): try: - doc = parser.load(str(pdf_path), lazy=True) + doc = parser.load( + str(pdf_path), + lazy=True, + decode_config=config, + content_config=content_config, + ) if page_numbers is None: - for _, _ in doc.iterate_pages( - config=config, - materialization_config=materialization_config, - ): + for _, _ in doc.iterate_pages(): pass else: for page_number in page_numbers: - _ = doc.get_page( - page_number, - config=config, - materialization_config=materialization_config, - ) + _ = doc.get_page(page_number) doc.unload() except Exception as e: print(f" sequential error on {pdf_path}: {e}") @@ -435,8 +465,8 @@ def run_pypdfium_parse( try: page = doc[i] text_page = page.get_textpage() - for l in range(text_page.count_rects()): - rect = text_page.get_rect(l) + for rect_idx in range(text_page.count_rects()): + rect = text_page.get_rect(rect_idx) _ = text_page.get_text_bounded(*rect) text_page.close() page.close() @@ -484,8 +514,8 @@ def run_pypdfium_render( try: page = doc[i] text_page = page.get_textpage() - for l in range(text_page.count_rects()): - rect = text_page.get_rect(l) + for rect_idx in range(text_page.count_rects()): + rect = text_page.get_rect(rect_idx) _ = text_page.get_text_bounded(*rect) text_page.close() bitmap = page.render(scale=2) @@ -642,15 +672,15 @@ def run_threaded( timing_csv: Path, ) -> float: """Run DoclingThreadedPdfParser; render=True enables rasterisation.""" + from docling_parse.pdf_parsers import RenderConfig # type: ignore[import] + from docling_parse.pdf_parser import ( DoclingThreadedPdfParser, - PageMaterializationConfig, ThreadedPdfParserConfig, ) - from docling_parse.pdf_parsers import RenderConfig # type: ignore[import] - decode_config = _decode_config(decode_options) - materialization_config = PageMaterializationConfig(**materialization_options) + decode_config = _decode_config() + content_config = _content_config(decode_options, materialization_options) materialize_page = _materializes_page_data(materialization_options) render_config = None @@ -663,7 +693,7 @@ def run_threaded( threads=num_threads, max_concurrent_results=max_concurrent_results, render_config=render_config, - page_materialization_config=materialization_config, + page_content_config=content_config, ) parser = DoclingThreadedPdfParser( @@ -700,9 +730,9 @@ def run_threaded( for result in parser.iterate_results(): if result.success: if render: - img: PILImage = result.get_image() + result.get_image() if materialize_page: - page: SegmentedPdfPage = result.get_page() + result.get_page() """ assert len(page.shapes)==0, "len(page.shapes)==0" @@ -713,7 +743,7 @@ def run_threaded( """ else: if materialize_page: - page = result.get_page() + result.get_page() else: errors += 1 diff --git a/tests/test_parse.py b/tests/test_parse.py index 3b66d1ce..9a59997f 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -21,9 +21,10 @@ from pydantic import TypeAdapter from docling_parse.pdf_parser import ( - DecodePageConfig, + DecodeConfig, DoclingPdfParser, - PageMaterializationConfig, + PageContentConfig, + PageItemLevel, PdfDocument, ) @@ -398,9 +399,9 @@ def test_reference_documents_from_filenames(): "font_10.pdf": [1], } - config = DecodePageConfig() - config.keep_glyphs = True - config.keep_qpdf_warnings = False + config = DecodeConfig( + keep_glyphs=True, keep_qpdf_warnings=False, do_sanitization=True + ) # Each entry: (doc_name, page_no_str, success, error_msg) results: List[tuple] = [] @@ -414,6 +415,7 @@ def test_reference_documents_from_filenames(): path_or_stream=pdf_doc_path, boundary_type=PdfPageBoundaryType.CROP_BOX, # default: CROP_BOX lazy=True, + decode_config=config, ) assert pdf_doc is not None except Exception as exc: @@ -422,7 +424,7 @@ def test_reference_documents_from_filenames(): # PdfDocument.iterate_pages() will automatically populate pages as they are yielded. # No need to call PdfDocument.load_all_pages() before. - for page_no, pred_page in pdf_doc.iterate_pages(config=config): + for page_no, pred_page in pdf_doc.iterate_pages(): print(f" -> Page {page_no} has {len(pred_page.textline_cells)} cells.") fname = os.path.join( @@ -1029,28 +1031,22 @@ def test_annotations_match_groundtruth(): BITMAP_PDF = "tests/data/regression/annots_01.pdf" -def _make_bitmap_config() -> DecodePageConfig: - config = DecodePageConfig() - config.keep_bitmaps = True - config.do_sanitization = False - return config +def _make_bitmap_config() -> DecodeConfig: + return DecodeConfig(do_sanitization=False) def test_bitmap_no_materialization_preserves_geometry(): """bitmap_resources count and rects match regardless of bitmap bytes.""" parser = DoclingPdfParser(loglevel="fatal") - pdf_doc = parser.load(path_or_stream=BITMAP_PDF, lazy=True) + pdf_doc = parser.load( + path_or_stream=BITMAP_PDF, lazy=True, decode_config=_make_bitmap_config() + ) - config = _make_bitmap_config() - materialize_full = PageMaterializationConfig(materialize_bitmap_bytes=True) - materialize_geo = PageMaterializationConfig(materialize_bitmap_bytes=False) + materialize_full = PageContentConfig(include_bitmap_bytes=True) + materialize_geo = PageContentConfig(include_bitmap_bytes=False) - page_full = pdf_doc.get_page( - 1, config=config, materialization_config=materialize_full - ) - page_geo = pdf_doc.get_page( - 1, config=config, materialization_config=materialize_geo - ) + page_full = pdf_doc.get_page(1, content_config=materialize_full) + page_geo = pdf_doc.get_page(1, content_config=materialize_geo) assert len(page_full.bitmap_resources) == len(page_geo.bitmap_resources), ( "bitmap count must match between full and geometry-only modes" @@ -1075,16 +1071,17 @@ def test_bitmap_no_materialization_preserves_geometry(): def test_bitmap_no_materialization_has_no_image(): - """materialize_bitmap_bytes=False produces placeholders with image=None.""" + """include_bitmap_bytes=False produces placeholders with image=None.""" from docling_core.types.doc.base import ImageRefMode parser = DoclingPdfParser(loglevel="fatal") - pdf_doc = parser.load(path_or_stream=BITMAP_PDF, lazy=True) - - config = _make_bitmap_config() - materialize_geo = PageMaterializationConfig(materialize_bitmap_bytes=False) + pdf_doc = parser.load( + path_or_stream=BITMAP_PDF, lazy=True, decode_config=_make_bitmap_config() + ) - page = pdf_doc.get_page(1, config=config, materialization_config=materialize_geo) + page = pdf_doc.get_page( + 1, content_config=PageContentConfig(include_bitmap_bytes=False) + ) assert len(page.bitmap_resources) > 0, "test PDF must contain bitmaps" for bm in page.bitmap_resources: @@ -1101,17 +1098,15 @@ def test_bitmap_materialization_cache_false_then_true(): from docling_core.types.doc.base import ImageRefMode parser = DoclingPdfParser(loglevel="fatal") - pdf_doc = parser.load(path_or_stream=BITMAP_PDF, lazy=True) - - config = _make_bitmap_config() - materialize_geo = PageMaterializationConfig(materialize_bitmap_bytes=False) - materialize_full = PageMaterializationConfig(materialize_bitmap_bytes=True) + pdf_doc = parser.load( + path_or_stream=BITMAP_PDF, lazy=True, decode_config=_make_bitmap_config() + ) page_geo = pdf_doc.get_page( - 1, config=config, materialization_config=materialize_geo + 1, content_config=PageContentConfig(include_bitmap_bytes=False) ) page_full = pdf_doc.get_page( - 1, config=config, materialization_config=materialize_full + 1, content_config=PageContentConfig(include_bitmap_bytes=True) ) for bm in page_geo.bitmap_resources: @@ -1130,17 +1125,15 @@ def test_bitmap_materialization_cache_true_then_false(): from docling_core.types.doc.base import ImageRefMode parser = DoclingPdfParser(loglevel="fatal") - pdf_doc = parser.load(path_or_stream=BITMAP_PDF, lazy=True) - - config = _make_bitmap_config() - materialize_full = PageMaterializationConfig(materialize_bitmap_bytes=True) - materialize_geo = PageMaterializationConfig(materialize_bitmap_bytes=False) + pdf_doc = parser.load( + path_or_stream=BITMAP_PDF, lazy=True, decode_config=_make_bitmap_config() + ) page_full = pdf_doc.get_page( - 1, config=config, materialization_config=materialize_full + 1, content_config=PageContentConfig(include_bitmap_bytes=True) ) page_geo = pdf_doc.get_page( - 1, config=config, materialization_config=materialize_geo + 1, content_config=PageContentConfig(include_bitmap_bytes=False) ) assert any(bm.image is not None for bm in page_full.bitmap_resources), ( @@ -1152,3 +1145,55 @@ def test_bitmap_materialization_cache_true_then_false(): assert bm.mode == ImageRefMode.PLACEHOLDER pdf_doc.unload() + + +# --- PageContentConfig redesign --------------------------------------------- + +TEXT_PDF = "docs/dln-v1.pdf" + + +def test_word_cells_materialize_without_char_cells(): + """Word cells can be produced without surfacing character cells.""" + parser = DoclingPdfParser(loglevel="fatal") + + skip = PageItemLevel.SKIP + mat = PageItemLevel.MATERIALIZE + + # word_cells MATERIALIZE, char_cells SKIP -> words present, no char cells. + pdf_doc = parser.load( + path_or_stream=TEXT_PDF, + lazy=True, + content_config=PageContentConfig(char_cells=skip, word_cells=mat), + ) + page = pdf_doc.get_page(1) + assert len(page.word_cells) > 0, "words must be present when word_cells=MATERIALIZE" + assert len(page.char_cells) == 0, "char cells must be absent when char_cells=SKIP" + + # char_cells SKIP -> empty; word_cells SKIP -> empty. + page2 = pdf_doc.get_page( + 1, content_config=PageContentConfig(char_cells=skip, word_cells=skip) + ) + assert len(page2.char_cells) == 0 + assert len(page2.word_cells) == 0 + pdf_doc.unload() + + +def test_content_escalation_redecodes_page(): + """Opening without word cells, then requesting them, re-decodes the page.""" + parser = DoclingPdfParser(loglevel="fatal") + pdf_doc = parser.load( + path_or_stream=TEXT_PDF, + lazy=True, + content_config=PageContentConfig(word_cells=PageItemLevel.SKIP), + ) + + page_no_words = pdf_doc.get_page(1) + assert len(page_no_words.word_cells) == 0, "words skipped at document default" + + page_words = pdf_doc.get_page( + 1, content_config=PageContentConfig(word_cells=PageItemLevel.MATERIALIZE) + ) + assert len(page_words.word_cells) > 0, ( + "escalation must re-decode the page and surface word cells" + ) + pdf_doc.unload() diff --git a/tests/test_threaded_parse.py b/tests/test_threaded_parse.py index 0fb98e98..eb6bafa3 100644 --- a/tests/test_threaded_parse.py +++ b/tests/test_threaded_parse.py @@ -7,12 +7,12 @@ import pytest from docling_core.types.doc.page import PdfPageBoundaryType, SegmentedPdfPage -from docling_parse import pdf_parsers from docling_parse.pdf_parser import ( - DecodePageConfig, + DecodeConfig, DoclingPdfParser, DoclingThreadedPdfParser, - PageMaterializationConfig, + PageContentConfig, + PageItemLevel, ThreadedPdfParserConfig, ) from tests.test_parse import ( @@ -25,20 +25,12 @@ LARGE_SAMPLE_PDF = "docs/PDF32000_2008.pdf" -def _make_decode_config() -> DecodePageConfig: - config = DecodePageConfig() - config.page_boundary = "crop_box" - config.do_sanitization = False - config.keep_glyphs = True - config.keep_qpdf_warnings = False - return config - - -def test_threaded_raw_pybind_types_are_internal(): - assert not hasattr(pdf_parsers, "PageDecodeResult") - assert not hasattr(pdf_parsers, "threaded_pdf_parser") - assert not hasattr(pdf_parsers, "PageRenderResult") - assert not hasattr(pdf_parsers, "threaded_pdf_renderer") +def _make_decode_config() -> DecodeConfig: + return DecodeConfig( + do_sanitization=False, + keep_glyphs=True, + keep_qpdf_warnings=False, + ) def test_threaded_reference_documents_from_filenames(): @@ -143,10 +135,11 @@ def test_threaded_results_match_sequential(): path_or_stream=filename, boundary_type=PdfPageBoundaryType.CROP_BOX, lazy=True, + decode_config=decode_config, ) key = f"key={filename}" sequential_pages[key] = {} - for page_no, page in pdf_doc.iterate_pages(config=decode_config): + for page_no, page in pdf_doc.iterate_pages(): sequential_pages[key][page_no] = page threaded_parser = DoclingThreadedPdfParser( @@ -317,11 +310,8 @@ def test_threaded_unload_during_active_iteration_raises(): BITMAP_PDF = "tests/data/regression/annots_01.pdf" -def _make_bitmap_config() -> DecodePageConfig: - config = DecodePageConfig() - config.keep_bitmaps = True - config.do_sanitization = False - return config +def _make_bitmap_config() -> DecodeConfig: + return DecodeConfig(do_sanitization=False) def test_threaded_bitmap_no_materialization_preserves_geometry(): @@ -329,11 +319,11 @@ def test_threaded_bitmap_no_materialization_preserves_geometry(): from docling_core.types.doc.base import ImageRefMode config = _make_bitmap_config() - materialize_full = PageMaterializationConfig(materialize_bitmap_bytes=True) - materialize_geo = PageMaterializationConfig(materialize_bitmap_bytes=False) + materialize_full = PageContentConfig(include_bitmap_bytes=True) + materialize_geo = PageContentConfig(include_bitmap_bytes=False) def _get_page1( - materialization_config: PageMaterializationConfig, + content_config: PageContentConfig, ) -> "SegmentedPdfPage": parser = DoclingThreadedPdfParser( parser_config=ThreadedPdfParserConfig(loglevel="fatal", threads=2), @@ -341,7 +331,7 @@ def _get_page1( ) parser.load(BITMAP_PDF) return next( - r.get_page(materialization_config) + r.get_page(content_config) for r in parser.iterate_results() if r.success and r.page_number == 1 ) @@ -367,3 +357,64 @@ def _get_page1( assert bm.mode == ImageRefMode.PLACEHOLDER assert any(bm.image is not None for bm in page_full.bitmap_resources) + + +def _first_successful_result(parser): + return next(r for r in parser.iterate_results() if r.success) + + +def test_threaded_result_upgrade_compute_to_materialize(): + """A batch decoded at COMPUTE can surface those cells per result.""" + parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig( + loglevel="fatal", + threads=2, + page_content_config=PageContentConfig(word_cells=PageItemLevel.COMPUTE), + ), + decode_config=_make_decode_config(), + ) + parser.load(SAMPLE_PDF) + result = _first_successful_result(parser) + + # Batch default emit is COMPUTE -> not surfaced. + assert len(result.get_page().word_cells) == 0 + # Upgrade COMPUTE -> MATERIALIZE: cells were computed in C++, now surfaced. + upgraded = result.get_page(PageContentConfig(word_cells=PageItemLevel.MATERIALIZE)) + assert len(upgraded.word_cells) > 0 + + +def test_threaded_result_rejects_skipped_entity(): + """Requesting an entity the batch skipped raises instead of yielding empty.""" + parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig( + loglevel="fatal", + threads=2, + page_content_config=PageContentConfig(word_cells=PageItemLevel.SKIP), + ), + decode_config=_make_decode_config(), + ) + parser.load(SAMPLE_PDF) + result = _first_successful_result(parser) + + with pytest.raises(ValueError, match="batch skipped"): + result.get_page(PageContentConfig(word_cells=PageItemLevel.MATERIALIZE)) + + +def test_threaded_result_rejects_skipped_entity_after_config_mutation(): + """The rejection uses the compiled batch mask, not the caller's mutable config.""" + content_config = PageContentConfig(word_cells=PageItemLevel.SKIP) + parser = DoclingThreadedPdfParser( + parser_config=ThreadedPdfParserConfig( + loglevel="fatal", + threads=2, + page_content_config=content_config, + ), + decode_config=_make_decode_config(), + ) + content_config.word_cells = PageItemLevel.MATERIALIZE + + parser.load(SAMPLE_PDF) + result = _first_successful_result(parser) + + with pytest.raises(ValueError, match="batch skipped"): + result.get_page(PageContentConfig(word_cells=PageItemLevel.MATERIALIZE)) diff --git a/tests/test_threaded_render.py b/tests/test_threaded_render.py index eb18fdac..1f2e8117 100644 --- a/tests/test_threaded_render.py +++ b/tests/test_threaded_render.py @@ -12,7 +12,7 @@ from PIL import Image as PILImage from docling_parse.pdf_parser import ( - DecodePageConfig, + DecodeConfig, DoclingThreadedPdfParser, RenderConfig, ThreadedPdfParserConfig, @@ -27,13 +27,12 @@ LARGE_SAMPLE_PDF = "docs/PDF32000_2008.pdf" -def _make_decode_config() -> DecodePageConfig: - config = DecodePageConfig() - config.page_boundary = "crop_box" - config.do_sanitization = False - config.keep_glyphs = True - config.keep_qpdf_warnings = False - return config +def _make_decode_config() -> DecodeConfig: + return DecodeConfig( + do_sanitization=False, + keep_glyphs=True, + keep_qpdf_warnings=False, + ) def _make_render_config() -> RenderConfig: