From 7f2818958eac40750bf9740fc1863c0b913553cd Mon Sep 17 00:00:00 2001 From: Seongmin Cho Date: Tue, 9 Jun 2026 10:54:46 +0900 Subject: [PATCH] =?UTF-8?q?fix:=20prevent=20O(N=C2=B2)=20XObject=20reloadi?= =?UTF-8?q?ng=20in=20do=5Fform()=20causing=20OOM=20on=20complex=20PDFs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a PDF page contains many Form XObjects sharing the same resource dictionary (e.g. 2296 scatter plot data points each referencing 2301 XObjects), do_form() was calling set() on fonts, graphics states, and XObjects on every invocation — reloading all N resources N times, resulting in O(N²) memory and CPU usage that caused OOM kills. Fix: before calling set(), check whether all keys in the resource dictionary are already present in the parent chain. If so, skip the set() call entirely. This eliminates the redundant reloading when resources are already inherited; each do_form() call then costs only one presence lookup per resource key instead of a full reload. Fixes: https://github.com/docling-project/docling/issues/2109 Signed-off-by: Seongmin Cho --- src/parse/pdf_decoders/stream.h | 50 +++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/src/parse/pdf_decoders/stream.h b/src/parse/pdf_decoders/stream.h index 804fe23f..e46df24a 100644 --- a/src/parse/pdf_decoders/stream.h +++ b/src/parse/pdf_decoders/stream.h @@ -1,5 +1,6 @@ //-*-C++-*- + #ifndef PDF_STREAM_DECODER_H #define PDF_STREAM_DECODER_H @@ -385,19 +386,62 @@ namespace pdflib if(xobj.has_fonts()) { QPDFObjectHandle xobj_fonts = xobj.get_fonts(); - page_fonts_->set(xobj_fonts, timings); + // Skip set() if all fonts are already loaded in the parent chain to avoid O(N²) reloading + auto font_keys = xobj_fonts.getKeys(); + bool fonts_all_loaded = !font_keys.empty(); + for(auto& k : font_keys) + { + if(page_fonts_->count(k) == 0) + { + fonts_all_loaded = false; + break; + } + } + if(!fonts_all_loaded) + { + page_fonts_->set(xobj_fonts, timings); + } } if(xobj.has_grphs()) {
QPDFObjectHandle xobj_grphs = xobj.get_grphs(); - page_grphs_->set(xobj_grphs, timings); + // Skip set() if all graphics states are already loaded in the parent chain to avoid O(N²) reloading + auto grph_keys = xobj_grphs.getKeys(); + bool grphs_all_loaded = !grph_keys.empty(); + for(auto& k : grph_keys) + { + if(page_grphs_->count(k) == 0) + { + grphs_all_loaded = false; + break; + } + } + if(!grphs_all_loaded) + { + page_grphs_->set(xobj_grphs, timings); + } } if(xobj.has_xobjects()) { QPDFObjectHandle xobj_xobjects = xobj.get_xobjects(); - page_xobjects_->set(xobj_xobjects, timings); + // Skip set() if all XObjects are already loaded in the parent chain. + // Prevents O(N²) reloading when do_form() is called N times and each call reloads all N resources. + auto keys = xobj_xobjects.getKeys(); + bool all_already_loaded = !keys.empty(); + for(auto& k : keys) + { + if(!page_xobjects_->has(k)) + { + all_already_loaded = false; + break; + } + } + if(!all_already_loaded) + { + page_xobjects_->set(xobj_xobjects, timings); + } } }