diff --git a/README.md b/README.md index 652afc057..6da3ee1d9 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ > [!IMPORTANT] > Breaking changes between 0.0.1 to 0.1.0: -> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]'` to have backward-compatible behavior. +> * Dependencies are now organized into optional feature-groups (further details below). Use `pip install 'markitdown[all]'` to have backward-compatible behavior. > * convert\_stream() now requires a binary file-like object (e.g., a file opened in binary mode, or an io.BytesIO object). This is a breaking change from the previous version, where it previously also accepted text file-like objects, like io.StringIO. > * The DocumentConverter class interface has changed to read from file-like streams rather than file paths. *No temporary files are created anymore*. If you are the maintainer of a plugin, or custom DocumentConverter, you likely need to update your code. Otherwise, if only using the MarkItDown class or CLI (as in these examples), you should not need to change anything. @@ -132,6 +132,38 @@ markitdown --use-plugins path-to-file.pdf To find available plugins, search GitHub for the hashtag `#markitdown-plugin`. To develop a plugin, see `packages/markitdown-sample-plugin`. +#### markitdown-ocr Plugin + +The `markitdown-ocr` plugin adds OCR support to PDF, DOCX, PPTX, and XLSX converters, extracting text from embedded images using LLM Vision — the same `llm_client` / `llm_model` pattern that MarkItDown already uses for image descriptions. No new ML libraries or binary dependencies required. 
+ +**Installation:** + +```bash +pip install markitdown-ocr +pip install openai # or any OpenAI-compatible client +``` + +**Usage:** + +Pass the same `llm_client` and `llm_model` you would use for image descriptions: + +```python +from markitdown import MarkItDown +from openai import OpenAI + +md = MarkItDown( + enable_plugins=True, + llm_client=OpenAI(), + llm_model="gpt-4o", +) +result = md.convert("document_with_images.pdf") +print(result.text_content) +``` + +If `llm_client` or `llm_model` is missing the plugin still loads, but OCR is silently skipped and files are converted without any image text extraction. + +See [`packages/markitdown-ocr/README.md`](packages/markitdown-ocr/README.md) for detailed documentation. + ### Azure Document Intelligence To use Microsoft Document Intelligence for conversion: diff --git a/packages/markitdown-ocr/LICENSE b/packages/markitdown-ocr/LICENSE new file mode 100644 index 000000000..9e841e7a2 --- /dev/null +++ b/packages/markitdown-ocr/LICENSE @@ -0,0 +1,21 @@ + MIT License + + Copyright (c) Microsoft Corporation. + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE diff --git a/packages/markitdown-ocr/README.md b/packages/markitdown-ocr/README.md new file mode 100644 index 000000000..d0883db4a --- /dev/null +++ b/packages/markitdown-ocr/README.md @@ -0,0 +1,200 @@ +# MarkItDown OCR Plugin + +LLM Vision plugin for MarkItDown that extracts text from images embedded in PDF, DOCX, PPTX, and XLSX files. + +Uses the same `llm_client` / `llm_model` pattern that MarkItDown already supports for image descriptions — no new ML libraries or binary dependencies required. + +## Features + +- **Enhanced PDF Converter**: Extracts text from images within PDFs, with full-page OCR fallback for scanned documents +- **Enhanced DOCX Converter**: OCR for images in Word documents +- **Enhanced PPTX Converter**: OCR for images in PowerPoint presentations +- **Enhanced XLSX Converter**: OCR for images in Excel spreadsheets +- **Context Preservation**: Maintains document structure and flow when inserting extracted text + +## Installation + +```bash +pip install markitdown-ocr +``` + +The plugin uses whatever OpenAI-compatible client you already have. 
Install one if you don't have it yet: + +```bash +pip install openai +``` + +## Usage + +### Command Line + +```bash +markitdown document.pdf --use-plugins --llm-client openai --llm-model gpt-4o +``` + +### Python API + +Pass `llm_client` and `llm_model` to `MarkItDown()` exactly as you would for image descriptions: + +```python +from markitdown import MarkItDown +from openai import OpenAI + +md = MarkItDown( + enable_plugins=True, + llm_client=OpenAI(), + llm_model="gpt-4o", +) + +result = md.convert("document_with_images.pdf") +print(result.text_content) +``` + +If `llm_client` or `llm_model` is missing the plugin still loads, but OCR is silently skipped and files are converted without any image text extraction. + +### Custom Prompt + +Override the default extraction prompt for specialized documents: + +```python +md = MarkItDown( + enable_plugins=True, + llm_client=OpenAI(), + llm_model="gpt-4o", + llm_prompt="Extract all text from this image, preserving table structure.", +) +``` + +### Any OpenAI-Compatible Client + +Works with any client that follows the OpenAI API: + +```python +from openai import AzureOpenAI + +md = MarkItDown( + enable_plugins=True, + llm_client=AzureOpenAI( + api_key="...", + azure_endpoint="https://your-resource.openai.azure.com/", + api_version="2024-02-01", + ), + llm_model="gpt-4o", +) +``` + +## How It Works + +When `MarkItDown(enable_plugins=True, llm_client=..., llm_model=...)` is called: + +1. MarkItDown discovers the plugin via the `markitdown.plugin` entry point group +2. It calls `register_converters()`, forwarding all kwargs including `llm_client` and `llm_model` +3. The plugin creates an `LLMVisionOCRService` from those kwargs +4. Four OCR-enhanced converters are registered at **priority -1.0** — before the built-in converters at priority 0.0 + +When a file is converted: + +1. The OCR converter accepts the file +2. It extracts embedded images from the document +3. Each image is sent to the LLM with an extraction prompt +4. 
The returned text is inserted inline, preserving document structure +5. If the LLM call fails, conversion continues without that image's text + +## Supported File Formats + +### PDF + +- Embedded images are extracted by position (via `page.images` / page XObjects) and OCR'd inline, interleaved with the surrounding text in vertical reading order. +- **Scanned PDFs** (pages with no extractable text) are detected automatically: each page is rendered at 300 DPI and sent to the LLM as a full-page image. +- **Malformed PDFs** that pdfplumber/pdfminer cannot open (e.g. truncated EOF) are retried with PyMuPDF page rendering, so content is still recovered. + +### DOCX + +- Images are extracted via document part relationships (`doc.part.rels`). +- OCR is run before the DOCX→HTML→Markdown pipeline executes: placeholder tokens are injected into the HTML so that the markdown converter does not escape the OCR markers, and the final placeholders are replaced with the formatted `*[Image OCR]...[End OCR]*` blocks after conversion. +- Document flow (headings, paragraphs, tables) is fully preserved around the OCR blocks. + +### PPTX + +- Picture shapes, placeholder shapes with images, and images inside groups are all supported. +- Shapes are processed in top-to-bottom, left-to-right reading order per slide. +- If an `llm_client` is configured, the LLM is asked for a description first; OCR is used as the fallback when no description is returned. + +### XLSX + +- Images embedded in worksheets (`sheet._images`) are extracted per sheet. +- Cell position is calculated from the image anchor coordinates (column/row → Excel letter notation). +- Images are listed under a `### Images in this sheet:` section after the sheet's data table — they are not interleaved into the table rows. 
+ +### Output format + +Every extracted OCR block is wrapped as: + +```text +*[Image OCR] +<extracted text> +[End OCR]* +``` + +## Troubleshooting + +### OCR text missing from output + +The most likely cause is a missing `llm_client` or `llm_model`. Verify: + +```python +from openai import OpenAI +from markitdown import MarkItDown + +md = MarkItDown( + enable_plugins=True, + llm_client=OpenAI(), # required + llm_model="gpt-4o", # required +) +``` + +### Plugin not loading + +Confirm the plugin is installed and discovered: + +```bash +markitdown --list-plugins # should show: ocr +``` + +### API errors + +The plugin does not raise on LLM API errors: a failed call is recorded on the OCR result, that image's text is omitted, and conversion continues. Check your API key, quota, and that the chosen model supports vision inputs. + +## Development + +### Running Tests + +```bash +cd packages/markitdown-ocr +pytest tests/ -v +``` + +### Building from Source + +```bash +git clone https://github.com/microsoft/markitdown.git +cd markitdown/packages/markitdown-ocr +pip install -e . +``` + +## Contributing + +Contributions are welcome! See the [MarkItDown repository](https://github.com/microsoft/markitdown) for guidelines. + +## License + +MIT — see [LICENSE](LICENSE). 
+ +## Changelog + +### 0.1.0 (Initial Release) + +- LLM Vision OCR for PDF, DOCX, PPTX, XLSX +- Full-page OCR fallback for scanned PDFs +- Context-aware inline text insertion +- Priority-based converter replacement (no code changes required) diff --git a/packages/markitdown-ocr/pyproject.toml b/packages/markitdown-ocr/pyproject.toml new file mode 100644 index 000000000..eda3cdda5 --- /dev/null +++ b/packages/markitdown-ocr/pyproject.toml @@ -0,0 +1,57 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "markitdown-ocr" +dynamic = ["version"] +description = 'OCR plugin for MarkItDown - Extracts text from images in PDF, DOCX, PPTX, and XLSX via LLM Vision' +readme = "README.md" +requires-python = ">=3.10" +license = "MIT" +keywords = ["markitdown", "ocr", "pdf", "docx", "xlsx", "pptx", "llm", "vision"] +authors = [ + { name = "Contributors", email = "noreply@github.com" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: Implementation :: CPython", +] + +# Core dependencies — matches the file-format libraries markitdown already uses +dependencies = [ + "markitdown>=0.1.0", + "pdfminer.six>=20251230", + "pdfplumber>=0.11.9", + "PyMuPDF>=1.24.0", + "mammoth~=1.11.0", + "python-docx", + "python-pptx", + "pandas", + "openpyxl", + "Pillow>=9.0.0", +] + +# llm_client is passed in by the user (same as for markitdown image descriptions); +# install openai or any OpenAI-compatible SDK separately. 
+[project.optional-dependencies] +llm = [ + "openai>=1.0.0", +] + +[project.urls] +Documentation = "https://github.com/microsoft/markitdown#readme" +Issues = "https://github.com/microsoft/markitdown/issues" +Source = "https://github.com/microsoft/markitdown" + +[tool.hatch.version] +path = "src/markitdown_ocr/__about__.py" + +# CRITICAL: Plugin entry point - MarkItDown will discover this plugin through this entry point +[project.entry-points."markitdown.plugin"] +ocr = "markitdown_ocr" diff --git a/packages/markitdown-ocr/src/markitdown_ocr/__about__.py b/packages/markitdown-ocr/src/markitdown_ocr/__about__.py new file mode 100644 index 000000000..1c700dc50 --- /dev/null +++ b/packages/markitdown-ocr/src/markitdown_ocr/__about__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: 2025-present Contributors +# SPDX-License-Identifier: MIT + +__version__ = "0.1.0" diff --git a/packages/markitdown-ocr/src/markitdown_ocr/__init__.py b/packages/markitdown-ocr/src/markitdown_ocr/__init__.py new file mode 100644 index 000000000..f608e9625 --- /dev/null +++ b/packages/markitdown-ocr/src/markitdown_ocr/__init__.py @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: 2025-present Contributors +# SPDX-License-Identifier: MIT + +""" +markitdown-ocr: OCR plugin for MarkItDown + +Adds LLM Vision-based text extraction from images embedded in PDF, DOCX, PPTX, and XLSX files. 
+""" + +from ._plugin import __plugin_interface_version__, register_converters +from .__about__ import __version__ +from ._ocr_service import ( + OCRResult, + LLMVisionOCRService, +) +from ._pdf_converter_with_ocr import PdfConverterWithOCR +from ._docx_converter_with_ocr import DocxConverterWithOCR +from ._pptx_converter_with_ocr import PptxConverterWithOCR +from ._xlsx_converter_with_ocr import XlsxConverterWithOCR + +__all__ = [ + "__version__", + "__plugin_interface_version__", + "register_converters", + "OCRResult", + "LLMVisionOCRService", + "PdfConverterWithOCR", + "DocxConverterWithOCR", + "PptxConverterWithOCR", + "XlsxConverterWithOCR", +] diff --git a/packages/markitdown-ocr/src/markitdown_ocr/_docx_converter_with_ocr.py b/packages/markitdown-ocr/src/markitdown_ocr/_docx_converter_with_ocr.py new file mode 100644 index 000000000..f2463de11 --- /dev/null +++ b/packages/markitdown-ocr/src/markitdown_ocr/_docx_converter_with_ocr.py @@ -0,0 +1,189 @@ +""" +Enhanced DOCX Converter with OCR support for embedded images. +Extracts images from Word documents and performs OCR while maintaining context. +""" + +import io +import re +import sys +from typing import Any, BinaryIO, Optional + +from markitdown.converters import HtmlConverter +from markitdown.converter_utils.docx.pre_process import pre_process_docx +from markitdown import DocumentConverterResult, StreamInfo +from markitdown._exceptions import ( + MissingDependencyException, + MISSING_DEPENDENCY_MESSAGE, +) +from ._ocr_service import LLMVisionOCRService + +# Try loading dependencies +_dependency_exc_info = None +try: + import mammoth + from docx import Document +except ImportError: + _dependency_exc_info = sys.exc_info() + +# Placeholder injected into HTML so that mammoth never sees the OCR markers. +# Must be a single token with no special markdown characters. 
+_PLACEHOLDER = "MARKITDOWNOCRBLOCK{}" + + +class DocxConverterWithOCR(HtmlConverter): + """ + Enhanced DOCX Converter with OCR support for embedded images. + Maintains document flow while extracting text from images inline. + """ + + def __init__(self, ocr_service: Optional[LLMVisionOCRService] = None): + super().__init__() + self._html_converter = HtmlConverter() + self.ocr_service = ocr_service + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension == ".docx": + return True + + if mimetype.startswith( + "application/vnd.openxmlformats-officedocument.wordprocessingml" + ): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + if _dependency_exc_info is not None: + raise MissingDependencyException( + MISSING_DEPENDENCY_MESSAGE.format( + converter=type(self).__name__, + extension=".docx", + feature="docx", + ) + ) from _dependency_exc_info[1].with_traceback( + _dependency_exc_info[2] + ) # type: ignore[union-attr] + + # Get OCR service if available (from kwargs or instance) + ocr_service: Optional[LLMVisionOCRService] = ( + kwargs.get("ocr_service") or self.ocr_service + ) + + if ocr_service: + # 1. Extract and OCR images — returns raw text per image + file_stream.seek(0) + image_ocr_map = self._extract_and_ocr_images(file_stream, ocr_service) + + # 2. Convert DOCX → HTML via mammoth + file_stream.seek(0) + pre_process_stream = pre_process_docx(file_stream) + html_result = mammoth.convert_to_html( + pre_process_stream, style_map=kwargs.get("style_map") + ).value + + # 3. Replace tags with plain placeholder tokens so that + # mammoth's HTML→markdown step never escapes our OCR markers. + html_with_placeholders, ocr_texts = self._inject_placeholders( + html_result, image_ocr_map + ) + + # 4. 
Convert HTML → markdown + md_result = self._html_converter.convert_string( + html_with_placeholders, **kwargs + ) + md = md_result.markdown + + # 5. Swap placeholders for the actual OCR blocks (post-conversion + # so * and _ are never escaped by the markdown converter). + for i, raw_text in enumerate(ocr_texts): + placeholder = _PLACEHOLDER.format(i) + ocr_block = f"*[Image OCR]\n{raw_text}\n[End OCR]*" + md = md.replace(placeholder, ocr_block) + + return DocumentConverterResult(markdown=md) + else: + # Standard conversion without OCR + style_map = kwargs.get("style_map", None) + pre_process_stream = pre_process_docx(file_stream) + return self._html_converter.convert_string( + mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, + **kwargs, + ) + + def _extract_and_ocr_images( + self, file_stream: BinaryIO, ocr_service: LLMVisionOCRService + ) -> dict[str, str]: + """ + Extract images from DOCX and OCR them. + + Returns: + Dict mapping image relationship IDs to raw OCR text (no markers). + """ + ocr_map = {} + + try: + file_stream.seek(0) + doc = Document(file_stream) + + for rel in doc.part.rels.values(): + if "image" in rel.target_ref.lower(): + try: + image_bytes = rel.target_part.blob + image_stream = io.BytesIO(image_bytes) + ocr_result = ocr_service.extract_text(image_stream) + + if ocr_result.text.strip(): + # Store raw text only — markers added later + ocr_map[rel.rId] = ocr_result.text.strip() + + except Exception: + continue + + except Exception: + pass + + return ocr_map + + def _inject_placeholders( + self, html: str, ocr_map: dict[str, str] + ) -> tuple[str, list[str]]: + """ + Replace tags with numbered placeholder tokens. 
+ + Returns: + (html_with_placeholders, ordered list of raw OCR texts) + """ + if not ocr_map: + return html, [] + + ocr_texts = list(ocr_map.values()) + used: list[int] = [] + + def replace_img(match: re.Match) -> str: # type: ignore[type-arg] + for i in range(len(ocr_texts)): + if i not in used: + used.append(i) + return f"

{_PLACEHOLDER.format(i)}

" + return "" # remove image if all OCR texts already used + + result = re.sub(r"]*>", replace_img, html) + + # Any OCR texts that had no matching tag go at the end + for i in range(len(ocr_texts)): + if i not in used: + result += f"

{_PLACEHOLDER.format(i)}

" + + return result, ocr_texts diff --git a/packages/markitdown-ocr/src/markitdown_ocr/_ocr_service.py b/packages/markitdown-ocr/src/markitdown_ocr/_ocr_service.py new file mode 100644 index 000000000..2885e1f47 --- /dev/null +++ b/packages/markitdown-ocr/src/markitdown_ocr/_ocr_service.py @@ -0,0 +1,110 @@ +""" +OCR Service Layer for MarkItDown +Provides LLM Vision-based image text extraction. +""" + +import base64 +from typing import Any, BinaryIO +from dataclasses import dataclass + +from markitdown import StreamInfo + + +@dataclass +class OCRResult: + """Result from OCR extraction.""" + + text: str + confidence: float | None = None + backend_used: str | None = None + error: str | None = None + + +class LLMVisionOCRService: + """OCR service using LLM vision models (OpenAI-compatible).""" + + def __init__( + self, + client: Any, + model: str, + default_prompt: str | None = None, + ) -> None: + """ + Initialize LLM Vision OCR service. + + Args: + client: OpenAI-compatible client + model: Model name (e.g., 'gpt-4o', 'gemini-2.0-flash') + default_prompt: Default prompt for OCR extraction + """ + self.client = client + self.model = model + self.default_prompt = default_prompt or ( + "Extract all text from this image. " + "Return ONLY the extracted text, maintaining the original " + "layout and order. Do not add any commentary or description." 
+ ) + + def extract_text( + self, + image_stream: BinaryIO, + prompt: str | None = None, + stream_info: StreamInfo | None = None, + **kwargs: Any, + ) -> OCRResult: + """Extract text using LLM vision.""" + if self.client is None: + return OCRResult( + text="", + backend_used="llm_vision", + error="LLM client not configured", + ) + + try: + image_stream.seek(0) + + content_type: str | None = None + if stream_info: + content_type = stream_info.mimetype + + if not content_type: + try: + from PIL import Image + + image_stream.seek(0) + img = Image.open(image_stream) + fmt = img.format.lower() if img.format else "png" + content_type = f"image/{fmt}" + except Exception: + content_type = "image/png" + + image_stream.seek(0) + base64_image = base64.b64encode(image_stream.read()).decode("utf-8") + data_uri = f"data:{content_type};base64,{base64_image}" + + actual_prompt = prompt or self.default_prompt + response = self.client.chat.completions.create( + model=self.model, + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": actual_prompt}, + { + "type": "image_url", + "image_url": {"url": data_uri}, + }, + ], + } + ], + ) + + text = response.choices[0].message.content + return OCRResult( + text=text.strip() if text else "", + backend_used="llm_vision", + ) + except Exception as e: + return OCRResult(text="", backend_used="llm_vision", error=str(e)) + finally: + image_stream.seek(0) diff --git a/packages/markitdown-ocr/src/markitdown_ocr/_pdf_converter_with_ocr.py b/packages/markitdown-ocr/src/markitdown_ocr/_pdf_converter_with_ocr.py new file mode 100644 index 000000000..c1dc0f613 --- /dev/null +++ b/packages/markitdown-ocr/src/markitdown_ocr/_pdf_converter_with_ocr.py @@ -0,0 +1,422 @@ +""" +Enhanced PDF Converter with OCR support for embedded images. +Extracts images from PDFs and performs OCR while maintaining document context. 
+""" + +import io +import sys +from typing import Any, BinaryIO, Optional + +from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo +from markitdown._exceptions import ( + MissingDependencyException, + MISSING_DEPENDENCY_MESSAGE, +) +from ._ocr_service import LLMVisionOCRService + +# Import dependencies +_dependency_exc_info = None +try: + import pdfminer + import pdfminer.high_level + import pdfplumber + from PIL import Image +except ImportError: + _dependency_exc_info = sys.exc_info() + + +def _extract_images_from_page(page: Any) -> list[dict]: + """ + Extract images from a PDF page by rendering page regions. + + Returns: + List of dicts with 'stream', 'bbox', 'name', 'y_pos' keys + """ + images_info = [] + + try: + # Try multiple methods to detect images + images = [] + + # Method 1: Use page.images (standard approach) + if hasattr(page, "images") and page.images: + images = page.images + + # Method 2: If no images found, try underlying PDF objects + if not images and hasattr(page, "objects") and "image" in page.objects: + images = page.objects.get("image", []) + + # Method 3: Try filtering all objects for image types + if not images and hasattr(page, "objects"): + all_objs = page.objects + for obj_type in all_objs.keys(): + if "image" in obj_type.lower() or "xobject" in obj_type.lower(): + potential_imgs = all_objs.get(obj_type, []) + if potential_imgs: + images = potential_imgs + break + + for i, img_dict in enumerate(images): + try: + # Try to get the actual image stream from the PDF + img_stream = None + y_pos = 0 + + # Method A: If img_dict has 'stream' key, use it directly + if "stream" in img_dict and hasattr(img_dict["stream"], "get_data"): + try: + img_bytes = img_dict["stream"].get_data() + + # Try to open as PIL Image to validate/decode + pil_img = Image.open(io.BytesIO(img_bytes)) + + # Convert to RGB if needed (handle CMYK, etc.) 
+ if pil_img.mode not in ("RGB", "L"): + pil_img = pil_img.convert("RGB") + + # Save to stream as PNG + img_stream = io.BytesIO() + pil_img.save(img_stream, format="PNG") + img_stream.seek(0) + + y_pos = img_dict.get("top", 0) + except Exception: + pass + + # Method B: Fallback to rendering page region + if img_stream is None: + x0 = img_dict.get("x0", 0) + y0 = img_dict.get("top", 0) + x1 = img_dict.get("x1", 0) + y1 = img_dict.get("bottom", 0) + y_pos = y0 + + # Check if dimensions are valid + if x1 <= x0 or y1 <= y0: + continue + + # Use pdfplumber's within_bbox to crop, then render + # This preserves coordinate system correctly + bbox = (x0, y0, x1, y1) + cropped_page = page.within_bbox(bbox) + + # Render at 150 DPI (balance between quality and size) + page_img = cropped_page.to_image(resolution=150) + + # Save to stream + img_stream = io.BytesIO() + page_img.original.save(img_stream, format="PNG") + img_stream.seek(0) + + if img_stream: + images_info.append( + { + "stream": img_stream, + "name": f"page_{page.page_number}_img_{i}", + "y_pos": y_pos, + } + ) + + except Exception: + continue + + except Exception: + pass + + return images_info + + +class PdfConverterWithOCR(DocumentConverter): + """ + Enhanced PDF Converter with OCR support for embedded images. + Maintains document structure while extracting text from images inline. 
+ """ + + def __init__(self, ocr_service: Optional[LLMVisionOCRService] = None): + super().__init__() + self.ocr_service = ocr_service + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension == ".pdf": + return True + + if mimetype.startswith("application/pdf") or mimetype.startswith( + "application/x-pdf" + ): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + if _dependency_exc_info is not None: + raise MissingDependencyException( + MISSING_DEPENDENCY_MESSAGE.format( + converter=type(self).__name__, + extension=".pdf", + feature="pdf", + ) + ) from _dependency_exc_info[1].with_traceback( + _dependency_exc_info[2] + ) # type: ignore[union-attr] + + # Get OCR service if available (from kwargs or instance) + ocr_service: LLMVisionOCRService | None = ( + kwargs.get("ocr_service") or self.ocr_service + ) + + # Read PDF into BytesIO + file_stream.seek(0) + pdf_bytes = io.BytesIO(file_stream.read()) + + markdown_content = [] + + try: + with pdfplumber.open(pdf_bytes) as pdf: + for page_num, page in enumerate(pdf.pages, 1): + markdown_content.append(f"\n## Page {page_num}\n") + + # If OCR is enabled, interleave text and images by position + if ocr_service: + images_on_page = self._extract_page_images(pdf_bytes, page_num) + + if images_on_page: + # Extract text lines with Y positions + chars = page.chars + if chars: + # Group chars into lines based on Y position + lines_with_y = [] + current_line = [] + current_y = None + + for char in sorted( + chars, key=lambda c: (c["top"], c["x0"]) + ): + y = char["top"] + if current_y is None: + current_y = y + elif abs(y - current_y) > 2: # New line threshold + if current_line: + text = "".join( + [c["text"] for c in current_line] + ) + lines_with_y.append( + 
{"y": current_y, "text": text.strip()} + ) + current_line = [] + current_y = y + current_line.append(char) + + # Add last line + if current_line: + text = "".join([c["text"] for c in current_line]) + lines_with_y.append( + {"y": current_y, "text": text.strip()} + ) + else: + # Fallback: use simple text extraction + text_content = page.extract_text() or "" + lines_with_y = [ + {"y": i * 10, "text": line} + for i, line in enumerate(text_content.split("\n")) + ] + + # OCR all images + image_data = [] + for img_info in images_on_page: + ocr_result = ocr_service.extract_text( + img_info["stream"] + ) + if ocr_result.text.strip(): + image_data.append( + { + "y_pos": img_info["y_pos"], + "name": img_info["name"], + "ocr_text": ocr_result.text, + "backend": ocr_result.backend_used, + "type": "image", + } + ) + + # Add text items + content_items = [ + { + "y_pos": item["y"], + "text": item["text"], + "type": "text", + } + for item in lines_with_y + if item["text"] + ] + content_items.extend(image_data) + + # Sort all items by Y position (top to bottom) + content_items.sort(key=lambda x: x["y_pos"]) + + # Build markdown by interleaving text and images + for item in content_items: + if item["type"] == "text": + markdown_content.append(item["text"]) + else: # image + ocr_text = item["ocr_text"] + img_marker = ( + f"\n\n*[Image OCR]\n{ocr_text}\n[End OCR]*\n" + ) + markdown_content.append(img_marker) + else: + # No images detected - just extract regular text + text_content = page.extract_text() or "" + if text_content.strip(): + markdown_content.append(text_content.strip()) + else: + # No OCR, just extract text + text_content = page.extract_text() or "" + if text_content.strip(): + markdown_content.append(text_content.strip()) + + # Build final markdown + markdown = "\n\n".join(markdown_content).strip() + + # Fallback to pdfminer if empty + if not markdown: + pdf_bytes.seek(0) + markdown = pdfminer.high_level.extract_text(pdf_bytes) + + except Exception: + # Fallback to 
pdfminer + try: + pdf_bytes.seek(0) + markdown = pdfminer.high_level.extract_text(pdf_bytes) + except Exception: + markdown = "" + + # Final fallback: If still empty/whitespace and OCR is available, + # treat as scanned PDF and OCR full pages + if ocr_service and (not markdown or not markdown.strip()): + pdf_bytes.seek(0) + markdown = self._ocr_full_pages(pdf_bytes, ocr_service) + + return DocumentConverterResult(markdown=markdown) + + def _extract_page_images(self, pdf_bytes: io.BytesIO, page_num: int) -> list[dict]: + """ + Extract images from a PDF page using pdfplumber. + + Args: + pdf_bytes: PDF file as BytesIO + page_num: Page number (1-indexed) + + Returns: + List of image info dicts with 'stream', 'bbox', 'name', 'y_pos' + """ + images = [] + + try: + pdf_bytes.seek(0) + with pdfplumber.open(pdf_bytes) as pdf: + if page_num <= len(pdf.pages): + page = pdf.pages[page_num - 1] # 0-indexed + images = _extract_images_from_page(page) + except Exception: + pass + + # Sort by vertical position (top to bottom) + images.sort(key=lambda x: x["y_pos"]) + + return images + + def _ocr_full_pages( + self, pdf_bytes: io.BytesIO, ocr_service: LLMVisionOCRService + ) -> str: + """ + Fallback for scanned PDFs: Convert entire pages to images and OCR them. + Used when text extraction returns empty/whitespace results. 
+ + Args: + pdf_bytes: PDF file as BytesIO + ocr_service: OCR service to use + + Returns: + Markdown text extracted from OCR of full pages + """ + markdown_parts = [] + + try: + pdf_bytes.seek(0) + with pdfplumber.open(pdf_bytes) as pdf: + for page_num, page in enumerate(pdf.pages, 1): + try: + markdown_parts.append(f"\n## Page {page_num}\n") + + # Render page to image + page_img = page.to_image(resolution=300) + img_stream = io.BytesIO() + page_img.original.save(img_stream, format="PNG") + img_stream.seek(0) + + # Run OCR + ocr_result = ocr_service.extract_text(img_stream) + + if ocr_result.text.strip(): + text = ocr_result.text.strip() + markdown_parts.append(f"*[Image OCR]\n{text}\n[End OCR]*") + else: + markdown_parts.append( + "*[No text could be extracted from this page]*" + ) + + except Exception as e: + markdown_parts.append( + f"*[Error processing page {page_num}: {str(e)}]*" + ) + continue + + except Exception: + # pdfplumber failed (e.g. malformed EOF) — try PyMuPDF for rendering + markdown_parts = [] + try: + import fitz # PyMuPDF + + pdf_bytes.seek(0) + doc = fitz.open(stream=pdf_bytes.read(), filetype="pdf") + for page_num in range(1, doc.page_count + 1): + try: + markdown_parts.append(f"\n## Page {page_num}\n") + page = doc[page_num - 1] + mat = fitz.Matrix(300 / 72, 300 / 72) # 300 DPI + pix = page.get_pixmap(matrix=mat) + img_stream = io.BytesIO(pix.tobytes("png")) + img_stream.seek(0) + + ocr_result = ocr_service.extract_text(img_stream) + + if ocr_result.text.strip(): + text = ocr_result.text.strip() + markdown_parts.append(f"*[Image OCR]\n{text}\n[End OCR]*") + else: + markdown_parts.append( + "*[No text could be extracted from this page]*" + ) + + except Exception as e: + markdown_parts.append( + f"*[Error processing page {page_num}: {str(e)}]*" + ) + continue + doc.close() + except Exception: + return "*[Error: Could not process scanned PDF]*" + + return "\n\n".join(markdown_parts).strip() diff --git 
a/packages/markitdown-ocr/src/markitdown_ocr/_plugin.py b/packages/markitdown-ocr/src/markitdown_ocr/_plugin.py new file mode 100644 index 000000000..f4d7bcf5a --- /dev/null +++ b/packages/markitdown-ocr/src/markitdown_ocr/_plugin.py @@ -0,0 +1,68 @@ +""" +Plugin registration for markitdown-ocr. +Registers OCR-enhanced converters with priority-based replacement strategy. +""" + +from typing import Any +from markitdown import MarkItDown + +from ._ocr_service import LLMVisionOCRService +from ._pdf_converter_with_ocr import PdfConverterWithOCR +from ._docx_converter_with_ocr import DocxConverterWithOCR +from ._pptx_converter_with_ocr import PptxConverterWithOCR +from ._xlsx_converter_with_ocr import XlsxConverterWithOCR + + +__plugin_interface_version__ = 1 + + +def register_converters(markitdown: MarkItDown, **kwargs: Any) -> None: + """ + Register OCR-enhanced converters with MarkItDown. + + This plugin provides OCR support for PDF, DOCX, PPTX, and XLSX files. + The converters are registered with priority -1.0 to run BEFORE built-in + converters (which have priority 0.0), effectively replacing them when + the plugin is enabled. 
+ + Args: + markitdown: MarkItDown instance to register converters with + **kwargs: Additional keyword arguments that may include: + - llm_client: OpenAI-compatible client for LLM-based OCR (required for OCR to work) + - llm_model: Model name (e.g., 'gpt-4o') + - llm_prompt: Custom prompt for text extraction + """ + # Create OCR service — reads the same llm_client/llm_model kwargs + # that MarkItDown itself already accepts for image descriptions + llm_client = kwargs.get("llm_client") + llm_model = kwargs.get("llm_model") + llm_prompt = kwargs.get("llm_prompt") + + ocr_service: LLMVisionOCRService | None = None + if llm_client and llm_model: + ocr_service = LLMVisionOCRService( + client=llm_client, + model=llm_model, + default_prompt=llm_prompt, + ) + + # Register converters with priority -1.0 (before built-ins at 0.0) + # This effectively "replaces" the built-in converters when plugin is installed + # Pass the OCR service to each converter's constructor + PRIORITY_OCR_ENHANCED = -1.0 + + markitdown.register_converter( + PdfConverterWithOCR(ocr_service=ocr_service), priority=PRIORITY_OCR_ENHANCED + ) + + markitdown.register_converter( + DocxConverterWithOCR(ocr_service=ocr_service), priority=PRIORITY_OCR_ENHANCED + ) + + markitdown.register_converter( + PptxConverterWithOCR(ocr_service=ocr_service), priority=PRIORITY_OCR_ENHANCED + ) + + markitdown.register_converter( + XlsxConverterWithOCR(ocr_service=ocr_service), priority=PRIORITY_OCR_ENHANCED + ) diff --git a/packages/markitdown-ocr/src/markitdown_ocr/_pptx_converter_with_ocr.py b/packages/markitdown-ocr/src/markitdown_ocr/_pptx_converter_with_ocr.py new file mode 100644 index 000000000..7e91ed6b4 --- /dev/null +++ b/packages/markitdown-ocr/src/markitdown_ocr/_pptx_converter_with_ocr.py @@ -0,0 +1,249 @@ +""" +Enhanced PPTX Converter with improved OCR support. +Already has LLM-based image description, this enhances it with traditional OCR fallback. 
+""" + +import io +import sys +from typing import Any, BinaryIO, Optional + +from typing import BinaryIO, Any, Optional + +from markitdown.converters import HtmlConverter +from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo +from markitdown._exceptions import ( + MissingDependencyException, + MISSING_DEPENDENCY_MESSAGE, +) +from ._ocr_service import LLMVisionOCRService + +_dependency_exc_info = None +try: + import pptx +except ImportError: + _dependency_exc_info = sys.exc_info() + + +class PptxConverterWithOCR(DocumentConverter): + """Enhanced PPTX Converter with OCR fallback.""" + + def __init__(self, ocr_service: Optional[LLMVisionOCRService] = None): + super().__init__() + self._html_converter = HtmlConverter() + self.ocr_service = ocr_service + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension == ".pptx": + return True + + if mimetype.startswith( + "application/vnd.openxmlformats-officedocument.presentationml" + ): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + if _dependency_exc_info is not None: + raise MissingDependencyException( + MISSING_DEPENDENCY_MESSAGE.format( + converter=type(self).__name__, + extension=".pptx", + feature="pptx", + ) + ) from _dependency_exc_info[1].with_traceback( + _dependency_exc_info[2] + ) # type: ignore[union-attr] + + # Get OCR service (from kwargs or instance) + ocr_service: Optional[LLMVisionOCRService] = ( + kwargs.get("ocr_service") or self.ocr_service + ) + llm_client = kwargs.get("llm_client") + + presentation = pptx.Presentation(file_stream) + md_content = "" + slide_num = 0 + + for slide in presentation.slides: + slide_num += 1 + md_content += f"\\n\\n\\n" + + title = slide.shapes.title + + def 
get_shape_content(shape, **kwargs): + nonlocal md_content + + # Pictures + if self._is_picture(shape): + # Get image data + image_stream = io.BytesIO(shape.image.blob) + + # Try LLM description first if available + llm_description = "" + if llm_client and kwargs.get("llm_model"): + try: + from ._llm_caption import llm_caption + + image_filename = shape.image.filename + image_extension = None + if image_filename: + import os + + image_extension = os.path.splitext(image_filename)[1] + + image_stream_info = StreamInfo( + mimetype=shape.image.content_type, + extension=image_extension, + filename=image_filename, + ) + + llm_description = llm_caption( + image_stream, + image_stream_info, + client=llm_client, + model=kwargs.get("llm_model"), + prompt=kwargs.get("llm_prompt"), + ) + except Exception: + pass + + # Try OCR if LLM failed or not available + ocr_text = "" + if not llm_description and ocr_service: + try: + image_stream.seek(0) + ocr_result = ocr_service.extract_text(image_stream) + if ocr_result.text.strip(): + ocr_text = ocr_result.text.strip() + except Exception: + pass + + # Format extracted content using unified OCR block format + content = (llm_description or ocr_text or "").strip() + if content: + md_content += f"\n*[Image OCR]\n{content}\n[End OCR]*\n" + + # Tables + if self._is_table(shape): + md_content += self._convert_table_to_markdown(shape.table, **kwargs) + + # Charts + if shape.has_chart: + md_content += self._convert_chart_to_markdown(shape.chart) + + # Text areas + elif shape.has_text_frame: + if shape == title: + md_content += "# " + shape.text.lstrip() + "\\n" + else: + md_content += shape.text + "\\n" + + # Group Shapes + if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP: + sorted_shapes = sorted( + shape.shapes, + key=lambda x: ( + float("-inf") if not x.top else x.top, + float("-inf") if not x.left else x.left, + ), + ) + for subshape in sorted_shapes: + get_shape_content(subshape, **kwargs) + + sorted_shapes = sorted( + 
slide.shapes, + key=lambda x: ( + float("-inf") if not x.top else x.top, + float("-inf") if not x.left else x.left, + ), + ) + for shape in sorted_shapes: + get_shape_content(shape, **kwargs) + + md_content = md_content.strip() + + if slide.has_notes_slide: + md_content += "\\n\\n### Notes:\\n" + notes_frame = slide.notes_slide.notes_text_frame + if notes_frame is not None: + md_content += notes_frame.text + md_content = md_content.strip() + + return DocumentConverterResult(markdown=md_content.strip()) + + def _is_picture(self, shape): + if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE: + return True + if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER: + if hasattr(shape, "image"): + return True + return False + + def _is_table(self, shape): + if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE: + return True + return False + + def _convert_table_to_markdown(self, table, **kwargs): + import html + + html_table = "" + first_row = True + for row in table.rows: + html_table += "" + for cell in row.cells: + if first_row: + html_table += "" + else: + html_table += "" + html_table += "" + first_row = False + html_table += "
" + html.escape(cell.text) + "" + html.escape(cell.text) + "
" + + return ( + self._html_converter.convert_string(html_table, **kwargs).markdown.strip() + + "\\n" + ) + + def _convert_chart_to_markdown(self, chart): + try: + md = "\\n\\n### Chart" + if chart.has_title: + md += f": {chart.chart_title.text_frame.text}" + md += "\\n\\n" + data = [] + category_names = [c.label for c in chart.plots[0].categories] + series_names = [s.name for s in chart.series] + data.append(["Category"] + series_names) + + for idx, category in enumerate(category_names): + row = [category] + for series in chart.series: + row.append(series.values[idx]) + data.append(row) + + markdown_table = [] + for row in data: + markdown_table.append("| " + " | ".join(map(str, row)) + " |") + header = markdown_table[0] + separator = "|" + "|".join(["---"] * len(data[0])) + "|" + return md + "\\n".join([header, separator] + markdown_table[1:]) + except ValueError as e: + if "unsupported plot type" in str(e): + return "\\n\\n[unsupported chart]\\n\\n" + except Exception: + return "\\n\\n[unsupported chart]\\n\\n" diff --git a/packages/markitdown-ocr/src/markitdown_ocr/_xlsx_converter_with_ocr.py b/packages/markitdown-ocr/src/markitdown_ocr/_xlsx_converter_with_ocr.py new file mode 100644 index 000000000..481e07195 --- /dev/null +++ b/packages/markitdown-ocr/src/markitdown_ocr/_xlsx_converter_with_ocr.py @@ -0,0 +1,225 @@ +""" +Enhanced XLSX Converter with OCR support for embedded images. +Extracts images from Excel spreadsheets and performs OCR while maintaining cell context. 
+""" + +import io +import sys +from typing import Any, BinaryIO, Optional + +from markitdown.converters import HtmlConverter +from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo +from markitdown._exceptions import ( + MissingDependencyException, + MISSING_DEPENDENCY_MESSAGE, +) +from ._ocr_service import LLMVisionOCRService + +# Try loading dependencies +_xlsx_dependency_exc_info = None +try: + import pandas as pd + from openpyxl import load_workbook +except ImportError: + _xlsx_dependency_exc_info = sys.exc_info() + + +class XlsxConverterWithOCR(DocumentConverter): + """ + Enhanced XLSX Converter with OCR support for embedded images. + Extracts images with their cell positions and performs OCR. + """ + + def __init__(self, ocr_service: Optional[LLMVisionOCRService] = None): + super().__init__() + self._html_converter = HtmlConverter() + self.ocr_service = ocr_service + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + if extension == ".xlsx": + return True + + if mimetype.startswith( + "application/vnd.openxmlformats-officedocument.spreadsheetml" + ): + return True + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + if _xlsx_dependency_exc_info is not None: + raise MissingDependencyException( + MISSING_DEPENDENCY_MESSAGE.format( + converter=type(self).__name__, + extension=".xlsx", + feature="xlsx", + ) + ) from _xlsx_dependency_exc_info[1].with_traceback( + _xlsx_dependency_exc_info[2] + ) # type: ignore[union-attr] + + # Get OCR service if available (from kwargs or instance) + ocr_service: Optional[LLMVisionOCRService] = ( + kwargs.get("ocr_service") or self.ocr_service + ) + + if ocr_service: + # Remove ocr_service from kwargs to avoid duplicate argument error + kwargs_without_ocr = {k: 
v for k, v in kwargs.items() if k != "ocr_service"} + return self._convert_with_ocr( + file_stream, ocr_service, **kwargs_without_ocr + ) + else: + return self._convert_standard(file_stream, **kwargs) + + def _convert_standard( + self, file_stream: BinaryIO, **kwargs: Any + ) -> DocumentConverterResult: + """Standard conversion without OCR.""" + file_stream.seek(0) + sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl") + md_content = "" + + for sheet_name in sheets: + md_content += f"## {sheet_name}\n" + html_content = sheets[sheet_name].to_html(index=False) + md_content += ( + self._html_converter.convert_string( + html_content, **kwargs + ).markdown.strip() + + "\n\n" + ) + + return DocumentConverterResult(markdown=md_content.strip()) + + def _convert_with_ocr( + self, file_stream: BinaryIO, ocr_service: LLMVisionOCRService, **kwargs: Any + ) -> DocumentConverterResult: + """Convert XLSX with image OCR.""" + file_stream.seek(0) + wb = load_workbook(file_stream) + + md_content = "" + + for sheet_name in wb.sheetnames: + sheet = wb[sheet_name] + md_content += f"## {sheet_name}\n\n" + + # Convert sheet data to markdown table + file_stream.seek(0) + try: + df = pd.read_excel( + file_stream, sheet_name=sheet_name, engine="openpyxl" + ) + html_content = df.to_html(index=False) + md_content += ( + self._html_converter.convert_string( + html_content, **kwargs + ).markdown.strip() + + "\n\n" + ) + except Exception: + # If pandas fails, just skip the table + pass + + # Extract and OCR images in this sheet + images_with_ocr = self._extract_and_ocr_sheet_images(sheet, ocr_service) + + if images_with_ocr: + md_content += "### Images in this sheet:\n\n" + for img_info in images_with_ocr: + ocr_text = img_info["ocr_text"] + md_content += f"*[Image OCR]\n{ocr_text}\n[End OCR]*\n\n" + + return DocumentConverterResult(markdown=md_content.strip()) + + def _extract_and_ocr_sheet_images( + self, sheet: Any, ocr_service: LLMVisionOCRService + ) -> list[dict]: + """ 
+ Extract and OCR images from an Excel sheet. + + Args: + sheet: openpyxl worksheet + ocr_service: OCR service + + Returns: + List of dicts with 'cell_ref' and 'ocr_text' + """ + results = [] + + try: + # Check if sheet has images + if hasattr(sheet, "_images"): + for img in sheet._images: + try: + # Get image data + if hasattr(img, "_data"): + image_data = img._data() + elif hasattr(img, "image"): + # Some versions store it differently + image_data = img.image + else: + continue + + # Create image stream + image_stream = io.BytesIO(image_data) + + # Get cell reference + cell_ref = "unknown" + if hasattr(img, "anchor"): + anchor = img.anchor + if hasattr(anchor, "_from"): + from_cell = anchor._from + if hasattr(from_cell, "col") and hasattr( + from_cell, "row" + ): + # Convert column number to letter + col_letter = self._column_number_to_letter( + from_cell.col + ) + cell_ref = f"{col_letter}{from_cell.row + 1}" + + # Perform OCR + ocr_result = ocr_service.extract_text(image_stream) + + if ocr_result.text.strip(): + results.append( + { + "cell_ref": cell_ref, + "ocr_text": ocr_result.text.strip(), + "backend": ocr_result.backend_used, + } + ) + + except Exception: + continue + + except Exception: + pass + + return results + + @staticmethod + def _column_number_to_letter(n: int) -> str: + """Convert column number to Excel column letter (0-indexed).""" + result = "" + n = n + 1 # Make 1-indexed + while n > 0: + n -= 1 + result = chr(65 + (n % 26)) + result + n //= 26 + return result diff --git a/packages/markitdown-ocr/tests/__init__.py b/packages/markitdown-ocr/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/packages/markitdown-ocr/tests/ocr_test_data/docx_complex_layout.docx b/packages/markitdown-ocr/tests/ocr_test_data/docx_complex_layout.docx new file mode 100644 index 000000000..4ddd69746 Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/docx_complex_layout.docx differ diff --git 
a/packages/markitdown-ocr/tests/ocr_test_data/docx_image_end.docx b/packages/markitdown-ocr/tests/ocr_test_data/docx_image_end.docx new file mode 100644 index 000000000..f2a9a8694 Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/docx_image_end.docx differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/docx_image_middle.docx b/packages/markitdown-ocr/tests/ocr_test_data/docx_image_middle.docx new file mode 100644 index 000000000..200f3c6c7 Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/docx_image_middle.docx differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/docx_image_start.docx b/packages/markitdown-ocr/tests/ocr_test_data/docx_image_start.docx new file mode 100644 index 000000000..7855bd166 Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/docx_image_start.docx differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/docx_multipage.docx b/packages/markitdown-ocr/tests/ocr_test_data/docx_multipage.docx new file mode 100644 index 000000000..c698b0fa2 Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/docx_multipage.docx differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/docx_multiple_images.docx b/packages/markitdown-ocr/tests/ocr_test_data/docx_multiple_images.docx new file mode 100644 index 000000000..790ce0bcb Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/docx_multiple_images.docx differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/pdf_complex_layout.pdf b/packages/markitdown-ocr/tests/ocr_test_data/pdf_complex_layout.pdf new file mode 100644 index 000000000..f843ab891 Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/pdf_complex_layout.pdf differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/pdf_image_end.pdf b/packages/markitdown-ocr/tests/ocr_test_data/pdf_image_end.pdf new file mode 100644 index 000000000..8b020edf6 Binary files /dev/null and 
b/packages/markitdown-ocr/tests/ocr_test_data/pdf_image_end.pdf differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/pdf_image_middle.pdf b/packages/markitdown-ocr/tests/ocr_test_data/pdf_image_middle.pdf new file mode 100644 index 000000000..d90bc9d3e Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/pdf_image_middle.pdf differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/pdf_image_start.pdf b/packages/markitdown-ocr/tests/ocr_test_data/pdf_image_start.pdf new file mode 100644 index 000000000..0b57b7f96 Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/pdf_image_start.pdf differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/pdf_multipage.pdf b/packages/markitdown-ocr/tests/ocr_test_data/pdf_multipage.pdf new file mode 100644 index 000000000..71ffe8d83 Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/pdf_multipage.pdf differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/pdf_multiple_images.pdf b/packages/markitdown-ocr/tests/ocr_test_data/pdf_multiple_images.pdf new file mode 100644 index 000000000..8a5e47416 Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/pdf_multiple_images.pdf differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/pdf_scanned_invoice.pdf b/packages/markitdown-ocr/tests/ocr_test_data/pdf_scanned_invoice.pdf new file mode 100644 index 000000000..5e1caacc5 Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/pdf_scanned_invoice.pdf differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/pdf_scanned_meeting_minutes.pdf b/packages/markitdown-ocr/tests/ocr_test_data/pdf_scanned_meeting_minutes.pdf new file mode 100644 index 000000000..33c717bed Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/pdf_scanned_meeting_minutes.pdf differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/pdf_scanned_minimal.pdf 
b/packages/markitdown-ocr/tests/ocr_test_data/pdf_scanned_minimal.pdf new file mode 100644 index 000000000..9410339e3 Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/pdf_scanned_minimal.pdf differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/pdf_scanned_report.pdf b/packages/markitdown-ocr/tests/ocr_test_data/pdf_scanned_report.pdf new file mode 100644 index 000000000..4c2112ff7 Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/pdf_scanned_report.pdf differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/pdf_scanned_sales_report.pdf b/packages/markitdown-ocr/tests/ocr_test_data/pdf_scanned_sales_report.pdf new file mode 100644 index 000000000..178c63826 Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/pdf_scanned_sales_report.pdf differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/pptx_complex_layout.pptx b/packages/markitdown-ocr/tests/ocr_test_data/pptx_complex_layout.pptx new file mode 100644 index 000000000..10467ea0e Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/pptx_complex_layout.pptx differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/pptx_image_end.pptx b/packages/markitdown-ocr/tests/ocr_test_data/pptx_image_end.pptx new file mode 100644 index 000000000..1ed9804cd Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/pptx_image_end.pptx differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/pptx_image_middle.pptx b/packages/markitdown-ocr/tests/ocr_test_data/pptx_image_middle.pptx new file mode 100644 index 000000000..315586a23 Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/pptx_image_middle.pptx differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/pptx_image_start.pptx b/packages/markitdown-ocr/tests/ocr_test_data/pptx_image_start.pptx new file mode 100644 index 000000000..32a50aa8c Binary files /dev/null and 
b/packages/markitdown-ocr/tests/ocr_test_data/pptx_image_start.pptx differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/pptx_multiple_images.pptx b/packages/markitdown-ocr/tests/ocr_test_data/pptx_multiple_images.pptx new file mode 100644 index 000000000..a8eaa4dee Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/pptx_multiple_images.pptx differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/xlsx_complex_layout.xlsx b/packages/markitdown-ocr/tests/ocr_test_data/xlsx_complex_layout.xlsx new file mode 100644 index 000000000..6052c1e30 Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/xlsx_complex_layout.xlsx differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/xlsx_image_end.xlsx b/packages/markitdown-ocr/tests/ocr_test_data/xlsx_image_end.xlsx new file mode 100644 index 000000000..3e26b33fd Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/xlsx_image_end.xlsx differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/xlsx_image_middle.xlsx b/packages/markitdown-ocr/tests/ocr_test_data/xlsx_image_middle.xlsx new file mode 100644 index 000000000..2a6c91b77 Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/xlsx_image_middle.xlsx differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/xlsx_image_start.xlsx b/packages/markitdown-ocr/tests/ocr_test_data/xlsx_image_start.xlsx new file mode 100644 index 000000000..9e461821a Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/xlsx_image_start.xlsx differ diff --git a/packages/markitdown-ocr/tests/ocr_test_data/xlsx_multiple_images.xlsx b/packages/markitdown-ocr/tests/ocr_test_data/xlsx_multiple_images.xlsx new file mode 100644 index 000000000..eb8d0cfe6 Binary files /dev/null and b/packages/markitdown-ocr/tests/ocr_test_data/xlsx_multiple_images.xlsx differ diff --git a/packages/markitdown-ocr/tests/test_docx_converter.py 
b/packages/markitdown-ocr/tests/test_docx_converter.py new file mode 100644 index 000000000..0fb666504 --- /dev/null +++ b/packages/markitdown-ocr/tests/test_docx_converter.py @@ -0,0 +1,223 @@ +""" +Unit tests for DocxConverterWithOCR. + +For each DOCX test file: convert with a mock OCR service then compare the +full output string against the expected snapshot. + +OCR block format used by the converter: + *[Image OCR] + MOCK_OCR_TEXT_12345 + [End OCR]* +""" + +import sys +from pathlib import Path +from typing import Any + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from markitdown_ocr._ocr_service import OCRResult # noqa: E402 +from markitdown_ocr._docx_converter_with_ocr import ( # noqa: E402 + DocxConverterWithOCR, +) +from markitdown import StreamInfo # noqa: E402 + +TEST_DATA_DIR = Path(__file__).parent / "ocr_test_data" + +_MOCK_TEXT = "MOCK_OCR_TEXT_12345" + + +class MockOCRService: + def extract_text( # noqa: ANN101 + self, image_stream: Any, **kwargs: Any + ) -> OCRResult: + return OCRResult(text=_MOCK_TEXT, backend_used="mock") + + +@pytest.fixture(scope="module") +def svc() -> MockOCRService: + return MockOCRService() + + +def _convert(filename: str, ocr_service: MockOCRService) -> str: + path = TEST_DATA_DIR / filename + if not path.exists(): + pytest.skip(f"Test file not found: {path}") + converter = DocxConverterWithOCR() + with open(path, "rb") as f: + return converter.convert( + f, StreamInfo(extension=".docx"), ocr_service=ocr_service + ).text_content + + +# --------------------------------------------------------------------------- +# docx_image_start.docx +# --------------------------------------------------------------------------- + + +def test_docx_image_start(svc: MockOCRService) -> None: + expected = ( + "Document with Image at Start\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n" + "This is the main content after the header image.\n\n" + "More text content here." 
+ ) + assert _convert("docx_image_start.docx", svc) == expected + + +# --------------------------------------------------------------------------- +# docx_image_middle.docx +# --------------------------------------------------------------------------- + + +def test_docx_image_middle(svc: MockOCRService) -> None: + expected = ( + "# Introduction\n\n" + "This is the introduction section.\n\n" + "We will see an image below.\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n" + "# Analysis\n\n" + "This section comes after the image." + ) + assert _convert("docx_image_middle.docx", svc) == expected + + +# --------------------------------------------------------------------------- +# docx_image_end.docx +# --------------------------------------------------------------------------- + + +def test_docx_image_end(svc: MockOCRService) -> None: + expected = ( + "Report\n\n" + "Main findings of the report.\n\n" + "Details and analysis.\n\n" + "Recommendations.\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + ) + assert _convert("docx_image_end.docx", svc) == expected + + +# --------------------------------------------------------------------------- +# docx_multiple_images.docx +# --------------------------------------------------------------------------- + + +def test_docx_multiple_images(svc: MockOCRService) -> None: + expected = ( + "Multi-Image Document\n\n" + "First section\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n" + "Second section with another image\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n" + "Conclusion" + ) + assert _convert("docx_multiple_images.docx", svc) == expected + + +# --------------------------------------------------------------------------- +# docx_multipage.docx +# --------------------------------------------------------------------------- + + +def test_docx_multipage(svc: MockOCRService) -> None: + expected = ( + "# Page 1 - Mixed Content\n\n" + "This is the first paragraph on page 1.\n\n" + "BEFORE IMAGE: 
Important content appears here.\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n" + "AFTER IMAGE: This content follows the image.\n\n" + "More text on page 1.\n\n" + "# Page 2 - Image at End\n\n" + "Content on page 2.\n\n" + "Multiple paragraphs of text.\n\n" + "Building up to the image...\n\n" + "Final paragraph before image.\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n" + "# Page 3 - Image at Start\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n" + "Content that follows the header image.\n\n" + "AFTER IMAGE: This text is after the image." + ) + assert _convert("docx_multipage.docx", svc) == expected + + +# --------------------------------------------------------------------------- +# docx_complex_layout.docx +# --------------------------------------------------------------------------- + + +def test_docx_complex_layout(svc: MockOCRService) -> None: + expected = ( + "Complex Document\n\n" + "| | |\n" + "| --- | --- |\n" + "| Feature | Status |\n" + "| Authentication | Active |\n" + "| Encryption | Enabled |\n\n" + "Security notice:\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + ) + assert _convert("docx_complex_layout.docx", svc) == expected + + +# --------------------------------------------------------------------------- +# _inject_placeholders — internal unit tests (no file I/O) +# --------------------------------------------------------------------------- + + +def test_inject_placeholders_single_image() -> None: + converter = DocxConverterWithOCR() + html = "

Before

After

" + result_html, texts = converter._inject_placeholders(html, {"rId1": "TEXT"}) + assert " None: + converter = DocxConverterWithOCR() + html = "

Mid

" + result_html, texts = converter._inject_placeholders( + html, {"rId1": "FIRST", "rId2": "SECOND"} + ) + assert "MARKITDOWNOCRBLOCK0" in result_html + assert "MARKITDOWNOCRBLOCK1" in result_html + assert result_html.index("MARKITDOWNOCRBLOCK0") < result_html.index( + "MARKITDOWNOCRBLOCK1" + ) + assert len(texts) == 2 + + +def test_inject_placeholders_no_img_tag_appends_at_end() -> None: + converter = DocxConverterWithOCR() + html = "

No images

" + result_html, texts = converter._inject_placeholders(html, {"rId1": "ORPHAN"}) + assert "MARKITDOWNOCRBLOCK0" in result_html + assert texts == ["ORPHAN"] + + +def test_inject_placeholders_empty_map_leaves_html_unchanged() -> None: + converter = DocxConverterWithOCR() + html = "

Content

" + result_html, texts = converter._inject_placeholders(html, {}) + assert result_html == html + assert texts == [] + + +# --------------------------------------------------------------------------- +# No OCR service — no OCR tags emitted +# --------------------------------------------------------------------------- + + +def test_docx_no_ocr_service_no_tags() -> None: + path = TEST_DATA_DIR / "docx_image_middle.docx" + if not path.exists(): + pytest.skip(f"Test file not found: {path}") + converter = DocxConverterWithOCR() + with open(path, "rb") as f: + md = converter.convert(f, StreamInfo(extension=".docx")).text_content + assert "*[Image OCR]" not in md + assert "[End OCR]*" not in md diff --git a/packages/markitdown-ocr/tests/test_pdf_converter.py b/packages/markitdown-ocr/tests/test_pdf_converter.py new file mode 100644 index 000000000..5d4adcc5e --- /dev/null +++ b/packages/markitdown-ocr/tests/test_pdf_converter.py @@ -0,0 +1,234 @@ +""" +Unit tests for PdfConverterWithOCR. + +For each PDF test file: convert with a mock OCR service then compare the +full output string against the expected snapshot. 
+ +OCR block format used by the converter: + *[Image OCR] + MOCK_OCR_TEXT_12345 + [End OCR]* +""" + +import io +import sys +from pathlib import Path +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from markitdown_ocr._ocr_service import OCRResult # noqa: E402 +from markitdown_ocr._pdf_converter_with_ocr import ( # noqa: E402 + PdfConverterWithOCR, +) +from markitdown import StreamInfo # noqa: E402 + +TEST_DATA_DIR = Path(__file__).parent / "ocr_test_data" + +_MOCK_TEXT = "MOCK_OCR_TEXT_12345" +_OCR_BLOCK = f"*[Image OCR]\n{_MOCK_TEXT}\n[End OCR]*" +_PAGE_1_SCANNED = f"## Page 1\n\n\n\n\n{_OCR_BLOCK}" + + +class MockOCRService: + def extract_text( + self, # noqa: ANN101 + image_stream: Any, + **kwargs: Any, + ) -> OCRResult: + return OCRResult(text=_MOCK_TEXT, backend_used="mock") + + +@pytest.fixture(scope="module") +def svc() -> MockOCRService: + return MockOCRService() + + +def _convert(filename: str, ocr_service: MockOCRService) -> str: + path = TEST_DATA_DIR / filename + if not path.exists(): + pytest.skip(f"Test file not found: {path}") + converter = PdfConverterWithOCR() + with open(path, "rb") as f: + return converter.convert( + f, StreamInfo(extension=".pdf"), ocr_service=ocr_service + ).text_content + + +# --------------------------------------------------------------------------- +# pdf_image_start.pdf +# --------------------------------------------------------------------------- + + +def test_pdf_image_start(svc: MockOCRService) -> None: + expected = ( + "## Page 1\n\n\n\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n\n" + "This is text BEFORE the image.\n\n" + "The image should appear above this text.\n\n" + "This is more content after the image." 
+ ) + assert _convert("pdf_image_start.pdf", svc) == expected + + +# --------------------------------------------------------------------------- +# pdf_image_middle.pdf +# --------------------------------------------------------------------------- + + +def test_pdf_image_middle(svc: MockOCRService) -> None: + expected = ( + "## Page 1\n\n\n" + "Section 1: Introduction\n\n" + "This document contains an image in the middle.\n\n" + "Here is some introductory text.\n\n\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n\n" + "Section 2: Details\n\n" + "This text appears AFTER the image." + ) + assert _convert("pdf_image_middle.pdf", svc) == expected + + +# --------------------------------------------------------------------------- +# pdf_image_end.pdf +# --------------------------------------------------------------------------- + + +def test_pdf_image_end(svc: MockOCRService) -> None: + expected = ( + "## Page 1\n\n\n" + "Main Content\n\n" + "This is the main text content.\n\n" + "The image will appear at the end.\n\n" + "Keep reading...\n\n\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + ) + assert _convert("pdf_image_end.pdf", svc) == expected + + +# --------------------------------------------------------------------------- +# pdf_multiple_images.pdf +# --------------------------------------------------------------------------- + + +def test_pdf_multiple_images(svc: MockOCRService) -> None: + expected = ( + "## Page 1\n\n\n" + "Document with Multiple Images\n\n\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n\n" + "Text between first and second image.\n\n\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n\n" + "Final text after all images." 
+ ) + assert _convert("pdf_multiple_images.pdf", svc) == expected + + +# --------------------------------------------------------------------------- +# pdf_complex_layout.pdf +# --------------------------------------------------------------------------- + + +def test_pdf_complex_layout(svc: MockOCRService) -> None: + expected = ( + "## Page 1\n\n\n" + "Complex Layout Document\n\n" + "Table:\n\n" + "ItemQuantity\n\n\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n\n" + "Widget A5" + ) + assert _convert("pdf_complex_layout.pdf", svc) == expected + + +# --------------------------------------------------------------------------- +# pdf_multipage.pdf — pdfplumber/pdfminer fail (EOF); PyMuPDF fallback used +# --------------------------------------------------------------------------- + + +def test_pdf_multipage(svc: MockOCRService) -> None: + # pdfplumber cannot open this file (Unexpected EOF), so _ocr_full_pages + # falls back to PyMuPDF for page rendering. Each page becomes one OCR block. 
+ expected = ( + f"## Page 1\n\n\n{_OCR_BLOCK}\n\n\n" + f"## Page 2\n\n\n{_OCR_BLOCK}\n\n\n" + f"## Page 3\n\n\n{_OCR_BLOCK}" + ) + assert _convert("pdf_multipage.pdf", svc) == expected + + +# --------------------------------------------------------------------------- +# pdf_scanned_*.pdf — raster-only pages → full-page OCR +# --------------------------------------------------------------------------- + + +def test_pdf_scanned_invoice(svc: MockOCRService) -> None: + assert _convert("pdf_scanned_invoice.pdf", svc) == _PAGE_1_SCANNED + + +def test_pdf_scanned_meeting_minutes(svc: MockOCRService) -> None: + assert _convert("pdf_scanned_meeting_minutes.pdf", svc) == _PAGE_1_SCANNED + + +def test_pdf_scanned_minimal(svc: MockOCRService) -> None: + assert _convert("pdf_scanned_minimal.pdf", svc) == _PAGE_1_SCANNED + + +def test_pdf_scanned_sales_report(svc: MockOCRService) -> None: + assert _convert("pdf_scanned_sales_report.pdf", svc) == _PAGE_1_SCANNED + + +def test_pdf_scanned_report(svc: MockOCRService) -> None: + expected = ( + f"{_PAGE_1_SCANNED}\n\n\n\n" + f"## Page 2\n\n\n\n\n{_OCR_BLOCK}\n\n\n\n" + f"## Page 3\n\n\n\n\n{_OCR_BLOCK}" + ) + assert _convert("pdf_scanned_report.pdf", svc) == expected + + +# --------------------------------------------------------------------------- +# Scanned PDF fallback path (pdfplumber finds no text → full-page OCR) +# --------------------------------------------------------------------------- + + +def test_pdf_scanned_fallback_format(svc: MockOCRService) -> None: + """_ocr_full_pages emits *[Image OCR]...[End OCR]* for each page.""" + path = TEST_DATA_DIR / "pdf_image_start.pdf" + if not path.exists(): + pytest.skip(f"Test file not found: {path}") + + converter = PdfConverterWithOCR() + with patch("pdfplumber.open") as mock_plumber: + mock_pdf = MagicMock() + mock_page = MagicMock() + mock_page.page_number = 1 + mock_pdf.pages = [mock_page] + mock_pdf.__enter__.return_value = mock_pdf + mock_plumber.return_value = mock_pdf + + 
with open(path, "rb") as f: + md = converter._ocr_full_pages(io.BytesIO(f.read()), svc) + + expected = "## Page 1\n\n\n" "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + assert ( + md == expected + ), f"_ocr_full_pages must produce:\n{expected!r}\nActual:\n{md!r}" + + +# --------------------------------------------------------------------------- +# No OCR service — no OCR tags emitted +# --------------------------------------------------------------------------- + + +def test_pdf_no_ocr_service_no_tags() -> None: + path = TEST_DATA_DIR / "pdf_image_middle.pdf" + if not path.exists(): + pytest.skip(f"Test file not found: {path}") + converter = PdfConverterWithOCR() + with open(path, "rb") as f: + md = converter.convert(f, StreamInfo(extension=".pdf")).text_content + assert "*[Image OCR]" not in md + assert "[End OCR]*" not in md diff --git a/packages/markitdown-ocr/tests/test_pptx_converter.py b/packages/markitdown-ocr/tests/test_pptx_converter.py new file mode 100644 index 000000000..724f1039c --- /dev/null +++ b/packages/markitdown-ocr/tests/test_pptx_converter.py @@ -0,0 +1,148 @@ +""" +Unit tests for PptxConverterWithOCR. + +For each PPTX test file: convert with a mock OCR service then compare the +full output string against the expected snapshot. + +OCR block format used by the converter: + *[Image OCR] + MOCK_OCR_TEXT_12345 + [End OCR]* + +Note: PPTX slide text uses literal backslash-n (\\n) sequences from the +underlying PPTX converter template; OCR blocks use real newlines. 
+""" + +import sys +from pathlib import Path +from typing import Any + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from markitdown_ocr._ocr_service import OCRResult # noqa: E402 +from markitdown_ocr._pptx_converter_with_ocr import ( # noqa: E402 + PptxConverterWithOCR, +) +from markitdown import StreamInfo # noqa: E402 + +TEST_DATA_DIR = Path(__file__).parent / "ocr_test_data" + +_MOCK_TEXT = "MOCK_OCR_TEXT_12345" +_OCR_BLOCK = f"*[Image OCR]\n{_MOCK_TEXT}\n[End OCR]*" + + +class MockOCRService: + def extract_text( + self, # noqa: ANN101 + image_stream: Any, + **kwargs: Any, + ) -> OCRResult: + return OCRResult(text=_MOCK_TEXT, backend_used="mock") + + +@pytest.fixture(scope="module") +def svc() -> MockOCRService: + return MockOCRService() + + +def _convert(filename: str, ocr_service: MockOCRService) -> str: + path = TEST_DATA_DIR / filename + if not path.exists(): + pytest.skip(f"Test file not found: {path}") + converter = PptxConverterWithOCR() + with open(path, "rb") as f: + return converter.convert( + f, StreamInfo(extension=".pptx"), ocr_service=ocr_service + ).text_content + + +# --------------------------------------------------------------------------- +# pptx_image_start.pptx +# --------------------------------------------------------------------------- + + +def test_pptx_image_start(svc: MockOCRService) -> None: + # Slide 1: title "Welcome" followed by an image + expected = ( + "\\n\\n\\n# Welcome\\n\\n" + "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + ) + assert _convert("pptx_image_start.pptx", svc) == expected + + +# --------------------------------------------------------------------------- +# pptx_image_middle.pptx +# --------------------------------------------------------------------------- + + +def test_pptx_image_middle(svc: MockOCRService) -> None: + # Slide 1: Introduction | Slide 2: Architecture + image | Slide 3: Conclusion # noqa: E501 + expected = ( + "\\n\\n\\n# Introduction" + "\\n\\n\\n\\n\\n# 
Architecture\\n\\n" + "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + "\\n\\n\\n# Conclusion\\n\\n" + ) + assert _convert("pptx_image_middle.pptx", svc) == expected + + +# --------------------------------------------------------------------------- +# pptx_image_end.pptx +# --------------------------------------------------------------------------- + + +def test_pptx_image_end(svc: MockOCRService) -> None: + # Slide 1: Presentation | Slide 2: Thank You + image + expected = ( + "\\n\\n\\n# Presentation" + "\\n\\n\\n\\n\\n# Thank You\\n\\n" + "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + ) + assert _convert("pptx_image_end.pptx", svc) == expected + + +# --------------------------------------------------------------------------- +# pptx_multiple_images.pptx +# --------------------------------------------------------------------------- + + +def test_pptx_multiple_images(svc: MockOCRService) -> None: + # Slide 1: two images, no title text + expected = ( + "\\n\\n\\n# \\n" + "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + "\n\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + ) + assert _convert("pptx_multiple_images.pptx", svc) == expected + + +# --------------------------------------------------------------------------- +# pptx_complex_layout.pptx +# --------------------------------------------------------------------------- + + +def test_pptx_complex_layout(svc: MockOCRService) -> None: + expected = ( + "\\n\\n\\n# Product Comparison" + "\\n\\nOur products lead the market\\n" + "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + ) + assert _convert("pptx_complex_layout.pptx", svc) == expected + + +# --------------------------------------------------------------------------- +# No OCR service — no OCR tags emitted +# --------------------------------------------------------------------------- + + +def test_pptx_no_ocr_service_no_tags() -> None: + path = TEST_DATA_DIR / "pptx_image_middle.pptx" + if not path.exists(): + pytest.skip(f"Test file not found: 
{path}") + converter = PptxConverterWithOCR() + with open(path, "rb") as f: + md = converter.convert(f, StreamInfo(extension=".pptx")).text_content + assert "*[Image OCR]" not in md + assert "[End OCR]*" not in md diff --git a/packages/markitdown-ocr/tests/test_xlsx_converter.py b/packages/markitdown-ocr/tests/test_xlsx_converter.py new file mode 100644 index 000000000..4ab30c600 --- /dev/null +++ b/packages/markitdown-ocr/tests/test_xlsx_converter.py @@ -0,0 +1,249 @@ +""" +Unit tests for XlsxConverterWithOCR. + +For each XLSX test file: convert with a mock OCR service then compare the +full output string against the expected snapshot. + +OCR block format used by the converter: + *[Image OCR] + MOCK_OCR_TEXT_12345 + [End OCR]* + +Images are grouped at the end of each sheet under: + ### Images in this sheet: +""" + +import sys +from pathlib import Path +from typing import Any + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) + +from markitdown_ocr._ocr_service import OCRResult # noqa: E402 +from markitdown_ocr._xlsx_converter_with_ocr import ( # noqa: E402 + XlsxConverterWithOCR, +) +from markitdown import StreamInfo # noqa: E402 + +TEST_DATA_DIR = Path(__file__).parent / "ocr_test_data" + +_MOCK_TEXT = "MOCK_OCR_TEXT_12345" +_OCR_BLOCK = f"*[Image OCR]\n{_MOCK_TEXT}\n[End OCR]*" +_IMG_SECTION = "### Images in this sheet:" + + +class MockOCRService: + def extract_text( + self, # noqa: ANN101 + image_stream: Any, + **kwargs: Any, + ) -> OCRResult: + return OCRResult(text=_MOCK_TEXT, backend_used="mock") + + +@pytest.fixture(scope="module") +def svc() -> MockOCRService: + return MockOCRService() + + +def _convert(filename: str, ocr_service: MockOCRService) -> str: + path = TEST_DATA_DIR / filename + if not path.exists(): + pytest.skip(f"Test file not found: {path}") + converter = XlsxConverterWithOCR() + with open(path, "rb") as f: + return converter.convert( + f, StreamInfo(extension=".xlsx"), ocr_service=ocr_service + ).text_content 
+ + +# --------------------------------------------------------------------------- +# xlsx_image_start.xlsx +# --------------------------------------------------------------------------- + + +def test_xlsx_image_start(svc: MockOCRService) -> None: + expected = ( + "## Sales Q1\n\n" + "| Product | Sales |\n" + "| --- | --- |\n" + "| Widget A | 100 |\n" + "| Widget B | 150 |\n\n" + "### Images in this sheet:\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n" + "## Forecast Q2\n\n" + "| Projected Sales | Unnamed: 1 |\n" + "| --- | --- |\n" + "| Widget A | 120 |\n" + "| Widget B | 180 |\n\n" + "### Images in this sheet:\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + ) + assert _convert("xlsx_image_start.xlsx", svc) == expected + + +# --------------------------------------------------------------------------- +# xlsx_image_middle.xlsx +# --------------------------------------------------------------------------- + + +def test_xlsx_image_middle(svc: MockOCRService) -> None: + expected = ( + "## Revenue\n\n" + "| Q1 Report | Unnamed: 1 |\n" + "| --- | --- |\n" + "| NaN | NaN |\n" + "| Revenue | $50,000 |\n" + "| NaN | NaN |\n" + "| NaN | NaN |\n" + "| NaN | NaN |\n" + "| NaN | NaN |\n" + "| Profit Margin | 40% |\n\n" + "### Images in this sheet:\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n" + "## Expenses\n\n" + "| Expense Breakdown | Unnamed: 1 |\n" + "| --- | --- |\n" + "| NaN | NaN |\n" + "| Expenses | $30,000 |\n" + "| NaN | NaN |\n" + "| NaN | NaN |\n" + "| NaN | NaN |\n" + "| NaN | NaN |\n" + "| Savings | $5,000 |\n\n" + "### Images in this sheet:\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + ) + assert _convert("xlsx_image_middle.xlsx", svc) == expected + + +# --------------------------------------------------------------------------- +# xlsx_image_end.xlsx +# --------------------------------------------------------------------------- + + +def test_xlsx_image_end(svc: MockOCRService) -> None: + expected = ( + "## Sheet\n\n" 
+ "| Financial Summary | Unnamed: 1 |\n" + "| --- | --- |\n" + "| Total Revenue | $500,000 |\n" + "| Total Expenses | $300,000 |\n" + "| Net Profit | $200,000 |\n" + "| NaN | NaN |\n" + "| NaN | NaN |\n" + "| NaN | NaN |\n" + "| NaN | NaN |\n" + "| NaN | NaN |\n" + "| Signature: | NaN |\n\n" + "### Images in this sheet:\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n" + "## Budget\n\n" + "| Budget Allocation | Unnamed: 1 |\n" + "| --- | --- |\n" + "| Marketing | $100,000 |\n" + "| R&D | $150,000 |\n" + "| Operations | $50,000 |\n" + "| NaN | NaN |\n" + "| NaN | NaN |\n" + "| NaN | NaN |\n" + "| NaN | NaN |\n" + "| NaN | NaN |\n" + "| Approved: | NaN |\n\n" + "### Images in this sheet:\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + ) + assert _convert("xlsx_image_end.xlsx", svc) == expected + + +# --------------------------------------------------------------------------- +# xlsx_multiple_images.xlsx +# --------------------------------------------------------------------------- + + +def test_xlsx_multiple_images(svc: MockOCRService) -> None: + expected = ( + "## Overview\n\n" + "| Dashboard |\n" + "| --- |\n" + "| Status: Active |\n" + "| NaN |\n" + "| NaN |\n" + "| NaN |\n" + "| NaN |\n" + "| Performance Summary |\n\n" + "### Images in this sheet:\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n" + "## Details\n\n" + "| Detailed Metrics |\n" + "| --- |\n" + "| System Health |\n\n" + "### Images in this sheet:\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n" + "## Summary\n\n" + "| Quarter Summary |\n" + "| --- |\n" + "| Overall Performance |\n\n" + "### Images in this sheet:\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + ) + assert _convert("xlsx_multiple_images.xlsx", svc) == expected + + +# --------------------------------------------------------------------------- +# xlsx_complex_layout.xlsx +# 
--------------------------------------------------------------------------- + + +def test_xlsx_complex_layout(svc: MockOCRService) -> None: + expected = ( + "## Complex Report\n\n" + "| Annual Report 2024 | Unnamed: 1 |\n" + "| --- | --- |\n" + "| NaN | NaN |\n" + "| Month | Sales |\n" + "| Jan | 1000 |\n" + "| Feb | 1200 |\n" + "| NaN | NaN |\n" + "| Total | 2200 |\n\n" + "### Images in this sheet:\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n" + "## Customers\n\n" + "| Customer Metrics | Unnamed: 1 |\n" + "| --- | --- |\n" + "| NaN | NaN |\n" + "| New Customers | 250 |\n" + "| Retention Rate | 92% |\n\n" + "### Images in this sheet:\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*\n\n" + "## Regions\n\n" + "| Regional Breakdown | Unnamed: 1 |\n" + "| --- | --- |\n" + "| NaN | NaN |\n" + "| Region | Revenue |\n" + "| North | $800K |\n" + "| South | $600K |\n\n" + "### Images in this sheet:\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + ) + assert _convert("xlsx_complex_layout.xlsx", svc) == expected + + +# --------------------------------------------------------------------------- +# No OCR service — no OCR tags emitted +# --------------------------------------------------------------------------- + + +def test_xlsx_no_ocr_service_no_tags() -> None: + path = TEST_DATA_DIR / "xlsx_image_middle.xlsx" + if not path.exists(): + pytest.skip(f"Test file not found: {path}") + converter = XlsxConverterWithOCR() + with open(path, "rb") as f: + md = converter.convert(f, StreamInfo(extension=".xlsx")).text_content + assert "*[Image OCR]" not in md + assert "[End OCR]*" not in md diff --git a/packages/markitdown/src/markitdown/__about__.py b/packages/markitdown/src/markitdown/__about__.py index 3de6ec29f..ff0280657 100644 --- a/packages/markitdown/src/markitdown/__about__.py +++ b/packages/markitdown/src/markitdown/__about__.py @@ -1,4 +1,4 @@ # SPDX-FileCopyrightText: 2024-present Adam Fourney # 
# SPDX-License-Identifier: MIT -__version__ = "0.1.5" +__version__ = "0.1.6b1"