diff --git a/packages/markitdown-ocr/src/markitdown_ocr/_pptx_converter_with_ocr.py b/packages/markitdown-ocr/src/markitdown_ocr/_pptx_converter_with_ocr.py index 7e91ed6b4..285ff397b 100644 --- a/packages/markitdown-ocr/src/markitdown_ocr/_pptx_converter_with_ocr.py +++ b/packages/markitdown-ocr/src/markitdown_ocr/_pptx_converter_with_ocr.py @@ -80,7 +80,7 @@ def convert( for slide in presentation.slides: slide_num += 1 - md_content += f"\\n\\n\\n" + md_content += f"\n\n\n" title = slide.shapes.title @@ -96,7 +96,7 @@ def get_shape_content(shape, **kwargs): llm_description = "" if llm_client and kwargs.get("llm_model"): try: - from ._llm_caption import llm_caption + from markitdown.converters._llm_caption import llm_caption image_filename = shape.image.filename image_extension = None @@ -148,9 +148,9 @@ def get_shape_content(shape, **kwargs): # Text areas elif shape.has_text_frame: if shape == title: - md_content += "# " + shape.text.lstrip() + "\\n" + md_content += "# " + shape.text.lstrip() + "\n" else: - md_content += shape.text + "\\n" + md_content += shape.text + "\n" # Group Shapes if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP: @@ -177,7 +177,7 @@ def get_shape_content(shape, **kwargs): md_content = md_content.strip() if slide.has_notes_slide: - md_content += "\\n\\n### Notes:\\n" + md_content += "\n\n### Notes:\n" notes_frame = slide.notes_slide.notes_text_frame if notes_frame is not None: md_content += notes_frame.text @@ -216,15 +216,15 @@ def _convert_table_to_markdown(self, table, **kwargs): return ( self._html_converter.convert_string(html_table, **kwargs).markdown.strip() - + "\\n" + + "\n" ) def _convert_chart_to_markdown(self, chart): try: - md = "\\n\\n### Chart" + md = "\n\n### Chart" if chart.has_title: md += f": {chart.chart_title.text_frame.text}" - md += "\\n\\n" + md += "\n\n" data = [] category_names = [c.label for c in chart.plots[0].categories] series_names = [s.name for s in chart.series] @@ -241,9 +241,9 @@ def _convert_chart_to_markdown(self, chart): markdown_table.append("| " + " | ".join(map(str, row)) + " |") header = markdown_table[0] separator = "|" + "|".join(["---"] * len(data[0])) + "|" - return md + "\\n".join([header, separator] + markdown_table[1:]) + return md + "\n".join([header, separator] + markdown_table[1:]) except ValueError as e: if "unsupported plot type" in str(e): - return "\\n\\n[unsupported chart]\\n\\n" + return "\n\n[unsupported chart]\n\n" except Exception: - return "\\n\\n[unsupported chart]\\n\\n" + return "\n\n[unsupported chart]\n\n" diff --git a/packages/markitdown-ocr/tests/test_pptx_converter.py b/packages/markitdown-ocr/tests/test_pptx_converter.py index 724f1039c..98cf9fe0f 100644 --- a/packages/markitdown-ocr/tests/test_pptx_converter.py +++ b/packages/markitdown-ocr/tests/test_pptx_converter.py @@ -9,13 +9,12 @@ MOCK_OCR_TEXT_12345 [End OCR]* -Note: PPTX slide text uses literal backslash-n (\\n) sequences from the -underlying PPTX converter template; OCR blocks use real newlines. """ import sys from pathlib import Path from typing import Any +from unittest.mock import patch import pytest @@ -66,8 +65,8 @@ def _convert(filename: str, ocr_service: MockOCRService) -> str: def test_pptx_image_start(svc: MockOCRService) -> None: # Slide 1: title "Welcome" followed by an image expected = ( - "\\n\\n\\n# Welcome\\n\\n" - "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + "\n# Welcome\n\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" ) assert _convert("pptx_image_start.pptx", svc) == expected @@ -80,10 +79,10 @@ def test_pptx_image_start(svc: MockOCRService) -> None: def test_pptx_image_middle(svc: MockOCRService) -> None: # Slide 1: Introduction | Slide 2: Architecture + image | Slide 3: Conclusion # noqa: E501 expected = ( - "\\n\\n\\n# Introduction" - "\\n\\n\\n\\n\\n# Architecture\\n\\n" - "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" - "\\n\\n\\n# Conclusion\\n\\n" + "\n# Introduction" + "\n\n\n# Architecture\n\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + "\n\n\n# Conclusion" ) assert _convert("pptx_image_middle.pptx", svc) == expected @@ -96,9 +95,9 @@ def test_pptx_image_middle(svc: MockOCRService) -> None: def test_pptx_image_end(svc: MockOCRService) -> None: # Slide 1: Presentation | Slide 2: Thank You + image expected = ( - "\\n\\n\\n# Presentation" - "\\n\\n\\n\\n\\n# Thank You\\n\\n" - "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + "\n# Presentation" + "\n\n\n# Thank You\n\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" ) assert _convert("pptx_image_end.pptx", svc) == expected @@ -111,8 +110,8 @@ def test_pptx_image_end(svc: MockOCRService) -> None: def test_pptx_multiple_images(svc: MockOCRService) -> None: # Slide 1: two images, no title text expected = ( - "\\n\\n\\n# \\n" - "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + "\n# \n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" "\n\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" ) assert _convert("pptx_multiple_images.pptx", svc) == expected @@ -125,9 +124,9 @@ def test_pptx_multiple_images(svc: MockOCRService) -> None: def test_pptx_complex_layout(svc: MockOCRService) -> None: expected = ( - "\\n\\n\\n# Product Comparison" - "\\n\\nOur products lead the market\\n" - "\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" + "\n# Product Comparison" + "\n\nOur products lead the market\n\n" + "*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*" ) assert _convert("pptx_complex_layout.pptx", svc) == expected @@ -146,3 +145,29 @@ def test_pptx_no_ocr_service_no_tags() -> None: md = converter.convert(f, StreamInfo(extension=".pptx")).text_content assert "*[Image OCR]" not in md assert "[End OCR]*" not in md + + +def test_pptx_output_does_not_contain_literal_newlines(svc: MockOCRService) -> None: + md = _convert("pptx_image_start.pptx", svc) + assert "\\n" not in md + + +def test_pptx_uses_core_llm_caption() -> None: + path = TEST_DATA_DIR / "pptx_image_start.pptx" + if not path.exists(): + pytest.skip(f"Test file not found: {path}") + + converter = PptxConverterWithOCR() + with patch( + "markitdown.converters._llm_caption.llm_caption", return_value="MOCK_CAPTION" + ) as mock_caption: + with open(path, "rb") as f: + md = converter.convert( + f, + StreamInfo(extension=".pptx"), + llm_client=object(), + llm_model="test-model", + ).text_content + + assert "MOCK_CAPTION" in md + mock_caption.assert_called_once()