Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def convert(

for slide in presentation.slides:
slide_num += 1
md_content += f"\\n\\n<!-- Slide number: {slide_num} -->\\n"
md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"

title = slide.shapes.title

Expand All @@ -96,7 +96,7 @@ def get_shape_content(shape, **kwargs):
llm_description = ""
if llm_client and kwargs.get("llm_model"):
try:
from ._llm_caption import llm_caption
from markitdown.converters._llm_caption import llm_caption

image_filename = shape.image.filename
image_extension = None
Expand Down Expand Up @@ -148,9 +148,9 @@ def get_shape_content(shape, **kwargs):
# Text areas
elif shape.has_text_frame:
if shape == title:
md_content += "# " + shape.text.lstrip() + "\\n"
md_content += "# " + shape.text.lstrip() + "\n"
else:
md_content += shape.text + "\\n"
md_content += shape.text + "\n"

# Group Shapes
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.GROUP:
Expand All @@ -177,7 +177,7 @@ def get_shape_content(shape, **kwargs):
md_content = md_content.strip()

if slide.has_notes_slide:
md_content += "\\n\\n### Notes:\\n"
md_content += "\n\n### Notes:\n"
notes_frame = slide.notes_slide.notes_text_frame
if notes_frame is not None:
md_content += notes_frame.text
Expand Down Expand Up @@ -216,15 +216,15 @@ def _convert_table_to_markdown(self, table, **kwargs):

return (
self._html_converter.convert_string(html_table, **kwargs).markdown.strip()
+ "\\n"
+ "\n"
)

def _convert_chart_to_markdown(self, chart):
try:
md = "\\n\\n### Chart"
md = "\n\n### Chart"
if chart.has_title:
md += f": {chart.chart_title.text_frame.text}"
md += "\\n\\n"
md += "\n\n"
data = []
category_names = [c.label for c in chart.plots[0].categories]
series_names = [s.name for s in chart.series]
Expand All @@ -241,9 +241,9 @@ def _convert_chart_to_markdown(self, chart):
markdown_table.append("| " + " | ".join(map(str, row)) + " |")
header = markdown_table[0]
separator = "|" + "|".join(["---"] * len(data[0])) + "|"
return md + "\\n".join([header, separator] + markdown_table[1:])
return md + "\n".join([header, separator] + markdown_table[1:])
except ValueError as e:
if "unsupported plot type" in str(e):
return "\\n\\n[unsupported chart]\\n\\n"
return "\n\n[unsupported chart]\n\n"
except Exception:
return "\\n\\n[unsupported chart]\\n\\n"
return "\n\n[unsupported chart]\n\n"
57 changes: 41 additions & 16 deletions packages/markitdown-ocr/tests/test_pptx_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,12 @@
MOCK_OCR_TEXT_12345
[End OCR]*

Note: PPTX slide text uses literal backslash-n (\\n) sequences from the
underlying PPTX converter template; OCR blocks use real newlines.
"""

import sys
from pathlib import Path
from typing import Any
from unittest.mock import patch

import pytest

Expand Down Expand Up @@ -66,8 +65,8 @@ def _convert(filename: str, ocr_service: MockOCRService) -> str:
def test_pptx_image_start(svc: MockOCRService) -> None:
# Slide 1: title "Welcome" followed by an image
expected = (
"\\n\\n<!-- Slide number: 1 -->\\n# Welcome\\n\\n"
"\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
"<!-- Slide number: 1 -->\n# Welcome\n\n\n"
"*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
)
assert _convert("pptx_image_start.pptx", svc) == expected

Expand All @@ -80,10 +79,10 @@ def test_pptx_image_start(svc: MockOCRService) -> None:
def test_pptx_image_middle(svc: MockOCRService) -> None:
# Slide 1: Introduction | Slide 2: Architecture + image | Slide 3: Conclusion # noqa: E501
expected = (
"\\n\\n<!-- Slide number: 1 -->\\n# Introduction"
"\\n\\n\\n\\n<!-- Slide number: 2 -->\\n# Architecture\\n\\n"
"\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
"\\n\\n<!-- Slide number: 3 -->\\n# Conclusion\\n\\n"
"<!-- Slide number: 1 -->\n# Introduction"
"\n\n<!-- Slide number: 2 -->\n# Architecture\n\n\n"
"*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
"\n\n<!-- Slide number: 3 -->\n# Conclusion"
)
assert _convert("pptx_image_middle.pptx", svc) == expected

Expand All @@ -96,9 +95,9 @@ def test_pptx_image_middle(svc: MockOCRService) -> None:
def test_pptx_image_end(svc: MockOCRService) -> None:
# Slide 1: Presentation | Slide 2: Thank You + image
expected = (
"\\n\\n<!-- Slide number: 1 -->\\n# Presentation"
"\\n\\n\\n\\n<!-- Slide number: 2 -->\\n# Thank You\\n\\n"
"\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
"<!-- Slide number: 1 -->\n# Presentation"
"\n\n<!-- Slide number: 2 -->\n# Thank You\n\n\n"
"*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
)
assert _convert("pptx_image_end.pptx", svc) == expected

Expand All @@ -111,8 +110,8 @@ def test_pptx_image_end(svc: MockOCRService) -> None:
def test_pptx_multiple_images(svc: MockOCRService) -> None:
# Slide 1: two images, no title text
expected = (
"\\n\\n<!-- Slide number: 1 -->\\n# \\n"
"\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
"<!-- Slide number: 1 -->\n# \n\n"
"*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
"\n\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
)
assert _convert("pptx_multiple_images.pptx", svc) == expected
Expand All @@ -125,9 +124,9 @@ def test_pptx_multiple_images(svc: MockOCRService) -> None:

def test_pptx_complex_layout(svc: MockOCRService) -> None:
expected = (
"\\n\\n<!-- Slide number: 1 -->\\n# Product Comparison"
"\\n\\nOur products lead the market\\n"
"\n*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
"<!-- Slide number: 1 -->\n# Product Comparison"
"\n\nOur products lead the market\n\n"
"*[Image OCR]\nMOCK_OCR_TEXT_12345\n[End OCR]*"
)
assert _convert("pptx_complex_layout.pptx", svc) == expected

Expand All @@ -146,3 +145,29 @@ def test_pptx_no_ocr_service_no_tags() -> None:
md = converter.convert(f, StreamInfo(extension=".pptx")).text_content
assert "*[Image OCR]" not in md
assert "[End OCR]*" not in md


def test_pptx_output_does_not_contain_literal_newlines(svc: MockOCRService) -> None:
    """Converted PPTX markdown must not contain a backslash followed by ``n``."""
    # "\\n" is the two-character sequence backslash + n, not a line break.
    escaped_newline = "\\n"
    converted = _convert("pptx_image_start.pptx", svc)
    assert escaped_newline not in converted


def test_pptx_uses_core_llm_caption() -> None:
    """Check that conversion with an LLM client calls the patched core captioner.

    ``markitdown.converters._llm_caption.llm_caption`` is replaced with a mock
    returning ``"MOCK_CAPTION"``; the converted text must contain that caption
    and the mock must have been called exactly once. The test is skipped when
    the ``pptx_image_start.pptx`` fixture is not present.
    """
    fixture = TEST_DATA_DIR / "pptx_image_start.pptx"
    if not fixture.exists():
        pytest.skip(f"Test file not found: {fixture}")

    ocr_converter = PptxConverterWithOCR()
    caption_patch = patch(
        "markitdown.converters._llm_caption.llm_caption", return_value="MOCK_CAPTION"
    )
    # Enter the patch first, then open the fixture, mirroring the nesting order.
    with caption_patch as mock_caption, open(fixture, "rb") as stream:
        result = ocr_converter.convert(
            stream,
            StreamInfo(extension=".pptx"),
            llm_client=object(),
            llm_model="test-model",
        )
        markdown = result.text_content

    assert "MOCK_CAPTION" in markdown
    mock_caption.assert_called_once()