Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@

ACCEPTED_FILE_EXTENSIONS = [".docx"]

# Style-map rule that is always handed to mammoth.convert_to_html: it is used
# on its own when the caller passes no ``style_map`` and is appended on a new
# line after the caller's rules otherwise.
# NOTE(review): presumably this tells Mammoth to emit <u> elements for
# underlined runs instead of dropping the formatting -- confirm against the
# Mammoth style-map documentation.
_UNDERLINE_STYLE_MAP = "u => u"


class DocxConverter(HtmlConverter):
"""
Expand Down Expand Up @@ -76,6 +78,10 @@ def convert(
)

style_map = kwargs.get("style_map", None)
if style_map:
style_map = f"{style_map}\n{_UNDERLINE_STYLE_MAP}"
else:
style_map = _UNDERLINE_STYLE_MAP
pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string(
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
Expand Down
12 changes: 12 additions & 0 deletions packages/markitdown/src/markitdown/converters/_markdownify.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,5 +122,17 @@ def convert_input(
return "[x] " if el.has_attr("checked") else "[ ] "
return ""

def convert_u(
    self,
    el: Any,
    text: str,
    convert_as_inline: Optional[bool] = False,
    **kwargs,
) -> str:
    """Render the content of a ``<u>`` element as inline ``<u>...</u>`` HTML.

    ``text`` is split with ``markdownify.chomp`` into a prefix, a suffix and
    the remaining body. The prefix and suffix are kept outside the emitted
    tags. An empty string is returned when no body remains, so no empty
    ``<u></u>`` pair is ever produced.

    ``el``, ``convert_as_inline`` and ``**kwargs`` are accepted but not used
    by this method.
    """
    # NOTE(review): chomp presumably peels off leading/trailing whitespace
    # so it does not end up inside the tags -- confirm against markdownify.
    leading, trailing, body = markdownify.chomp(text)  # type: ignore
    if body:
        return f"{leading}<u>{body}</u>{trailing}"
    return ""

def convert_soup(self, soup: Any) -> str:
    """Convert ``soup`` by delegating unchanged to the parent implementation."""
    rendered = super().convert_soup(soup)  # type: ignore
    return rendered
37 changes: 37 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import re
import shutil
import zipfile
import pytest
from unittest.mock import MagicMock

Expand Down Expand Up @@ -261,6 +262,42 @@ def test_docx_comments() -> None:
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)


def test_docx_underlined_text_is_preserved(tmp_path) -> None:
    """Check that an underlined DOCX run is emitted wrapped in ``<u>`` tags.

    A three-part .docx archive (content types, package relationships and the
    main document) is written under ``tmp_path`` and converted with
    ``MarkItDown``. The document holds one paragraph: a plain run followed
    by a run carrying ``<w:u w:val="single"/>``.
    """
    # Archive members keyed by their path inside the zip; dict insertion
    # order is the order in which they are written.
    package_parts = {
        "[Content_Types].xml": """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>""",
        "_rels/.rels": """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
</Relationships>""",
        "word/document.xml": """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<w:r><w:t>plain </w:t></w:r>
<w:r><w:rPr><w:u w:val="single"/></w:rPr><w:t>underlined</w:t></w:r>
</w:p>
</w:body>
</w:document>""",
    }

    target = tmp_path / "underlined.docx"
    with zipfile.ZipFile(target, "w") as docx:
        for member, payload in package_parts.items():
            docx.writestr(member, payload)

    converted = MarkItDown().convert(str(target))

    assert "plain <u>underlined</u>" in converted.text_content


def test_docx_equations() -> None:
markitdown = MarkItDown()
docx_file = os.path.join(TEST_FILES_DIR, "equations.docx")
Expand Down