diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py
index 3975107b1..e59f88019 100644
--- a/packages/markitdown/src/markitdown/converters/_docx_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py
@@ -27,6 +27,8 @@
ACCEPTED_FILE_EXTENSIONS = [".docx"]
+# Mammoth style-map rule for underlined runs ("u => u": emit a <u> element).
+# DocxConverter.convert always includes this rule in the style map it hands to
+# mammoth.convert_to_html; caller-supplied rules, if any, are placed before it.
+_UNDERLINE_STYLE_MAP = "u => u"
+
class DocxConverter(HtmlConverter):
"""
@@ -76,6 +78,10 @@ def convert(
)
style_map = kwargs.get("style_map", None)
+ if style_map:
+ style_map = f"{style_map}\n{_UNDERLINE_STYLE_MAP}"
+ else:
+ style_map = _UNDERLINE_STYLE_MAP
pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string(
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py
index 19e8a2984..1a0b831be 100644
--- a/packages/markitdown/src/markitdown/converters/_markdownify.py
+++ b/packages/markitdown/src/markitdown/converters/_markdownify.py
@@ -122,5 +122,17 @@ def convert_input(
return "[x] " if el.has_attr("checked") else "[ ] "
return ""
+    def convert_u(
+        self,
+        el: Any,
+        text: str,
+        convert_as_inline: Optional[bool] = False,
+        **kwargs,
+    ) -> str:
+        """Render an underlined (``<u>``) element as its bare text.
+
+        No underline markup is written to the output: the already-converted
+        inner ``text`` is passed through ``markdownify.chomp`` and re-joined
+        from the three parts it returns. When chomp leaves no inner text, an
+        empty string is returned.
+
+        ``el``, ``convert_as_inline`` and ``kwargs`` are accepted but not used
+        by this method.
+        """
+        # chomp() hands back (prefix, suffix, core text) for the inline content.
+        leading, trailing, content = markdownify.chomp(text)  # type: ignore
+        return f"{leading}{content}{trailing}" if content else ""
+
def convert_soup(self, soup: Any) -> str:
return super().convert_soup(soup) # type: ignore
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 4d62e4919..d90e0f759 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -3,6 +3,7 @@
import os
import re
import shutil
+import zipfile
import pytest
from unittest.mock import MagicMock
@@ -261,6 +262,42 @@ def test_docx_comments() -> None:
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)
+def test_docx_underlined_text_is_preserved(tmp_path) -> None:
+    """Text inside an underlined DOCX run must survive conversion.
+
+    A minimal DOCX package (content types, package relationships and the main
+    document part) is written to ``tmp_path``. Its single paragraph holds a
+    plain run followed by an underlined run, and the converted text must
+    contain both runs joined as "plain underlined".
+    """
+    docx_file = tmp_path / "underlined.docx"
+    # xml:space="preserve" keeps the trailing space that separates the two runs.
+    document_xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+  <w:body>
+    <w:p>
+      <w:r><w:t xml:space="preserve">plain </w:t></w:r>
+      <w:r><w:rPr><w:u w:val="single"/></w:rPr><w:t>underlined</w:t></w:r>
+    </w:p>
+  </w:body>
+</w:document>"""
+
+    with zipfile.ZipFile(docx_file, "w") as archive:
+        # Declares the MIME type of every part stored in the package.
+        archive.writestr(
+            "[Content_Types].xml",
+            """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
+  <Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
+  <Default Extension="xml" ContentType="application/xml"/>
+  <Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
+</Types>""",
+        )
+        # Points readers at word/document.xml as the main document part.
+        archive.writestr(
+            "_rels/.rels",
+            """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
+<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+  <Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>
+</Relationships>""",
+        )
+        archive.writestr("word/document.xml", document_xml)
+
+    result = MarkItDown().convert(str(docx_file))
+
+    assert "plain underlined" in result.text_content
+
+
def test_docx_equations() -> None:
markitdown = MarkItDown()
docx_file = os.path.join(TEST_FILES_DIR, "equations.docx")