From a73ec36729163276ca73005887c58fd551478b96 Mon Sep 17 00:00:00 2001 From: Yufeng He <40085740+he-yufeng@users.noreply.github.com> Date: Sun, 31 May 2026 10:44:30 +0800 Subject: [PATCH] fix(docx): preserve underlined text --- .../markitdown/converters/_docx_converter.py | 6 +++ .../src/markitdown/converters/_markdownify.py | 12 ++++++ packages/markitdown/tests/test_module_misc.py | 37 +++++++++++++++++++ 3 files changed, 55 insertions(+) diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 3975107b1..e59f88019 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -27,6 +27,8 @@ ACCEPTED_FILE_EXTENSIONS = [".docx"] +_UNDERLINE_STYLE_MAP = "u => u" + class DocxConverter(HtmlConverter): """ @@ -76,6 +78,10 @@ def convert( ) style_map = kwargs.get("style_map", None) + if style_map: + style_map = f"{style_map}\n{_UNDERLINE_STYLE_MAP}" + else: + style_map = _UNDERLINE_STYLE_MAP pre_process_stream = pre_process_docx(file_stream) return self._html_converter.convert_string( mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, diff --git a/packages/markitdown/src/markitdown/converters/_markdownify.py b/packages/markitdown/src/markitdown/converters/_markdownify.py index 19e8a2984..1a0b831be 100644 --- a/packages/markitdown/src/markitdown/converters/_markdownify.py +++ b/packages/markitdown/src/markitdown/converters/_markdownify.py @@ -122,5 +122,17 @@ def convert_input( return "[x] " if el.has_attr("checked") else "[ ] " return "" + def convert_u( + self, + el: Any, + text: str, + convert_as_inline: Optional[bool] = False, + **kwargs, + ) -> str: + prefix, suffix, text = markdownify.chomp(text) # type: ignore + if not text: + return "" + return f"{prefix}{text}{suffix}" + def convert_soup(self, soup: Any) -> str: return super().convert_soup(soup) # type: ignore diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 4d62e4919..d90e0f759 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -3,6 +3,7 @@ import os import re import shutil +import zipfile import pytest from unittest.mock import MagicMock @@ -261,6 +262,42 @@ def test_docx_comments() -> None: validate_strings(result, DOCX_COMMENT_TEST_STRINGS) +def test_docx_underlined_text_is_preserved(tmp_path) -> None: + docx_file = tmp_path / "underlined.docx" + document_xml = """ + + + + plain + underlined + + +""" + + with zipfile.ZipFile(docx_file, "w") as archive: + archive.writestr( + "[Content_Types].xml", + """ + + + + +""", + ) + archive.writestr( + "_rels/.rels", + """ + + +""", + ) + archive.writestr("word/document.xml", document_xml) + + result = MarkItDown().convert(str(docx_file)) + + assert "plain underlined" in result.text_content + + def test_docx_equations() -> None: markitdown = MarkItDown() docx_file = os.path.join(TEST_FILES_DIR, "equations.docx")