From bfd8f7de6f9af80112de920fae4efb10c012a44c Mon Sep 17 00:00:00 2001 From: venti <1308199824@qq.com> Date: Sat, 30 May 2026 23:31:23 +0800 Subject: [PATCH] fix: preserve xlsx currency formatting --- .../markitdown/converters/_xlsx_converter.py | 45 ++++++++++++++++++- packages/markitdown/tests/test_module_misc.py | 18 ++++++++ 2 files changed, 62 insertions(+), 1 deletion(-) diff --git a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py index 4186ec773..823db7768 100644 --- a/packages/markitdown/src/markitdown/converters/_xlsx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_xlsx_converter.py @@ -1,4 +1,6 @@ import sys +import re +import numbers from typing import BinaryIO, Any from ._html_converter import HtmlConverter from .._base_converter import DocumentConverter, DocumentConverterResult @@ -10,7 +12,7 @@ _xlsx_dependency_exc_info = None try: import pandas as pd - import openpyxl # noqa: F401 + import openpyxl except ImportError: _xlsx_dependency_exc_info = sys.exc_info() @@ -32,6 +34,30 @@ ] ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"] +_CURRENCY_SYMBOL_RE = re.compile( + r"(? str | None: + match = _CURRENCY_SYMBOL_RE.search(number_format) + if ( + match is None + or isinstance(value, bool) + or not isinstance(value, numbers.Real) + or pd.isna(value) + ): + return None + + symbol = match.group(1) or match.group(2) + positive_format = number_format.split(";", 1)[0] + decimal_part = positive_format.split(".", 1)[1] if "." in positive_format else "" + precision = sum(1 for char in decimal_part if char in "0#") + use_grouping = "," in positive_format.split(".", 1)[0] + + grouping = "," if use_grouping else "" + return f"{symbol}{value:{grouping}.{precision}f}" + class XlsxConverter(DocumentConverter): """ @@ -81,8 +107,25 @@ def convert( ) sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl") + file_stream.seek(0) + workbook = openpyxl.load_workbook(file_stream, read_only=True, data_only=True) + md_content = "" for s in sheets: + worksheet = workbook[s] + for row_idx in range(len(sheets[s].index)): + for col_idx in range(len(sheets[s].columns)): + cell = worksheet.cell(row=row_idx + 2, column=col_idx + 1) + currency_value = _format_currency_value( + sheets[s].iat[row_idx, col_idx], + cell.number_format, + ) + if currency_value is not None: + sheets[s][sheets[s].columns[col_idx]] = sheets[s][ + sheets[s].columns[col_idx] + ].astype(object) + sheets[s].iat[row_idx, col_idx] = currency_value + md_content += f"## {s}\n" html_content = sheets[s].to_html(index=False) md_content += ( diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 4d62e4919..fa19f5fda 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -288,6 +288,24 @@ def test_input_as_strings() -> None: assert "# Test" in result.text_content +def test_xlsx_currency_format_is_preserved(tmp_path) -> None: + from openpyxl import Workbook + + workbook = Workbook() + worksheet = workbook.active + worksheet.title = "Invoice" + worksheet.append(["Item", "Count", "Cost"]) + worksheet.append(["Breakfast", 20, 5]) + worksheet["C2"].number_format = "$#,##0.00" + + xlsx_path = tmp_path / "currency.xlsx" + workbook.save(xlsx_path) + + result = MarkItDown().convert(str(xlsx_path)) + + assert "| Breakfast | 20 | $5.00 |" in result.markdown + + def test_deeply_nested_html_fallback() -> None: """Large, deeply nested HTML should fall back to plain-text extraction instead of silently returning unconverted HTML (issue #1636).