Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 44 additions & 1 deletion packages/markitdown/src/markitdown/converters/_xlsx_converter.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import sys
import re
import numbers
from typing import BinaryIO, Any
from ._html_converter import HtmlConverter
from .._base_converter import DocumentConverter, DocumentConverterResult
Expand All @@ -10,7 +12,7 @@
_xlsx_dependency_exc_info = None
try:
import pandas as pd
import openpyxl # noqa: F401
import openpyxl
except ImportError:
_xlsx_dependency_exc_info = sys.exc_info()

Expand All @@ -32,6 +34,30 @@
]
ACCEPTED_XLS_FILE_EXTENSIONS = [".xls"]

_CURRENCY_SYMBOL_RE = re.compile(
r"(?<!\\)([$\u20ac\u00a3\u00a5])|\[\$([^-\]]+)(?:-[^\]]+)?\]"
)


def _format_currency_value(value: Any, number_format: str) -> str | None:
match = _CURRENCY_SYMBOL_RE.search(number_format)
if (
match is None
or isinstance(value, bool)
or not isinstance(value, numbers.Real)
or pd.isna(value)
):
return None

symbol = match.group(1) or match.group(2)
positive_format = number_format.split(";", 1)[0]
decimal_part = positive_format.split(".", 1)[1] if "." in positive_format else ""
precision = sum(1 for char in decimal_part if char in "0#")
use_grouping = "," in positive_format.split(".", 1)[0]

grouping = "," if use_grouping else ""
return f"{symbol}{value:{grouping}.{precision}f}"


class XlsxConverter(DocumentConverter):
"""
Expand Down Expand Up @@ -81,8 +107,25 @@ def convert(
)

sheets = pd.read_excel(file_stream, sheet_name=None, engine="openpyxl")
file_stream.seek(0)
workbook = openpyxl.load_workbook(file_stream, read_only=True, data_only=True)

md_content = ""
for s in sheets:
worksheet = workbook[s]
for row_idx in range(len(sheets[s].index)):
for col_idx in range(len(sheets[s].columns)):
cell = worksheet.cell(row=row_idx + 2, column=col_idx + 1)
currency_value = _format_currency_value(
sheets[s].iat[row_idx, col_idx],
cell.number_format,
)
if currency_value is not None:
sheets[s][sheets[s].columns[col_idx]] = sheets[s][
sheets[s].columns[col_idx]
].astype(object)
sheets[s].iat[row_idx, col_idx] = currency_value

md_content += f"## {s}\n"
html_content = sheets[s].to_html(index=False)
md_content += (
Expand Down
18 changes: 18 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,24 @@ def test_input_as_strings() -> None:
assert "# Test" in result.text_content


def test_xlsx_currency_format_is_preserved(tmp_path) -> None:
    """A currency-formatted XLSX cell keeps its symbol and decimals in Markdown."""
    from openpyxl import Workbook

    # Build a one-sheet workbook: a header row plus a single data row.
    book = Workbook()
    sheet = book.active
    sheet.title = "Invoice"
    for row in (["Item", "Count", "Cost"], ["Breakfast", 20, 5]):
        sheet.append(row)

    # Only the "Cost" cell of the data row carries a currency number format.
    sheet["C2"].number_format = "$#,##0.00"

    target = tmp_path / "currency.xlsx"
    book.save(target)

    converted = MarkItDown().convert(str(target))

    expected_row = "| Breakfast | 20 | $5.00 |"
    assert expected_row in converted.markdown


def test_deeply_nested_html_fallback() -> None:
"""Large, deeply nested HTML should fall back to plain-text extraction
instead of silently returning unconverted HTML (issue #1636).
Expand Down