From 9f1e255d9fbaddb6136de507fa64d9f4fb7c1335 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B5=85=E5=90=9F=E5=8D=8A=E5=A4=8F?= <836205695@qq.com> Date: Sun, 31 May 2026 09:29:41 +0800 Subject: [PATCH] fix: fall back when CSV charset hint fails --- .../src/markitdown/converters/_csv_converter.py | 8 ++++++-- packages/markitdown/tests/test_module_misc.py | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_csv_converter.py b/packages/markitdown/src/markitdown/converters/_csv_converter.py index 7e9631e1b..4567bff71 100644 --- a/packages/markitdown/src/markitdown/converters/_csv_converter.py +++ b/packages/markitdown/src/markitdown/converters/_csv_converter.py @@ -42,10 +42,14 @@ def convert( **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: # Read the file content + data = file_stream.read() if stream_info.charset: - content = file_stream.read().decode(stream_info.charset) + try: + content = data.decode(stream_info.charset) + except UnicodeDecodeError: + content = str(from_bytes(data).best()) else: - content = str(from_bytes(file_stream.read()).best()) + content = str(from_bytes(data).best()) # Parse CSV content reader = csv.reader(io.StringIO(content)) diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 4d62e4919..d3b4570ec 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -220,6 +220,20 @@ def test_data_uris() -> None: assert data == b"Hello, World!" 
+def test_csv_falls_back_when_charset_hint_fails() -> None: + content = ("x" * 4096 + ",café\n").encode("utf-8") + result = MarkItDown().convert( + io.BytesIO(content), + stream_info=StreamInfo( + extension=".csv", + mimetype="text/csv", + charset="ascii", + ), + ) + + assert "café" in result.markdown + + def test_file_uris() -> None: # Test file URI with an empty host file_uri = "file:///path/to/file.txt"