From e63f1eb7e02b3c920f8f24cedc462ca3485e88aa Mon Sep 17 00:00:00 2001 From: venti <1308199824@qq.com> Date: Sat, 30 May 2026 16:26:48 +0800 Subject: [PATCH] fix: decode exiftool JSON output as UTF-8 instead of locale encoding ExifTool always outputs JSON in UTF-8 (per RFC 8259), but the exiftool_metadata() function decoded the output using locale.getpreferredencoding(). On systems with non-UTF-8 locale encoding (e.g. Windows with Chinese locale cp936), this caused UnicodeDecodeError when ExifTool returned non-ASCII metadata. Fix: - Use encoding='utf-8' instead of text=True in version-check subprocess.run() so stdout is decoded as UTF-8. - Decode JSON output bytes with 'utf-8' instead of locale.getpreferredencoding(). - Remove unused import of locale. Fixes #1972 --- .../src/markitdown/converters/_exiftool.py | 5 ++--- packages/markitdown/tests/test_module_misc.py | 22 +++++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/packages/markitdown/src/markitdown/converters/_exiftool.py b/packages/markitdown/src/markitdown/converters/_exiftool.py index f605024fd..8766fcc4a 100644 --- a/packages/markitdown/src/markitdown/converters/_exiftool.py +++ b/packages/markitdown/src/markitdown/converters/_exiftool.py @@ -1,5 +1,4 @@ import json -import locale import subprocess from typing import Any, BinaryIO, Union @@ -22,7 +21,7 @@ def exiftool_metadata( version_output = subprocess.run( [exiftool_path, "-ver"], capture_output=True, - text=True, + encoding="utf-8", check=True, ).stdout.strip() version = _parse_version(version_output) @@ -46,7 +45,7 @@ def exiftool_metadata( ).stdout return json.loads( - output.decode(locale.getpreferredencoding(False)), + output.decode("utf-8"), )[0] finally: file_stream.seek(cur_pos) diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 4d62e4919..4fe916e2a 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -462,6 +462,27 @@ def test_markitdown_exiftool() -> None: assert target in result.text_content +def test_exiftool_metadata_decodes_utf8() -> None: + """ExifTool JSON output must be decoded as UTF-8 per RFC 8259.""" + from unittest.mock import patch, MagicMock + from markitdown.converters._exiftool import exiftool_metadata + + version_mock = MagicMock(stdout="13.0\n") + # JSON with non-ASCII characters encoded in UTF-8 (C3 89 = É, C3 A9 = é) + metadata_mock = MagicMock( + stdout=b'[{"Author":"\xc3\x89lena","Title":"Caf\xc3\xa9 R\xc3\xa9sum\xc3\xa9"}]' + ) + + with patch("subprocess.run") as mock_run: + mock_run.side_effect = [version_mock, metadata_mock] + result = exiftool_metadata( + io.BytesIO(b"fake-data"), exiftool_path="/fake/exiftool" + ) + + assert result["Author"] == "Élena" + assert result["Title"] == "Café Résumé" + + def test_markitdown_llm_parameters() -> None: """Test that LLM parameters are correctly passed to the client.""" mock_client = MagicMock() @@ -545,6 +566,7 @@ def test_markitdown_llm() -> None: test_exceptions, test_doc_rlink, test_markitdown_exiftool, + test_exiftool_metadata_decodes_utf8, test_markitdown_llm_parameters, test_markitdown_llm, ]: