From fe25ac3804829cbcbfc73d67b849df019361b063 Mon Sep 17 00:00:00 2001 From: lyydsheep Date: Sat, 30 May 2026 20:45:18 +0800 Subject: [PATCH] fix: handle DOCX files with inconsistent ZIP filename casing (#1812) Some document generators (e.g. certain Microsoft Word versions, legal document systems) produce .docx files where the central directory records one casing (e.g. 'customXml/item2.xml') but the local file headers record another (e.g. 'customXML/item2.xml'). Python's zipfile module raises BadZipFile when reading such files. Add _fix_zip_filename_casing() to patch local file header filenames to match the central directory before any ZIP processing occurs. --- .../converter_utils/docx/pre_process.py | 57 ++++++++++++++++ packages/markitdown/tests/test_module_misc.py | 67 +++++++++++++++++++ 2 files changed, 124 insertions(+) diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py index d6fa8db69..16f4babbf 100644 --- a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py +++ b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py @@ -1,3 +1,4 @@ +import struct import zipfile from io import BytesIO from typing import BinaryIO @@ -115,6 +116,59 @@ def _pre_process_math(content: bytes) -> bytes: return str(soup).encode() +def _fix_zip_filename_casing(input_docx: BinaryIO) -> BinaryIO: + """ + Fix ZIP files where local file header filenames differ in casing + from the central directory filenames. + + Some document generators (e.g. certain Microsoft Word versions, + legal document systems) produce .docx/.pptx files where the central + directory records one casing (e.g. 'customXml/item2.xml') but + the local file headers record another (e.g. 'customXML/item2.xml'). + Python's zipfile module raises BadZipFile when reading such files. 
+ + This function patches local file header filenames to match the + central directory, which is the authoritative source used by + zipfile.ZipFile. + """ + input_docx.seek(0) + raw = bytearray(input_docx.read()) + + # Read the central directory to get authoritative filenames + try: + with zipfile.ZipFile(BytesIO(raw), "r") as zf: + cd_entries = { + zi.header_offset: zi.orig_filename for zi in zf.infolist() + } + except zipfile.BadZipFile: + # Can't even read central directory — return as-is, let it fail later + input_docx.seek(0) + return input_docx + + patched = False + for offset, cd_name in cd_entries.items(): + # Verify local file header signature + if offset + 30 > len(raw) or raw[offset : offset + 4] != b"PK\x03\x04": + continue + + local_fname_len = struct.unpack_from("<H", raw, offset + 26)[0] + if offset + 30 + local_fname_len > len(raw): + continue + + local_name = bytes(raw[offset + 30 : offset + 30 + local_fname_len]) + central_name = cd_name.encode("utf-8") + + # Only patch if lengths match but content differs (casing mismatch) + if local_name != central_name and len(local_name) == len(central_name): + raw[offset + 30 : offset + 30 + local_fname_len] = central_name + patched = True + + if patched: + return BytesIO(bytes(raw)) + input_docx.seek(0) + return input_docx + + def pre_process_docx(input_docx: BinaryIO) -> BinaryIO: """ Pre-processes a DOCX file with provided steps. @@ -129,6 +183,9 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO: Returns: BinaryIO: A binary output stream representing the processed DOCX file. 
""" + # Fix ZIP filename casing mismatch before any processing + input_docx = _fix_zip_filename_casing(input_docx) + output_docx = BytesIO() # The files that need to be pre-processed from .docx pre_process_enable_files = [ diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 4d62e4919..8b17e3cdb 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -3,6 +3,7 @@ import os import re import shutil +import zipfile import pytest from unittest.mock import MagicMock @@ -274,6 +275,72 @@ def test_docx_equations() -> None: assert block_equations, "No block equations found in the document." +def test_docx_zip_filename_casing_mismatch() -> None: + """Test that DOCX files with inconsistent ZIP filename casing are handled. + + Some document generators produce .docx files where the central directory + records one casing (e.g. 'word/document.xml') but the local file headers + record another (e.g. 'Word/Document.XML'). Python's zipfile module raises + BadZipFile when reading such files. This test verifies that MarkItDown + handles this gracefully. 
+ + See: https://github.com/microsoft/markitdown/issues/1812 + """ + import struct + + markitdown = MarkItDown() + docx_file = os.path.join(TEST_FILES_DIR, "test.docx") + + # Read the original docx and get its expected content + original_result = markitdown.convert(docx_file) + assert original_result.markdown.strip(), "Original DOCX should have content" + + # Read raw bytes and corrupt the local file header filenames + with open(docx_file, "rb") as f: + raw = bytearray(f.read()) + + # Find all local file headers and uppercase their filenames + corrupted = bytearray(raw) + offset = 0 + patched_count = 0 + while offset + 30 <= len(corrupted): + if corrupted[offset : offset + 4] != b"PK\x03\x04": + break + fname_len = struct.unpack_from("<H", corrupted, offset + 26)[0] + extra_len = struct.unpack_from("<H", corrupted, offset + 28)[0] + if offset + 30 + fname_len > len(corrupted): + break + # Uppercase the filename in the local header + old_name = corrupted[offset + 30 : offset + 30 + fname_len] + new_name = old_name.upper() + if old_name != new_name: + corrupted[offset + 30 : offset + 30 + fname_len] = new_name + patched_count += 1 + comp_size = struct.unpack_from("<I", corrupted, offset + 18)[0] + offset += 30 + fname_len + extra_len + comp_size + + assert patched_count > 0, "Should have patched at least one local file header" + + # Verify the corrupted file would fail with plain zipfile + with pytest.raises(zipfile.BadZipFile): + with zipfile.ZipFile(io.BytesIO(bytes(corrupted)), "r") as zf: + for name in zf.namelist(): + zf.read(name) + + # Verify MarkItDown can still convert it + corrupted_result = markitdown.convert_stream( + io.BytesIO(bytes(corrupted)), + file_extension=".docx", + ) + assert ( + corrupted_result.markdown.strip() + ), "Corrupted DOCX should still produce content" + # Content should be equivalent to the original + assert ( + original_result.markdown.strip() == corrupted_result.markdown.strip() + ), "Corrupted DOCX should produce the same output as original" + + def test_input_as_strings() -> None: markitdown = MarkItDown()