From fe25ac3804829cbcbfc73d67b849df019361b063 Mon Sep 17 00:00:00 2001
From: lyydsheep <lyydsheep@lyydsheepdeMacBook-Pro.local>
Date: Sat, 30 May 2026 20:45:18 +0800
Subject: [PATCH] fix: handle DOCX files with inconsistent ZIP filename casing
 (#1812)

Some document generators (e.g. certain Microsoft Word versions, legal
document systems) produce .docx files where the central directory records
one casing (e.g. 'customXml/item2.xml') but the local file headers record
another (e.g. 'customXML/item2.xml'). Python's zipfile module raises
BadZipFile when reading such files.

Add _fix_zip_filename_casing() to patch local file header filenames to
match the central directory before any ZIP processing occurs.
---
 .../converter_utils/docx/pre_process.py       | 57 ++++++++++++++++
 packages/markitdown/tests/test_module_misc.py | 67 +++++++++++++++++++
 2 files changed, 124 insertions(+)

diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
index d6fa8db69..16f4babbf 100644
--- a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
+++ b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
@@ -1,3 +1,4 @@
+import struct
 import zipfile
 from io import BytesIO
 from typing import BinaryIO
@@ -115,6 +116,59 @@ def _pre_process_math(content: bytes) -> bytes:
     return str(soup).encode()
 
 
+def _fix_zip_filename_casing(input_docx: BinaryIO) -> BinaryIO:
+    """
+    Repair ZIP local file headers whose filename casing differs from
+    the central directory.
+
+    Some generators emit .docx/.pptx archives whose central directory
+    says e.g. 'customXml/item2.xml' while the local file header says
+    'customXML/item2.xml'; zipfile raises BadZipFile on such members.
+    Local names are overwritten in place with the central directory
+    spelling, which is the one zipfile.ZipFile treats as authoritative.
+
+    Returns a new BytesIO if anything was patched, otherwise the input
+    stream rewound to 0 (also when the central directory is unreadable).
+    """
+    input_docx.seek(0)
+    raw = bytearray(input_docx.read())
+
+    # Collect (local header offset, central name, UTF-8 flag) triples.
+    # zipfile decodes names as UTF-8 only when flag bit 0x800 is set and
+    # as cp437 otherwise, so each name must be re-encoded the same way.
+    try:
+        with zipfile.ZipFile(BytesIO(raw), "r") as zf:
+            cd_entries = [
+                (zi.header_offset, zi.orig_filename, zi.flag_bits & 0x800)
+                for zi in zf.infolist()
+            ]
+    except zipfile.BadZipFile:
+        # Can't even read central directory — return as-is, let it fail later
+        input_docx.seek(0)
+        return input_docx
+
+    patched = False
+    for offset, cd_name, is_utf8 in cd_entries:
+        # Skip entries that do not point at a complete local file header
+        if offset + 30 > len(raw) or raw[offset : offset + 4] != b"PK\x03\x04":
+            continue
+        name_end = offset + 30 + struct.unpack_from("<H", raw, offset + 26)[0]
+        if name_end > len(raw):
+            continue
+        local_name = bytes(raw[offset + 30 : name_end])
+        central_name = cd_name.encode("utf-8" if is_utf8 else "cp437")
+        # Same byte length is required so the in-place overwrite cannot
+        # shift the data that follows; any same-length mismatch is patched.
+        if local_name != central_name and len(local_name) == len(central_name):
+            raw[offset + 30 : name_end] = central_name
+            patched = True
+
+    if patched:
+        return BytesIO(bytes(raw))
+    input_docx.seek(0)
+    return input_docx
+
+
 def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
     """
     Pre-processes a DOCX file with provided steps.
@@ -129,6 +183,9 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
     Returns:
         BinaryIO: A binary output stream representing the processed DOCX file.
     """
+    # Fix ZIP filename casing mismatch before any processing
+    input_docx = _fix_zip_filename_casing(input_docx)
+
     output_docx = BytesIO()
     # The files that need to be pre-processed from .docx
     pre_process_enable_files = [
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 4d62e4919..8b17e3cdb 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -3,6 +3,7 @@
 import os
 import re
 import shutil
+import zipfile
 import pytest
 from unittest.mock import MagicMock
 
@@ -274,6 +275,72 @@ def test_docx_equations() -> None:
     assert block_equations, "No block equations found in the document."
 
 
+def test_docx_zip_filename_casing_mismatch() -> None:
+    """Test that DOCX files with inconsistent ZIP filename casing are handled.
+
+    Some document generators produce .docx files where the central directory
+    records one casing (e.g. 'word/document.xml') but the local file headers
+    record another (e.g. 'WORD/DOCUMENT.XML'). Python's zipfile module raises
+    BadZipFile when reading such files. This test verifies that MarkItDown
+    handles this gracefully.
+
+    See: https://github.com/microsoft/markitdown/issues/1812
+    """
+    import struct
+
+    markitdown = MarkItDown()
+    docx_file = os.path.join(TEST_FILES_DIR, "test.docx")
+
+    # Read the original docx and get its expected content
+    original_result = markitdown.convert(docx_file)
+    assert original_result.markdown.strip(), "Original DOCX should have content"
+
+    # Read raw bytes and corrupt the local file header filenames
+    with open(docx_file, "rb") as f:
+        corrupted = bytearray(f.read())
+
+    # Locate every local file header through the central directory rather
+    # than by walking the archive: local size fields are zero for entries
+    # written with a data descriptor, which would end a sequential walk early.
+    with zipfile.ZipFile(io.BytesIO(bytes(corrupted)), "r") as zf:
+        header_offsets = [zi.header_offset for zi in zf.infolist()]
+
+    patched_count = 0
+    for offset in header_offsets:
+        assert corrupted[offset : offset + 4] == b"PK\x03\x04"
+        fname_len = struct.unpack_from("<H", corrupted, offset + 26)[0]
+        name_start = offset + 30
+        name_end = name_start + fname_len
+        # Uppercase the filename in the local header only; the central
+        # directory keeps the original casing
+        old_name = corrupted[name_start:name_end]
+        new_name = old_name.upper()
+        if old_name != new_name:
+            corrupted[name_start:name_end] = new_name
+            patched_count += 1
+
+    assert patched_count > 0, "Should have patched at least one local file header"
+
+    # Verify the corrupted file would fail with plain zipfile
+    with pytest.raises(zipfile.BadZipFile):
+        with zipfile.ZipFile(io.BytesIO(bytes(corrupted)), "r") as zf:
+            for name in zf.namelist():
+                zf.read(name)
+
+    # Verify MarkItDown can still convert it
+    corrupted_result = markitdown.convert_stream(
+        io.BytesIO(bytes(corrupted)),
+        file_extension=".docx",
+    )
+    assert (
+        corrupted_result.markdown.strip()
+    ), "Corrupted DOCX should still produce content"
+    # Content should be equivalent to the original
+    assert (
+        original_result.markdown.strip() == corrupted_result.markdown.strip()
+    ), "Corrupted DOCX should produce the same output as original"
+
+
 def test_input_as_strings() -> None:
     markitdown = MarkItDown()