Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import struct
import zipfile
from io import BytesIO
from typing import BinaryIO
Expand Down Expand Up @@ -115,6 +116,59 @@ def _pre_process_math(content: bytes) -> bytes:
return str(soup).encode()


def _fix_zip_filename_casing(input_docx: BinaryIO) -> BinaryIO:
    """
    Patch local file header filenames so they match the central directory.

    Some document generators (e.g. certain Microsoft Word versions,
    legal document systems) produce .docx/.pptx files where the central
    directory records one casing (e.g. 'customXml/item2.xml') but
    the local file headers record another (e.g. 'customXML/item2.xml').
    Python's zipfile module raises BadZipFile when reading such files.

    The central directory is treated as authoritative, because it is what
    zipfile.ZipFile uses to list members. For every entry, the filename
    bytes stored in the local file header are overwritten with the central
    directory's filename bytes whenever the two differ but have the same
    length (so no other offset in the archive has to move).

    Args:
        input_docx: Binary stream containing the ZIP archive. It is
            rewound and read in full.

    Returns:
        A new in-memory stream with the patched archive if at least one
        local header was rewritten; otherwise ``input_docx`` itself,
        rewound to position 0. The original stream is also returned
        unchanged when the central directory cannot be read.
    """
    # General-purpose flag bit 11: the filename is stored as UTF-8.
    utf8_filename_flag = 0x800
    local_header_signature = b"PK\x03\x04"
    # Fixed-size part of a local file header; the filename follows it.
    local_header_size = 30

    input_docx.seek(0)
    raw = bytearray(input_docx.read())

    # Read the central directory to get the authoritative filenames.
    try:
        with zipfile.ZipFile(BytesIO(raw), "r") as zf:
            cd_entries = {}
            for zi in zf.infolist():
                # zipfile decodes names as UTF-8 when bit 11 is set and as
                # cp437 otherwise. Re-encoding with the same codec yields
                # the exact bytes stored in the central directory; always
                # using UTF-8 would change the byte length of non-ASCII
                # legacy names and prevent them from ever being patched.
                encoding = "utf-8" if zi.flag_bits & utf8_filename_flag else "cp437"
                cd_entries[zi.header_offset] = zi.orig_filename.encode(encoding)
    except zipfile.BadZipFile:
        # Can't even read the central directory: return as-is and let the
        # real consumer report the failure.
        input_docx.seek(0)
        return input_docx

    patched = False
    for offset, central_name in cd_entries.items():
        name_start = offset + local_header_size

        # Skip entries whose recorded offset does not point at a complete
        # local file header.
        if name_start > len(raw):
            continue
        if raw[offset : offset + 4] != local_header_signature:
            continue

        # The filename length is a little-endian uint16 at header offset 26.
        local_fname_len = struct.unpack_from("<H", raw, offset + 26)[0]
        name_end = name_start + local_fname_len
        if name_end > len(raw):
            continue

        local_name = bytes(raw[name_start:name_end])

        # Patch in place only when the byte length is unchanged; a casing
        # mismatch always satisfies this.
        if local_name != central_name and len(local_name) == len(central_name):
            raw[name_start:name_end] = central_name
            patched = True

    if patched:
        return BytesIO(raw)
    input_docx.seek(0)
    return input_docx


def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
"""
Pre-processes a DOCX file with provided steps.
Expand All @@ -129,6 +183,9 @@ def pre_process_docx(input_docx: BinaryIO) -> BinaryIO:
Returns:
BinaryIO: A binary output stream representing the processed DOCX file.
"""
# Fix ZIP filename casing mismatch before any processing
input_docx = _fix_zip_filename_casing(input_docx)

output_docx = BytesIO()
# The files that need to be pre-processed from .docx
pre_process_enable_files = [
Expand Down
67 changes: 67 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import re
import shutil
import zipfile
import pytest
from unittest.mock import MagicMock

Expand Down Expand Up @@ -274,6 +275,72 @@ def test_docx_equations() -> None:
assert block_equations, "No block equations found in the document."


def test_docx_zip_filename_casing_mismatch() -> None:
    """Check that a DOCX with inconsistent ZIP filename casing still converts.

    The test takes the regular ``test.docx`` fixture, upper-cases the
    filename stored in every local file header while leaving the central
    directory untouched, confirms that plain ``zipfile`` rejects the
    result with BadZipFile, and then asserts that MarkItDown produces the
    same markdown for the corrupted bytes as for the original file.

    See: https://github.com/microsoft/markitdown/issues/1812
    """
    import struct

    markitdown = MarkItDown()
    docx_path = os.path.join(TEST_FILES_DIR, "test.docx")

    # Baseline conversion of the untouched fixture.
    original_result = markitdown.convert(docx_path)
    assert original_result.markdown.strip(), "Original DOCX should have content"

    with open(docx_path, "rb") as fh:
        corrupted = bytearray(fh.read())

    # Walk the local file headers from the start of the archive and
    # upper-case each stored filename in place.
    signature = b"PK\x03\x04"
    fixed_header_len = 30
    total_len = len(corrupted)
    patched_count = 0
    pos = 0
    while pos + fixed_header_len <= total_len:
        if corrupted[pos : pos + 4] != signature:
            break
        # Filename length and extra-field length are adjacent uint16s.
        fname_len, extra_len = struct.unpack_from("<HH", corrupted, pos + 26)
        name_start = pos + fixed_header_len
        name_end = name_start + fname_len
        if name_end > total_len:
            break

        stored_name = corrupted[name_start:name_end]
        shouted_name = stored_name.upper()
        if shouted_name != stored_name:
            corrupted[name_start:name_end] = shouted_name
            patched_count += 1

        # Jump over the extra field and the compressed payload to reach
        # the next local header.
        (comp_size,) = struct.unpack_from("<I", corrupted, pos + 18)
        pos = name_end + extra_len + comp_size

    assert patched_count > 0, "Should have patched at least one local file header"

    corrupted_bytes = bytes(corrupted)

    # Plain zipfile must choke on the mismatch when members are read.
    with pytest.raises(zipfile.BadZipFile):
        with zipfile.ZipFile(io.BytesIO(corrupted_bytes), "r") as zf:
            for member in zf.namelist():
                zf.read(member)

    # MarkItDown must still be able to convert the corrupted archive.
    corrupted_result = markitdown.convert_stream(
        io.BytesIO(corrupted_bytes),
        file_extension=".docx",
    )
    corrupted_markdown = corrupted_result.markdown.strip()
    assert corrupted_markdown, "Corrupted DOCX should still produce content"
    # Content should be equivalent to the original.
    assert (
        original_result.markdown.strip() == corrupted_markdown
    ), "Corrupted DOCX should produce the same output as original"


def test_input_as_strings() -> None:
markitdown = MarkItDown()

Expand Down