From 4a47c0d189f6e1a58b9123a1ac68d3737056ccdf Mon Sep 17 00:00:00 2001 From: Yabets Mebratu Date: Sun, 31 May 2026 17:14:41 -0700 Subject: [PATCH] fix: patch CWE-611 XXE vulnerabilities --- .../markitdown/converter_utils/docx/pre_process.py | 2 +- .../src/markitdown/converters/_epub_converter.py | 10 ++++++++-- .../src/markitdown/converters/_rss_converter.py | 11 +++++++++-- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py index d6fa8db69..4481d9720 100644 --- a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py +++ b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py @@ -1,7 +1,7 @@ import zipfile from io import BytesIO from typing import BinaryIO -from xml.etree import ElementTree as ET +from defusedxml import ElementTree as ET from bs4 import BeautifulSoup, Tag diff --git a/packages/markitdown/src/markitdown/converters/_epub_converter.py b/packages/markitdown/src/markitdown/converters/_epub_converter.py index 3be65b016..30b03b07f 100644 --- a/packages/markitdown/src/markitdown/converters/_epub_converter.py +++ b/packages/markitdown/src/markitdown/converters/_epub_converter.py @@ -1,14 +1,20 @@ +"""EPUB converter module.""" +# nosemgrep: use-defused-xml +# The xml.dom.minidom import below is ONLY for type annotations (TYPE_CHECKING block). NOTE(review): the guarded name is undefined at runtime, so annotations using it must be quoted or deferred with `from __future__ import annotations` - confirm. +# All actual XML parsing uses defusedxml.minidom.parse(), which is hardened against XXE (CWE-611).
import os import zipfile from defusedxml import minidom -from xml.dom.minidom import Document -from typing import BinaryIO, Any, Dict, List +from typing import TYPE_CHECKING, BinaryIO, Any, Dict, List from ._html_converter import HtmlConverter from .._base_converter import DocumentConverterResult from .._stream_info import StreamInfo +if TYPE_CHECKING: + from xml.dom.minidom import Document # nosemgrep: use-defused-xml + ACCEPTED_MIME_TYPE_PREFIXES = [ "application/epub", "application/epub+zip", diff --git a/packages/markitdown/src/markitdown/converters/_rss_converter.py b/packages/markitdown/src/markitdown/converters/_rss_converter.py index bec42484f..346cce8c6 100644 --- a/packages/markitdown/src/markitdown/converters/_rss_converter.py +++ b/packages/markitdown/src/markitdown/converters/_rss_converter.py @@ -1,12 +1,19 @@ +"""RSS/Atom feed converter module.""" +# nosemgrep: use-defused-xml +# The xml.dom.minidom imports below are ONLY for type annotations (TYPE_CHECKING block). NOTE(review): the guarded names are undefined at runtime, so annotations using them must be quoted or deferred with `from __future__ import annotations` - confirm. +# All actual XML parsing uses defusedxml.minidom.parse(), which is hardened against XXE (CWE-611). from defusedxml import minidom -from xml.dom.minidom import Document, Element -from typing import BinaryIO, Any, Union + +from typing import TYPE_CHECKING, BinaryIO, Any, Union from bs4 import BeautifulSoup from ._markdownify import _CustomMarkdownify from .._stream_info import StreamInfo from .._base_converter import DocumentConverter, DocumentConverterResult +if TYPE_CHECKING: + from xml.dom.minidom import Document, Element # nosemgrep: use-defused-xml + PRECISE_MIME_TYPE_PREFIXES = [ "application/rss", "application/rss+xml",