diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
index d6fa8db69..277469abd 100644
--- a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
+++ b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py
@@ -1,7 +1,8 @@
import zipfile
from io import BytesIO
from typing import BinaryIO
-from xml.etree import ElementTree as ET
+from defusedxml import ElementTree as ET
+from lxml import etree
from bs4 import BeautifulSoup, Tag
@@ -107,7 +108,14 @@ def _pre_process_math(content: bytes) -> bytes:
Returns:
bytes: The processed content with OMML elements replaced by their LaTeX equivalents, encoded as bytes.
"""
- soup = BeautifulSoup(content.decode(), features="xml")
+ # Sanitize XML to prevent XXE and entity expansion attacks (CWE-611).
+ # Parse with lxml using resolve_entities=False so that external/internal
+ # entity references are never expanded, then re-serialize the clean tree.
+ safe_parser = etree.XMLParser(resolve_entities=False, no_network=True)
+ tree = etree.fromstring(content, parser=safe_parser)
+ sanitized_content = etree.tostring(tree, xml_declaration=True, encoding="utf-8")
+
+ soup = BeautifulSoup(sanitized_content, features="xml")
for tag in soup.find_all("oMathPara"):
_replace_equations(tag)
for tag in soup.find_all("oMath"):
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 8e3acc23d..6fcefbfa5 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -3,6 +3,7 @@
import os
import re
import shutil
+import zipfile
import pytest
from unittest.mock import MagicMock
@@ -274,6 +275,109 @@ def test_docx_equations() -> None:
assert block_equations, "No block equations found in the document."
+def _build_docx_with_xml(document_xml: bytes) -> io.BytesIO:
+ """Helper: build a minimal DOCX zip containing the given word/document.xml."""
+ content_types = b"""\
+
+
+
+
+
+"""
+ root_rels = b"""\
+
+
+
+"""
+ word_rels = b"""\
+
+"""
+
+ buf = io.BytesIO()
+ with zipfile.ZipFile(buf, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
+ zf.writestr("[Content_Types].xml", content_types)
+ zf.writestr("_rels/.rels", root_rels)
+ zf.writestr("word/_rels/document.xml.rels", word_rels)
+ zf.writestr("word/document.xml", document_xml)
+ buf.seek(0)
+ return buf
+
+
+def test_docx_xxe_entity_blocked() -> None:
+ """Regression test: DOCX with XXE payload must not leak local file contents."""
+ from markitdown.converter_utils.docx.pre_process import pre_process_docx
+
+ xxe_xml = b"""\
+
+
+]>
+
+
+
+
+ &xxe;
+
+
+
+"""
+
+ malicious_docx = _build_docx_with_xml(xxe_xml)
+
+ # Read the target file so we know what to look for
+ target_content = ""
+ try:
+ target_content = open("/etc/hostname").read().strip()
+ except FileNotFoundError:
+ pass # Non-Linux; the entity simply won't resolve — still must not crash
+
+ # pre_process_docx should either sanitize the payload or fall back gracefully
+ result_buf = pre_process_docx(malicious_docx)
+ result_buf.seek(0)
+ with zipfile.ZipFile(result_buf) as zf:
+ doc_xml = zf.read("word/document.xml").decode(errors="replace")
+
+ if target_content:
+ assert target_content not in doc_xml, (
+ "XXE payload was expanded — local file content leaked into output"
+ )
+
+
+def test_docx_billion_laughs_blocked() -> None:
+ """Regression test: DOCX with Billion Laughs must not cause entity expansion."""
+ from markitdown.converter_utils.docx.pre_process import pre_process_docx
+
+ bomb_xml = b"""\
+
+
+
+
+
+
+]>
+
+ &lol5;
+"""
+
+ malicious_docx = _build_docx_with_xml(bomb_xml)
+ result_buf = pre_process_docx(malicious_docx)
+ result_buf.seek(0)
+ with zipfile.ZipFile(result_buf) as zf:
+ doc_xml = zf.read("word/document.xml").decode(errors="replace")
+
+ # If entity expansion happened, "lol" would appear thousands of times
+ assert doc_xml.count("lol") < 100, (
+ "Billion Laughs entity expansion detected in output"
+ )
+
+
def test_input_as_strings() -> None:
markitdown = MarkItDown()