diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py index d6fa8db69..277469abd 100644 --- a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py +++ b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py @@ -1,7 +1,8 @@ import zipfile from io import BytesIO from typing import BinaryIO -from xml.etree import ElementTree as ET +from defusedxml import ElementTree as ET +from lxml import etree from bs4 import BeautifulSoup, Tag @@ -107,7 +108,14 @@ def _pre_process_math(content: bytes) -> bytes: Returns: bytes: The processed content with OMML elements replaced by their LaTeX equivalents, encoded as bytes. """ - soup = BeautifulSoup(content.decode(), features="xml") + # Sanitize XML to prevent XXE and entity expansion attacks (CWE-611). + # Parse with lxml using resolve_entities=False so that external/internal + # entity references are never expanded, then re-serialize the clean tree. + safe_parser = etree.XMLParser(resolve_entities=False, no_network=True) + tree = etree.fromstring(content, parser=safe_parser) + sanitized_content = etree.tostring(tree, xml_declaration=True, encoding="utf-8") + + soup = BeautifulSoup(sanitized_content, features="xml") for tag in soup.find_all("oMathPara"): _replace_equations(tag) for tag in soup.find_all("oMath"): diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 8e3acc23d..6fcefbfa5 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -3,6 +3,7 @@ import os import re import shutil +import zipfile import pytest from unittest.mock import MagicMock @@ -274,6 +275,109 @@ def test_docx_equations() -> None: assert block_equations, "No block equations found in the document." +def _build_docx_with_xml(document_xml: bytes) -> io.BytesIO: + """Helper: build a minimal DOCX zip containing the given word/document.xml.""" + content_types = b"""\ + + + + + +""" + root_rels = b"""\ + + + +""" + word_rels = b"""\ + +""" + + buf = io.BytesIO() + with zipfile.ZipFile(buf, mode="w", compression=zipfile.ZIP_DEFLATED) as zf: + zf.writestr("[Content_Types].xml", content_types) + zf.writestr("_rels/.rels", root_rels) + zf.writestr("word/_rels/document.xml.rels", word_rels) + zf.writestr("word/document.xml", document_xml) + buf.seek(0) + return buf + + +def test_docx_xxe_entity_blocked() -> None: + """Regression test: DOCX with XXE payload must not leak local file contents.""" + from markitdown.converter_utils.docx.pre_process import pre_process_docx + + xxe_xml = b"""\ + + +]> + + + + + &xxe; + + + +""" + + malicious_docx = _build_docx_with_xml(xxe_xml) + + # Read the target file so we know what to look for + target_content = "" + try: + target_content = open("/etc/hostname").read().strip() + except FileNotFoundError: + pass # Non-Linux; the entity simply won't resolve — still must not crash + + # pre_process_docx should either sanitize the payload or fall back gracefully + result_buf = pre_process_docx(malicious_docx) + result_buf.seek(0) + with zipfile.ZipFile(result_buf) as zf: + doc_xml = zf.read("word/document.xml").decode(errors="replace") + + if target_content: + assert target_content not in doc_xml, ( + "XXE payload was expanded — local file content leaked into output" + ) + + +def test_docx_billion_laughs_blocked() -> None: + """Regression test: DOCX with Billion Laughs must not cause entity expansion.""" + from markitdown.converter_utils.docx.pre_process import pre_process_docx + + bomb_xml = b"""\ + + + + + + +]> + + &lol5; +""" + + malicious_docx = _build_docx_with_xml(bomb_xml) + result_buf = pre_process_docx(malicious_docx) + result_buf.seek(0) + with zipfile.ZipFile(result_buf) as zf: + doc_xml = zf.read("word/document.xml").decode(errors="replace") + + # If entity expansion happened, "lol" would appear thousands of times + assert doc_xml.count("lol") < 100, ( + "Billion Laughs entity expansion detected in output" + ) + + def test_input_as_strings() -> None: markitdown = MarkItDown()