Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import zipfile
from io import BytesIO
from typing import BinaryIO
from xml.etree import ElementTree as ET
from defusedxml import ElementTree as ET
from lxml import etree

from bs4 import BeautifulSoup, Tag

Expand Down Expand Up @@ -107,7 +108,14 @@ def _pre_process_math(content: bytes) -> bytes:
Returns:
bytes: The processed content with OMML elements replaced by their LaTeX equivalents, encoded as bytes.
"""
soup = BeautifulSoup(content.decode(), features="xml")
# Sanitize XML to prevent XXE and entity expansion attacks (CWE-611).
# Parse with lxml using resolve_entities=False so that external/internal
# entity references are never expanded, then re-serialize the clean tree.
safe_parser = etree.XMLParser(resolve_entities=False, no_network=True)
tree = etree.fromstring(content, parser=safe_parser)
sanitized_content = etree.tostring(tree, xml_declaration=True, encoding="utf-8")

soup = BeautifulSoup(sanitized_content, features="xml")
for tag in soup.find_all("oMathPara"):
_replace_equations(tag)
for tag in soup.find_all("oMath"):
Expand Down
104 changes: 104 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import re
import shutil
import zipfile
import pytest
from unittest.mock import MagicMock

Expand Down Expand Up @@ -274,6 +275,109 @@ def test_docx_equations() -> None:
assert block_equations, "No block equations found in the document."


def _build_docx_with_xml(document_xml: bytes) -> io.BytesIO:
    """Assemble a minimal in-memory DOCX archive around the given main part.

    The archive holds four members, written in this order: the content-type
    manifest, the package-level relationships, an empty relationships part
    for the main document, and ``word/document.xml``.

    Args:
        document_xml: Raw bytes stored verbatim as ``word/document.xml``.

    Returns:
        A ``BytesIO`` rewound to offset 0 containing the finished,
        DEFLATE-compressed zip archive.
    """
    # Ordered (archive name, payload) pairs; the caller's document goes last.
    members = (
        (
            "[Content_Types].xml",
            b"""\
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">
<Default Extension="rels"
ContentType="application/vnd.openxmlformats-package.relationships+xml"/>
<Default Extension="xml" ContentType="application/xml"/>
<Override PartName="/word/document.xml"
ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>
</Types>""",
        ),
        (
            "_rels/.rels",
            b"""\
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId1"
Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument"
Target="word/document.xml"/>
</Relationships>""",
        ),
        (
            "word/_rels/document.xml.rels",
            b"""\
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"/>""",
        ),
        ("word/document.xml", document_xml),
    )

    archive = io.BytesIO()
    with zipfile.ZipFile(
        archive, mode="w", compression=zipfile.ZIP_DEFLATED
    ) as docx:
        for member_name, payload in members:
            docx.writestr(member_name, payload)

    # Rewind so callers can read the archive from the start.
    archive.seek(0)
    return archive


def test_docx_xxe_entity_blocked() -> None:
    """Regression test: DOCX with XXE payload must not leak local file contents.

    Builds a DOCX whose ``word/document.xml`` declares an external entity
    pointing at ``/etc/hostname`` and references it inside an ``oMath``
    element, runs it through ``pre_process_docx``, and checks that the
    file's contents do not appear in the rewritten ``word/document.xml``.

    When ``/etc/hostname`` cannot be read (or is empty) the content assertion
    is skipped; the pre-processing call must still complete without raising.
    """
    from markitdown.converter_utils.docx.pre_process import pre_process_docx

    xxe_xml = b"""\
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<!DOCTYPE w:document [
<!ENTITY xxe SYSTEM "file:///etc/hostname">
]>
<w:document
xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p>
<m:oMathPara>
<m:oMath>&xxe;</m:oMath>
</m:oMathPara>
</w:p>
</w:body>
</w:document>"""

    malicious_docx = _build_docx_with_xml(xxe_xml)

    # Read the target file so we know what to look for. The handle is closed
    # deterministically, and any OS-level failure (missing file on non-Linux
    # hosts, permission denied in a sandbox, ...) just disables the content
    # check instead of erroring out of the test.
    target_content = ""
    try:
        with open("/etc/hostname", encoding="utf-8", errors="replace") as fh:
            target_content = fh.read().strip()
    except OSError:
        pass  # Entity target unavailable; the call below still must not crash

    # pre_process_docx should either sanitize the payload or fall back gracefully
    result_buf = pre_process_docx(malicious_docx)
    result_buf.seek(0)
    with zipfile.ZipFile(result_buf) as zf:
        doc_xml = zf.read("word/document.xml").decode(errors="replace")

    if target_content:
        assert target_content not in doc_xml, (
            "XXE payload was expanded — local file content leaked into output"
        )


def test_docx_billion_laughs_blocked() -> None:
    """Regression test: nested-entity ("Billion Laughs") DOCX must stay small.

    Feeds ``pre_process_docx`` a document whose DTD defines five levels of
    tenfold-nested entities and references the outermost one in a text run,
    then verifies the resulting ``word/document.xml`` does not contain the
    expanded payload.
    """
    from markitdown.converter_utils.docx.pre_process import pre_process_docx

    bomb_xml = b"""\
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<!DOCTYPE w:document [
<!ENTITY lol "lol">
<!ENTITY lol2 "&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;&lol;">
<!ENTITY lol3 "&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;&lol2;">
<!ENTITY lol4 "&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;&lol3;">
<!ENTITY lol5 "&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;&lol4;">
]>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>&lol5;</w:t></w:r></w:p></w:body>
</w:document>"""

    processed = pre_process_docx(_build_docx_with_xml(bomb_xml))
    processed.seek(0)
    with zipfile.ZipFile(processed) as archive:
        raw_document = archive.read("word/document.xml")
    doc_xml = raw_document.decode(errors="replace")

    # A full expansion of &lol5; would repeat "lol" thousands of times, so a
    # small occurrence count shows the entities were left unexpanded.
    occurrences = doc_xml.count("lol")
    assert occurrences < 100, (
        "Billion Laughs entity expansion detected in output"
    )


def test_input_as_strings() -> None:
markitdown = MarkItDown()

Expand Down