Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,7 @@ def do_r(self, elm):
@todo \text (latex pure text support)
"""
_str = []
for s in elm.findtext("./{0}t".format(OMML_NS)):
for s in (elm.findtext("./{0}t".format(OMML_NS)) or ""):
# s = s if isinstance(s,unicode) else unicode(s,'utf-8')
_str.append(self._t_dict.get(s, s))
return escape_latex(BLANK.join(_str))
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import logging
import zipfile
from io import BytesIO
from typing import BinaryIO
from xml.etree import ElementTree as ET

from bs4 import BeautifulSoup, Tag

logger = logging.getLogger(__name__)

from .math.omml import OMML_NS, oMath2Latex

MATH_ROOT_TEMPLATE = "".join(
Expand Down Expand Up @@ -109,9 +112,15 @@ def _pre_process_math(content: bytes) -> bytes:
"""
soup = BeautifulSoup(content.decode(), features="xml")
for tag in soup.find_all("oMathPara"):
_replace_equations(tag)
try:
_replace_equations(tag)
except Exception:
logger.warning("Failed to convert oMathPara equation to LaTeX", exc_info=True)
for tag in soup.find_all("oMath"):
_replace_equations(tag)
try:
_replace_equations(tag)
except Exception:
logger.warning("Failed to convert oMath equation to LaTeX", exc_info=True)
return str(soup).encode()


Expand Down
22 changes: 22 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,28 @@ def test_docx_equations() -> None:
assert block_equations, "No block equations found in the document."


def test_docx_equations_omit_empty_run() -> None:
    """Check that an ``m:r`` run lacking an ``m:t`` child is tolerated.

    Regression test: such a run must not crash the OMML-to-LaTeX conversion,
    and the equation text carried by the sibling run must still be emitted.
    """
    from markitdown.converter_utils.docx.pre_process import _pre_process_math

    # A single oMath holding a formatting-only m:r (no m:t) and then a
    # normal m:r that carries the text "x".
    xml_parts = (
        b'<?xml version="1.0" encoding="UTF-8"?>',
        b'<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"',
        b' xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math">',
        b"<w:body><w:p>",
        b"<m:oMath>",
        b" <m:r><m:rPr><m:sty m:val=\"bi\"/></m:rPr></m:r>",
        b" <m:r><m:t>x</m:t></m:r>",
        b"</m:oMath>",
        b"</w:p></w:body></w:document>",
    )
    document_xml = b"".join(xml_parts)

    converted = _pre_process_math(document_xml).decode()

    # The equation should survive and come out wrapped in $ signs.
    assert "$x$" in converted


def test_input_as_strings() -> None:
markitdown = MarkItDown()

Expand Down