diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py b/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py index dfa734cdc..c67bc4890 100644 --- a/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py +++ b/packages/markitdown/src/markitdown/converter_utils/docx/math/omml.py @@ -373,7 +373,7 @@ def do_r(self, elm): @todo \text (latex pure text support) """ _str = [] - for s in elm.findtext("./{0}t".format(OMML_NS)): + for s in (elm.findtext("./{0}t".format(OMML_NS)) or ""): # s = s if isinstance(s,unicode) else unicode(s,'utf-8') _str.append(self._t_dict.get(s, s)) return escape_latex(BLANK.join(_str)) diff --git a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py index d6fa8db69..9060366df 100644 --- a/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py +++ b/packages/markitdown/src/markitdown/converter_utils/docx/pre_process.py @@ -1,3 +1,4 @@ +import logging import zipfile from io import BytesIO from typing import BinaryIO @@ -5,6 +6,8 @@ from bs4 import BeautifulSoup, Tag +logger = logging.getLogger(__name__) + from .math.omml import OMML_NS, oMath2Latex MATH_ROOT_TEMPLATE = "".join( @@ -109,9 +112,15 @@ def _pre_process_math(content: bytes) -> bytes: """ soup = BeautifulSoup(content.decode(), features="xml") for tag in soup.find_all("oMathPara"): - _replace_equations(tag) + try: + _replace_equations(tag) + except Exception: + logger.warning("Failed to convert oMathPara equation to LaTeX", exc_info=True) for tag in soup.find_all("oMath"): - _replace_equations(tag) + try: + _replace_equations(tag) + except Exception: + logger.warning("Failed to convert oMath equation to LaTeX", exc_info=True) return str(soup).encode() diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 8e3acc23d..ecc9e1be3 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -274,6 +274,28 @@ def test_docx_equations() -> None: assert block_equations, "No block equations found in the document." +def test_docx_equations_omit_empty_run() -> None: + """Regression test: m:r elements without m:t child must not crash the + OMML-to-LaTeX conversion and must not cause other equations to be lost.""" + from markitdown.converter_utils.docx.pre_process import _pre_process_math + + # An oMath with one formatting-only m:r (no m:t) followed by a normal m:r + content = ( + b'' + b'' + b"" + b"" + b" " + b" x" + b"" + b"" + ) + result = _pre_process_math(content).decode() + # The equation should be present and wrapped in $ signs + assert "$x$" in result + + def test_input_as_strings() -> None: markitdown = MarkItDown()