diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py index dabb0d7d3..029b27f57 100644 --- a/packages/markitdown/src/markitdown/converters/_html_converter.py +++ b/packages/markitdown/src/markitdown/converters/_html_converter.py @@ -1,4 +1,5 @@ import io +import warnings from typing import Any, BinaryIO, Optional from bs4 import BeautifulSoup @@ -44,6 +45,10 @@ def convert( stream_info: StreamInfo, **kwargs: Any, # Options to pass to the converter ) -> DocumentConverterResult: + # Pop our own keyword before forwarding the rest to markdownify. + # strict=True raises RecursionError instead of falling back to plain text. + strict: bool = kwargs.pop("strict", False) + # Parse the stream encoding = "utf-8" if stream_info.charset is None else stream_info.charset soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding) @@ -55,10 +60,25 @@ def convert( # Print only the main content body_elm = soup.find("body") webpage_text = "" - if body_elm: - webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm) - else: - webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup) + try: + if body_elm: + webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm) + else: + webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup) + except RecursionError: + if strict: + raise + # Large or deeply-nested HTML can exceed Python's recursion limit + # during markdownify's recursive DOM traversal. Fall back to + # BeautifulSoup's iterative get_text() so the caller still gets + # usable plain-text content instead of raw HTML. + warnings.warn( + "HTML document is too deeply nested for markdown conversion " + "(RecursionError). Falling back to plain-text extraction.", + stacklevel=2, + ) + target = body_elm if body_elm else soup + webpage_text = target.get_text("\n", strip=True) assert isinstance(webpage_text, str) diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 8e3acc23d..4d62e4919 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -288,6 +288,56 @@ def test_input_as_strings() -> None: assert "# Test" in result.text_content +def test_deeply_nested_html_fallback() -> None: + """Large, deeply nested HTML should fall back to plain-text extraction + instead of silently returning unconverted HTML (issue #1636). + + Note: This test uses sys.setrecursionlimit to guarantee a RecursionError + regardless of the host environment's default limit, making it deterministic + across different platforms and CI configurations. + """ + import sys + import warnings + + markitdown = MarkItDown() + + # Use a small recursion limit so the test is environment-independent. + # We restore the original limit in a finally block to avoid side-effects. + original_limit = sys.getrecursionlimit() + low_limit = 200 # well below markdownify's traversal depth for depth=500 + + # Build HTML with nesting deep enough to trigger RecursionError + depth = 500 + html = "
" + for _ in range(depth): + html += 'Deep content with bold text
" + for _ in range(depth): + html += "