From fb9e80b9027625946db33182cd5b453c8276da06 Mon Sep 17 00:00:00 2001
From: jigangz <jigangz@github.com>
Date: Fri, 27 Mar 2026 22:56:37 -0700
Subject: [PATCH 1/3] fix: handle deeply nested HTML that triggers
 RecursionError (#1636)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Large HTML files with deep DOM nesting (e.g., SEC EDGAR filings) cause
markdownify's recursive DOM traversal to exceed Python's default
recursion limit (1000). Previously this RecursionError was caught by
the top-level _convert() dispatcher, which then fell through to
PlainTextConverter — silently returning the raw HTML as 'markdown'
with no warning.

This fix catches RecursionError in HtmlConverter.convert() and falls
back to BeautifulSoup's iterative get_text() method, which handles
arbitrary nesting depths. A warning is emitted so callers know the
output is plain text rather than full markdown.

Root cause chain:
1. HtmlConverter.convert() calls markdownify.convert_soup() (recursive)
2. Deeply nested HTML (>~400 levels) triggers RecursionError
3. _convert() catches all Exceptions and stores them in failed_attempts
4. PlainTextConverter.accepts() matches text/html via 'text/' prefix
5. PlainTextConverter.convert() returns raw HTML bytes as text
6. Caller receives 'markdown' that is actually unconverted HTML
---
 .../markitdown/converters/_html_converter.py  | 23 +++++++++---
 packages/markitdown/tests/test_module_misc.py | 35 +++++++++++++++++++
 2 files changed, 54 insertions(+), 4 deletions(-)
diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py
index dabb0d7d3..f95480d2c 100644
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@@ -55,10 +55,25 @@ def convert(
         # Print only the main content
         body_elm = soup.find("body")
         webpage_text = ""
-        if body_elm:
-            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
-        else:
-            webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
+        try:
+            if body_elm:
+                webpage_text = _CustomMarkdownify(**kwargs).convert_soup(body_elm)
+            else:
+                webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
+        except RecursionError:
+            # Large or deeply-nested HTML can exceed Python's recursion limit
+            # during markdownify's recursive DOM traversal.  Fall back to
+            # BeautifulSoup's iterative get_text() so the caller still gets
+            # usable plain-text content instead of raw HTML.
+            import warnings
+
+            warnings.warn(
+                "HTML document is too deeply nested for markdown conversion "
+                "(RecursionError). Falling back to plain-text extraction.",
+                stacklevel=2,
+            )
+            target = body_elm if body_elm else soup
+            webpage_text = target.get_text("\n", strip=True)
 
         assert isinstance(webpage_text, str)
 
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 8e3acc23d..0fb968f37 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -288,6 +288,41 @@ def test_input_as_strings() -> None:
     assert "# Test" in result.text_content
 
 
+def test_deeply_nested_html_fallback() -> None:
+    """Large, deeply nested HTML should fall back to plain-text extraction
+    instead of silently returning unconverted HTML (issue #1636)."""
+    import warnings
+
+    markitdown = MarkItDown()
+
+    # Build HTML with nesting deep enough to trigger RecursionError in markdownify
+    depth = 500
+    html = "<html><body>"
+    for _ in range(depth):
+        html += '<div style="margin-left:10px">'
+    html += "<p>Deep content with <b>bold text</b></p>"
+    for _ in range(depth):
+        html += "</div>"
+    html += "</body></html>"
+
+    with warnings.catch_warnings(record=True) as w:
+        warnings.simplefilter("always")
+        result = markitdown.convert_stream(
+            io.BytesIO(html.encode("utf-8")),
+            file_extension=".html",
+        )
+
+        # Should have emitted a warning about the fallback
+        recursion_warnings = [x for x in w if "deeply nested" in str(x.message)]
+        assert len(recursion_warnings) > 0
+
+    # The output should contain the text content, not raw HTML
+    assert "Deep content" in result.text_content
+    assert "bold text" in result.text_content
+    assert "<div" not in result.text_content
+    assert "<p>" not in result.text_content
+
+
 def test_doc_rlink() -> None:
     # Test for: CVE-2025-11849
     markitdown = MarkItDown()

From 4b79faedc6f8a5e0b1e83d2f395918615252038a Mon Sep 17 00:00:00 2001
From: jigangz <115519042+jigangz@users.noreply.github.com>
Date: Sun, 5 Apr 2026 00:25:22 -0700
Subject: [PATCH 2/3] refactor: address review feedback on RecursionError
 fallback

- Move 'import warnings' to module top level (was inside except block)
- Make the test environment-independent by temporarily lowering the
  recursion limit with sys.setrecursionlimit(200) instead of relying on
  depth=500 being sufficient on all platforms; the original limit is
  restored in a finally block
- Add a strict keyword argument (default False); strict=True opts out of
  the plain-text fallback and lets RecursionError propagate to the caller
---
 .../markitdown/converters/_html_converter.py  |  9 ++++-
 packages/markitdown/tests/test_module_misc.py | 37 +++++++++++++------
 2 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/packages/markitdown/src/markitdown/converters/_html_converter.py b/packages/markitdown/src/markitdown/converters/_html_converter.py
index f95480d2c..029b27f57 100644
--- a/packages/markitdown/src/markitdown/converters/_html_converter.py
+++ b/packages/markitdown/src/markitdown/converters/_html_converter.py
@@ -1,4 +1,5 @@
 import io
+import warnings
 from typing import Any, BinaryIO, Optional
 from bs4 import BeautifulSoup
 
@@ -44,6 +45,10 @@ def convert(
         stream_info: StreamInfo,
         **kwargs: Any,  # Options to pass to the converter
     ) -> DocumentConverterResult:
+        # Pop our own keyword before forwarding the rest to markdownify.
+        # strict=True raises RecursionError instead of falling back to plain text.
+        strict: bool = kwargs.pop("strict", False)
+
         # Parse the stream
         encoding = "utf-8" if stream_info.charset is None else stream_info.charset
         soup = BeautifulSoup(file_stream, "html.parser", from_encoding=encoding)
@@ -61,12 +66,12 @@ def convert(
             else:
                 webpage_text = _CustomMarkdownify(**kwargs).convert_soup(soup)
         except RecursionError:
+            if strict:
+                raise
             # Large or deeply-nested HTML can exceed Python's recursion limit
             # during markdownify's recursive DOM traversal.  Fall back to
             # BeautifulSoup's iterative get_text() so the caller still gets
             # usable plain-text content instead of raw HTML.
-            import warnings
-
             warnings.warn(
                 "HTML document is too deeply nested for markdown conversion "
                 "(RecursionError). Falling back to plain-text extraction.",
diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index 0fb968f37..fceafe390 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -290,12 +290,23 @@ def test_input_as_strings() -> None:
 
 def test_deeply_nested_html_fallback() -> None:
     """Large, deeply nested HTML should fall back to plain-text extraction
-    instead of silently returning unconverted HTML (issue #1636)."""
+    instead of silently returning unconverted HTML (issue #1636).
+
+    Note: This test uses sys.setrecursionlimit to guarantee a RecursionError
+    regardless of the host environment's default limit, making it deterministic
+    across different platforms and CI configurations.
+    """
+    import sys
     import warnings
 
     markitdown = MarkItDown()
 
-    # Build HTML with nesting deep enough to trigger RecursionError in markdownify
+    # Use a small recursion limit so the test is environment-independent.
+    # We restore the original limit in a finally block to avoid side-effects.
+    original_limit = sys.getrecursionlimit()
+    low_limit = 200  # well below markdownify's traversal depth for depth=500
+
+    # Build HTML with nesting deep enough to trigger RecursionError
     depth = 500
     html = "<html><body>"
     for _ in range(depth):
@@ -305,16 +316,20 @@ def test_deeply_nested_html_fallback() -> None:
         html += "</div>"
     html += "</body></html>"
 
-    with warnings.catch_warnings(record=True) as w:
-        warnings.simplefilter("always")
-        result = markitdown.convert_stream(
-            io.BytesIO(html.encode("utf-8")),
-            file_extension=".html",
-        )
+    try:
+        sys.setrecursionlimit(low_limit)
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            result = markitdown.convert_stream(
+                io.BytesIO(html.encode("utf-8")),
+                file_extension=".html",
+            )
 
-        # Should have emitted a warning about the fallback
-        recursion_warnings = [x for x in w if "deeply nested" in str(x.message)]
-        assert len(recursion_warnings) > 0
+            # Should have emitted a warning about the fallback
+            recursion_warnings = [x for x in w if "deeply nested" in str(x.message)]
+            assert len(recursion_warnings) > 0
+    finally:
+        sys.setrecursionlimit(original_limit)
 
     # The output should contain the text content, not raw HTML
     assert "Deep content" in result.text_content

From c36cb78b1cd7434b231159e99c927ea9a8a1d1d7 Mon Sep 17 00:00:00 2001
From: jigangz <115519042+jigangz@users.noreply.github.com>
Date: Mon, 6 Apr 2026 12:42:01 -0700
Subject: [PATCH 3/3] test: use result.markdown instead of deprecated
 result.text_content

---
 packages/markitdown/tests/test_module_misc.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
index fceafe390..4d62e4919 100644
--- a/packages/markitdown/tests/test_module_misc.py
+++ b/packages/markitdown/tests/test_module_misc.py
@@ -332,10 +332,10 @@ def test_deeply_nested_html_fallback() -> None:
         sys.setrecursionlimit(original_limit)
 
     # The output should contain the text content, not raw HTML
-    assert "Deep content" in result.text_content
-    assert "bold text" in result.text_content
-    assert "<div" not in result.text_content
-    assert "<p>" not in result.text_content
+    assert "Deep content" in result.markdown
+    assert "bold text" in result.markdown
+    assert "<div" not in result.markdown
+    assert "<p>" not in result.markdown
 
 
 def test_doc_rlink() -> None: