From 5369a908f5e300a850c0d678616acd3304aa1917 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 19 May 2026 01:01:04 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=F0=9F=90=9B=20fix(mineru):=20drop=20empty?= =?UTF-8?q?=20table=20items=20to=20prevent=20analyze=20worker=20hard-failu?= =?UTF-8?q?re?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - add `_ir_table_body_has_content` helper to detect tables with no visible content - filter out misidentified table items in `_build_ir_table` before IR insertion - add defensive fallback in `analyze_multimodal` for sidecars with empty-bodied tables - add test covering absent, empty string, empty list, and blank-cell table bodies --- lightrag/external_parser/mineru/ir_builder.py | 33 +++++++++++++++++- lightrag/pipeline.py | 14 ++++++++ .../external_parser/mineru/test_ir_builder.py | 34 +++++++++++++++++++ 3 files changed, 80 insertions(+), 1 deletion(-) diff --git a/lightrag/external_parser/mineru/ir_builder.py b/lightrag/external_parser/mineru/ir_builder.py index 7e6654d39a..e9ecbfbd44 100644 --- a/lightrag/external_parser/mineru/ir_builder.py +++ b/lightrag/external_parser/mineru/ir_builder.py @@ -354,6 +354,11 @@ def _merge_heading_as_body(heading: str, level: int) -> None: if item_type == "table": table = self._build_ir_table(item) + if table is None: + # Empty body — _build_ir_table already logged the drop. + # Skip placeholder allocation and position recording so + # the misidentified item leaves no trace in the IR. + continue placeholder = _next_key("tb") table.placeholder_key = placeholder table.self_ref = _content_list_self_ref(item_index) @@ -407,7 +412,7 @@ def _merge_heading_as_body(heading: str, level: int) -> None: # Tables / drawings # ------------------------------------------------------------------ - def _build_ir_table(self, item: dict) -> IRTable: + def _build_ir_table(self, item: dict) -> IRTable | None: rows: list[list[str]] | None = None html: str | None = None body_field = item.get("rows") @@ -433,6 +438,20 @@ def _build_ir_table(self, item: dict) -> IRTable: else: html = json.dumps(body, ensure_ascii=False) + # MinerU occasionally emits table items with no usable body (e.g. when + # a page number or blank region is misidentified as a table). Dropping + # them here keeps the sidecar free of items that would later trip the + # analyze worker's "missing table content" hard-failure path. + if not _ir_table_body_has_content(rows, html): + logger.debug( + "[mineru_ir_builder] dropping empty table item " + "(body type=%s, num_rows=%s, num_cols=%s)", + type(body).__name__, + item.get("num_rows"), + item.get("num_cols"), + ) + return None + num_rows = int(item.get("num_rows") or (len(rows) if rows else 0) or 0) num_cols_default = max((len(r) for r in rows), default=0) if rows else 0 num_cols = int(item.get("num_cols") or num_cols_default or 0) @@ -577,6 +596,18 @@ def _normalize_grid(grid: Any) -> list[list[str]]: return out +def _ir_table_body_has_content(rows: list[list[str]] | None, html: str | None) -> bool: + """True iff the parsed table body carries any visible cell text or HTML.""" + if html and html.strip(): + return True + if rows: + for row in rows: + for cell in row: + if isinstance(cell, str) and cell.strip(): + return True + return False + + def _is_block_equation(item: dict) -> bool: """Heuristic: MinerU's ``text_format`` distinguishes block vs inline. diff --git a/lightrag/pipeline.py b/lightrag/pipeline.py index 085e631f89..d1101808a1 100644 --- a/lightrag/pipeline.py +++ b/lightrag/pipeline.py @@ -3728,6 +3728,20 @@ async def _analyze_text_modality( ) content_text = _normalize_text(item.get("content")) if not content_text: + if kind == "table": + # Defensive fallback for sidecars that still carry + # empty-bodied table items (e.g. produced by an older + # parser run, or by a parser that doesn't filter + # MinerU-style misidentified blanks). Don't abort the + # whole worker — record the skip and move on. + logger.warning( + f"[analyze_multimodal] table/{item_id}: missing " + f"table content; skipping analysis ({file_path})" + ) + return ( + _skipped_result("missing table content"), + None, + ) raise MultimodalAnalysisError( f"{kind}/{item_id}: missing {kind} content" ) diff --git a/tests/external_parser/mineru/test_ir_builder.py b/tests/external_parser/mineru/test_ir_builder.py index e916cabe3e..766efb51f7 100644 --- a/tests/external_parser/mineru/test_ir_builder.py +++ b/tests/external_parser/mineru/test_ir_builder.py @@ -457,6 +457,40 @@ def test_adapter_empty_equation_dropped(tmp_path: Path) -> None: assert any(b.content_template == "kept" for b in ir.blocks) +@pytest.mark.offline +def test_adapter_empty_table_dropped(tmp_path: Path) -> None: + """Table items with no usable body MUST NOT enter the IR. + + MinerU sometimes misidentifies a page-number / blank region as a table + and emits a body-less ``table`` item (missing ``table_body``/``rows``, + or with an empty string / empty grid). Leaving such items in the IR + would later trip the analyze worker's hard-failure path on empty + ``content``. The IR builder filters them upstream. + """ + raw = _write_bundle( + tmp_path, + [ + # 1) Body field completely absent. + {"type": "table", "num_rows": 0, "num_cols": 0}, + # 2) Empty string body (matches the real m012-manual.pdf bug). + {"type": "table", "table_body": ""}, + # 3) Empty list body. + {"type": "table", "rows": []}, + # 4) Grid with only blank cells. + {"type": "table", "rows": [["", " "], ["\t", ""]]}, + # 5) A real text item so the IR is not entirely empty. + {"type": "text", "text": "kept"}, + ], + ) + ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="t.pdf") + table_count = sum(len(b.tables) for b in ir.blocks) + assert table_count == 0 + # No table placeholder should leak into the rendered content either. + joined = "\n".join(b.content_template for b in ir.blocks) + assert "TBL:" not in joined + assert "kept" in joined + + @pytest.mark.offline def test_adapter_bbox_attributes_default_and_override(tmp_path: Path) -> None: raw = _write_bundle(tmp_path, [{"type": "text", "text": "x"}]) From 6fb0b51a6bb8bc5d297433b7ce185efa4bc11179 Mon Sep 17 00:00:00 2001 From: yangdx Date: Tue, 19 May 2026 01:10:00 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20refactor(docling):=20d?= =?UTF-8?q?rop=20empty=20tables=20from=20IR=20to=20prevent=20analyze=20wor?= =?UTF-8?q?ker=20failures?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - add `_table_rows_have_content` helper to detect visible cell text - return `None` from `_build_ir_table` when table has no content - skip placeholder allocation in `_handle_table` for dropped tables - mirror existing MinerU-side empty table filter behavior - add test covering four shapes of empty table input --- .../external_parser/docling/ir_builder.py | 31 +++++++- .../docling/test_ir_builder.py | 73 +++++++++++++++++++ 2 files changed, 103 insertions(+), 1 deletion(-) diff --git a/lightrag/external_parser/docling/ir_builder.py b/lightrag/external_parser/docling/ir_builder.py index 06b24b593d..7e1c4428f0 100644 --- a/lightrag/external_parser/docling/ir_builder.py +++ b/lightrag/external_parser/docling/ir_builder.py @@ -447,6 +447,11 @@ def _bump_has_body() -> None: def _handle_table(item: dict) -> None: table = _build_ir_table(item, ref_index) + if table is None: + # Empty body — _build_ir_table already logged the drop. + # Skip placeholder allocation and position recording so the + # body-less table item leaves no trace in the IR. + return placeholder = _next_key("tb") table.placeholder_key = placeholder cb_tables.append(table) @@ -697,13 +702,28 @@ def _resolve_text_refs(refs: Any, ref_index: dict[str, dict]) -> list[str]: def _build_ir_table( item: dict, ref_index: dict[str, dict], -) -> IRTable: +) -> IRTable | None: data = item.get("data") or {} grid = data.get("grid") if isinstance(data, dict) else None rows = _rows_from_grid(grid) if not rows and isinstance(data, dict) and data.get("table_cells"): rows = _rows_from_table_cells(data) + # Docling never populates IRTable.html, so a table without visible row + # content would land in the sidecar as ``content=""`` and trip the + # analyze worker's "missing table content" path (mirrors the MinerU + # filter in lightrag/external_parser/mineru/ir_builder.py). Drop the + # item up here so the IR stays clean. + if not _table_rows_have_content(rows): + logger.info( + "[docling_ir_builder] dropping empty table item " + "(self_ref=%s, num_rows=%s, num_cols=%s)", + item.get("self_ref"), + data.get("num_rows") if isinstance(data, dict) else None, + data.get("num_cols") if isinstance(data, dict) else None, + ) + return None + num_rows = ( int(data.get("num_rows") or len(rows) or 0) if isinstance(data, dict) @@ -741,6 +761,15 @@ def _build_ir_table( ) +def _table_rows_have_content(rows: list[list[str]]) -> bool: + """True iff at least one cell carries visible text.""" + for row in rows: + for cell in row: + if isinstance(cell, str) and cell.strip(): + return True + return False + + def _rows_from_grid(grid: Any) -> list[list[str]]: out: list[list[str]] = [] if not isinstance(grid, list): diff --git a/tests/external_parser/docling/test_ir_builder.py b/tests/external_parser/docling/test_ir_builder.py index 2861dea69c..6797bb15cf 100644 --- a/tests/external_parser/docling/test_ir_builder.py +++ b/tests/external_parser/docling/test_ir_builder.py @@ -486,6 +486,79 @@ def test_docling_adapter_table_grid_and_header(tmp_path: Path) -> None: assert table.self_ref == "#/tables/0" +def test_docling_adapter_empty_table_dropped(tmp_path: Path) -> None: + """Table items with no usable body MUST NOT enter the IR. + + Docling never populates ``IRTable.html``, so a body-less table would + land in the sidecar as ``content=""`` and trip the analyze worker's + "missing table content" path. Mirrors the MinerU-side filter in + lightrag/external_parser/mineru/ir_builder.py. + """ + # Four shapes of "no visible content" — all must be dropped. + tables = [ + # 1) ``data`` missing entirely. + {"self_ref": "#/tables/0", "label": "table", "content_layer": "body"}, + # 2) Empty grid. + { + "self_ref": "#/tables/1", + "label": "table", + "content_layer": "body", + "data": {"num_rows": 0, "num_cols": 0, "grid": []}, + }, + # 3) Grid with only blank cell text. + { + "self_ref": "#/tables/2", + "label": "table", + "content_layer": "body", + "data": { + "num_rows": 1, + "num_cols": 2, + "grid": [[{"text": ""}, {"text": " "}]], + }, + }, + # 4) table_cells fallback yields a blank grid. + { + "self_ref": "#/tables/3", + "label": "table", + "content_layer": "body", + "data": { + "num_rows": 1, + "num_cols": 1, + "table_cells": [ + { + "text": "", + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + } + ], + }, + }, + ] + texts = [_text_item(label="text", text="kept", self_ref="#/texts/0")] + raw_dir = _write_doc( + tmp_path, + _doc( + body_children=[ + "#/tables/0", + "#/tables/1", + "#/tables/2", + "#/tables/3", + "#/texts/0", + ], + texts=texts, + tables=tables, + ), + ) + ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf") + table_count = sum(len(b.tables) for b in ir.blocks) + assert table_count == 0 + joined = "\n".join(b.content_template for b in ir.blocks) + assert "TBL:" not in joined + assert "kept" in joined + + def test_docling_adapter_table_extras_is_empty(tmp_path: Path) -> None: """`IRTable.extras` is intentionally left blank by the docling adapter: the historical ``parent`` / ``children_refs`` / ``references`` /