Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion lightrag/external_parser/docling/ir_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,11 @@ def _bump_has_body() -> None:

def _handle_table(item: dict) -> None:
table = _build_ir_table(item, ref_index)
if table is None:
# Empty body — _build_ir_table already logged the drop.
# Skip placeholder allocation and position recording so the
# body-less table item leaves no trace in the IR.
return
placeholder = _next_key("tb")
table.placeholder_key = placeholder
cb_tables.append(table)
Expand Down Expand Up @@ -697,13 +702,28 @@ def _resolve_text_refs(refs: Any, ref_index: dict[str, dict]) -> list[str]:
def _build_ir_table(
item: dict,
ref_index: dict[str, dict],
) -> IRTable:
) -> IRTable | None:
data = item.get("data") or {}
grid = data.get("grid") if isinstance(data, dict) else None
rows = _rows_from_grid(grid)
if not rows and isinstance(data, dict) and data.get("table_cells"):
rows = _rows_from_table_cells(data)

# Docling never populates IRTable.html, so a table without visible row
# content would land in the sidecar as ``content=""`` and trip the
# analyze worker's "missing table content" path (mirrors the MinerU
# filter in lightrag/external_parser/mineru/ir_builder.py). Drop the
# item up here so the IR stays clean.
if not _table_rows_have_content(rows):
logger.info(
"[docling_ir_builder] dropping empty table item "
"(self_ref=%s, num_rows=%s, num_cols=%s)",
item.get("self_ref"),
data.get("num_rows") if isinstance(data, dict) else None,
data.get("num_cols") if isinstance(data, dict) else None,
)
return None

num_rows = (
int(data.get("num_rows") or len(rows) or 0)
if isinstance(data, dict)
Expand Down Expand Up @@ -741,6 +761,15 @@ def _build_ir_table(
)


def _table_rows_have_content(rows: list[list[str]]) -> bool:
"""True iff at least one cell carries visible text."""
for row in rows:
for cell in row:
if isinstance(cell, str) and cell.strip():
return True
return False


def _rows_from_grid(grid: Any) -> list[list[str]]:
out: list[list[str]] = []
if not isinstance(grid, list):
Expand Down
33 changes: 32 additions & 1 deletion lightrag/external_parser/mineru/ir_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,11 @@ def _merge_heading_as_body(heading: str, level: int) -> None:

if item_type == "table":
table = self._build_ir_table(item)
if table is None:
# Empty body — _build_ir_table already logged the drop.
# Skip placeholder allocation and position recording so
# the misidentified item leaves no trace in the IR.
continue
placeholder = _next_key("tb")
table.placeholder_key = placeholder
table.self_ref = _content_list_self_ref(item_index)
Expand Down Expand Up @@ -407,7 +412,7 @@ def _merge_heading_as_body(heading: str, level: int) -> None:
# Tables / drawings
# ------------------------------------------------------------------

def _build_ir_table(self, item: dict) -> IRTable:
def _build_ir_table(self, item: dict) -> IRTable | None:
rows: list[list[str]] | None = None
html: str | None = None
body_field = item.get("rows")
Expand All @@ -433,6 +438,20 @@ def _build_ir_table(self, item: dict) -> IRTable:
else:
html = json.dumps(body, ensure_ascii=False)

# MinerU occasionally emits table items with no usable body (e.g. when
# a page number or blank region is misidentified as a table). Dropping
# them here keeps the sidecar free of items that would later trip the
# analyze worker's "missing table content" hard-failure path.
if not _ir_table_body_has_content(rows, html):
logger.debug(
"[mineru_ir_builder] dropping empty table item "
"(body type=%s, num_rows=%s, num_cols=%s)",
type(body).__name__,
item.get("num_rows"),
item.get("num_cols"),
)
return None

num_rows = int(item.get("num_rows") or (len(rows) if rows else 0) or 0)
num_cols_default = max((len(r) for r in rows), default=0) if rows else 0
num_cols = int(item.get("num_cols") or num_cols_default or 0)
Expand Down Expand Up @@ -577,6 +596,18 @@ def _normalize_grid(grid: Any) -> list[list[str]]:
return out


def _ir_table_body_has_content(rows: list[list[str]] | None, html: str | None) -> bool:
"""True iff the parsed table body carries any visible cell text or HTML."""
if html and html.strip():
return True
if rows:
for row in rows:
for cell in row:
if isinstance(cell, str) and cell.strip():
return True
return False


def _is_block_equation(item: dict) -> bool:
"""Heuristic: MinerU's ``text_format`` distinguishes block vs inline.

Expand Down
14 changes: 14 additions & 0 deletions lightrag/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -3728,6 +3728,20 @@ async def _analyze_text_modality(
)
content_text = _normalize_text(item.get("content"))
if not content_text:
if kind == "table":
# Defensive fallback for sidecars that still carry
# empty-bodied table items (e.g. produced by an older
# parser run, or by a parser that doesn't filter
# MinerU-style misidentified blanks). Don't abort the
# whole worker — record the skip and move on.
logger.warning(
f"[analyze_multimodal] table/{item_id}: missing "
f"table content; skipping analysis ({file_path})"
)
return (
_skipped_result("missing table content"),
None,
)
raise MultimodalAnalysisError(
f"{kind}/{item_id}: missing {kind} content"
)
Expand Down
73 changes: 73 additions & 0 deletions tests/external_parser/docling/test_ir_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,79 @@ def test_docling_adapter_table_grid_and_header(tmp_path: Path) -> None:
assert table.self_ref == "#/tables/0"


def test_docling_adapter_empty_table_dropped(tmp_path: Path) -> None:
    """Docling tables without any visible body text must stay out of the IR.

    The docling adapter never fills ``IRTable.html``, so a table with no
    row text would be written to the sidecar as ``content=""`` and hit the
    analyze worker's "missing table content" path. Same intent as the
    MinerU-side filter in lightrag/external_parser/mineru/ir_builder.py.
    """
    shared = {"label": "table", "content_layer": "body"}

    # Shape 1: no ``data`` key at all.
    no_data = {"self_ref": "#/tables/0", **shared}
    # Shape 2: ``data`` present but the grid is empty.
    empty_grid = {
        "self_ref": "#/tables/1",
        **shared,
        "data": {"num_rows": 0, "num_cols": 0, "grid": []},
    }
    # Shape 3: the grid exists but every cell is blank.
    blank_grid = {
        "self_ref": "#/tables/2",
        **shared,
        "data": {
            "num_rows": 1,
            "num_cols": 2,
            "grid": [[{"text": ""}, {"text": " "}]],
        },
    }
    # Shape 4: no grid; the ``table_cells`` fallback only yields blank text.
    blank_cells = {
        "self_ref": "#/tables/3",
        **shared,
        "data": {
            "num_rows": 1,
            "num_cols": 1,
            "table_cells": [
                {
                    "text": "",
                    "start_row_offset_idx": 0,
                    "end_row_offset_idx": 1,
                    "start_col_offset_idx": 0,
                    "end_col_offset_idx": 1,
                }
            ],
        },
    }
    tables = [no_data, empty_grid, blank_grid, blank_cells]

    # One real text item so the resulting IR is not empty.
    texts = [_text_item(label="text", text="kept", self_ref="#/texts/0")]
    table_refs = ["#/tables/0", "#/tables/1", "#/tables/2", "#/tables/3"]
    document = _doc(
        body_children=[*table_refs, "#/texts/0"],
        texts=texts,
        tables=tables,
    )
    raw_dir = _write_doc(tmp_path, document)

    ir = DoclingIRBuilder().normalize_from_workdir(raw_dir, document_name="demo.pdf")

    total_tables = 0
    templates = []
    for block in ir.blocks:
        total_tables += len(block.tables)
        templates.append(block.content_template)
    assert total_tables == 0

    # No table placeholder may leak into the rendered content, and the
    # surviving text item must still be present.
    rendered = "\n".join(templates)
    assert "TBL:" not in rendered
    assert "kept" in rendered


def test_docling_adapter_table_extras_is_empty(tmp_path: Path) -> None:
"""`IRTable.extras` is intentionally left blank by the docling adapter:
the historical ``parent`` / ``children_refs`` / ``references`` /
Expand Down
34 changes: 34 additions & 0 deletions tests/external_parser/mineru/test_ir_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,40 @@ def test_adapter_empty_equation_dropped(tmp_path: Path) -> None:
assert any(b.content_template == "kept" for b in ir.blocks)


@pytest.mark.offline
def test_adapter_empty_table_dropped(tmp_path: Path) -> None:
    """MinerU table items without a usable body must stay out of the IR.

    MinerU can misidentify a page number or blank region as a table and
    emit a ``table`` item with no body (``table_body``/``rows`` missing, an
    empty string, or an empty/blank grid). Such items would later hit the
    analyze worker's hard-failure path on empty ``content``, so the IR
    builder is expected to filter them upstream.
    """
    body_less_tables = [
        # Body field completely absent.
        {"type": "table", "num_rows": 0, "num_cols": 0},
        # Empty string body (matches the real m012-manual.pdf bug).
        {"type": "table", "table_body": ""},
        # Empty list body.
        {"type": "table", "rows": []},
        # Grid whose cells are all blank.
        {"type": "table", "rows": [["", " "], ["\t", ""]]},
    ]
    # A real text item so the IR is not entirely empty.
    survivor = {"type": "text", "text": "kept"}

    raw = _write_bundle(tmp_path, [*body_less_tables, survivor])
    ir = MinerUIRBuilder().normalize_from_workdir(raw, document_name="t.pdf")

    total_tables = 0
    templates = []
    for block in ir.blocks:
        total_tables += len(block.tables)
        templates.append(block.content_template)
    assert total_tables == 0

    # No table placeholder should leak into the rendered content either.
    rendered = "\n".join(templates)
    assert "TBL:" not in rendered
    assert "kept" in rendered


@pytest.mark.offline
def test_adapter_bbox_attributes_default_and_override(tmp_path: Path) -> None:
raw = _write_bundle(tmp_path, [{"type": "text", "text": "x"}])
Expand Down
Loading