From 31e927e0cea079b213560fb867715fa35fe47ef8 Mon Sep 17 00:00:00 2001 From: drr00t Date: Sun, 17 May 2026 17:10:05 -0300 Subject: [PATCH 1/8] feat: initial multi-extension loader --- graphrag_sdk/pyproject.toml | 2 + graphrag_sdk/src/graphrag_sdk/api/main.py | 15 ++ .../ingestion/loaders/__init__.py | 17 +- .../ingestion/loaders/csv_loader.py | 12 ++ .../ingestion/loaders/docling_base.py | 153 ++++++++++++++++++ .../ingestion/loaders/docx_loader.py | 12 ++ .../ingestion/loaders/html_loader.py | 12 ++ .../ingestion/loaders/pptx_loader.py | 12 ++ .../ingestion/loaders/xlsx_loader.py | 12 ++ graphrag_sdk/tests/test_docling_loaders.py | 123 ++++++++++++++ 10 files changed, 369 insertions(+), 1 deletion(-) create mode 100644 graphrag_sdk/src/graphrag_sdk/ingestion/loaders/csv_loader.py create mode 100644 graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docling_base.py create mode 100644 graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docx_loader.py create mode 100644 graphrag_sdk/src/graphrag_sdk/ingestion/loaders/html_loader.py create mode 100644 graphrag_sdk/src/graphrag_sdk/ingestion/loaders/pptx_loader.py create mode 100644 graphrag_sdk/src/graphrag_sdk/ingestion/loaders/xlsx_loader.py create mode 100644 graphrag_sdk/tests/test_docling_loaders.py diff --git a/graphrag_sdk/pyproject.toml b/graphrag_sdk/pyproject.toml index be8597fd..042dff15 100644 --- a/graphrag_sdk/pyproject.toml +++ b/graphrag_sdk/pyproject.toml @@ -56,6 +56,7 @@ litellm = ["litellm>=1.83.0,<2.0"] openrouter = ["openai>=1.0,<2.0"] fastcoref = ["fastcoref>=2.0"] spacy = ["spacy>=3.0"] +docling = ["docling>=2.91.0"] all = [ "openai>=1.0,<2.0", "anthropic>=0.20,<1.0", @@ -63,6 +64,7 @@ all = [ "sentence-transformers>=2.0", "pypdf>=6.9.2", "litellm>=1.83.0,<2.0", + "docling>=2.0.0", ] dev = [ "pytest>=8.0", diff --git a/graphrag_sdk/src/graphrag_sdk/api/main.py b/graphrag_sdk/src/graphrag_sdk/api/main.py index 9b2486c6..e20c90c9 100644 --- a/graphrag_sdk/src/graphrag_sdk/api/main.py +++ 
b/graphrag_sdk/src/graphrag_sdk/api/main.py @@ -38,6 +38,11 @@ from graphrag_sdk.ingestion.loaders.markdown_loader import MarkdownLoader from graphrag_sdk.ingestion.loaders.pdf_loader import PdfLoader from graphrag_sdk.ingestion.loaders.text_loader import TextLoader +from graphrag_sdk.ingestion.loaders.docx_loader import DocxLoader +from graphrag_sdk.ingestion.loaders.xlsx_loader import XlsxLoader +from graphrag_sdk.ingestion.loaders.pptx_loader import PptxLoader +from graphrag_sdk.ingestion.loaders.html_loader import HtmlLoader +from graphrag_sdk.ingestion.loaders.csv_loader import CsvLoader from graphrag_sdk.ingestion.pipeline import IngestionPipeline from graphrag_sdk.ingestion.resolution_strategies.base import ResolutionStrategy from graphrag_sdk.ingestion.resolution_strategies.exact_match import ExactMatchResolution @@ -632,6 +637,16 @@ def _default_loader_for(source: str) -> LoaderStrategy: return PdfLoader() if lower.endswith(".md"): return MarkdownLoader() + if lower.endswith(".docx"): + return DocxLoader() + if lower.endswith(".xlsx"): + return XlsxLoader() + if lower.endswith(".pptx"): + return PptxLoader() + if lower.endswith(".html") or lower.endswith(".xhtml"): + return HtmlLoader() + if lower.endswith(".csv"): + return CsvLoader() return TextLoader() # ── Incremental Updates ───────────────────────────────────── diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/__init__.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/__init__.py index 9a781bca..8be32cbf 100644 --- a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/__init__.py +++ b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/__init__.py @@ -4,5 +4,20 @@ from graphrag_sdk.ingestion.loaders.markdown_loader import MarkdownLoader from graphrag_sdk.ingestion.loaders.pdf_loader import PdfLoader from graphrag_sdk.ingestion.loaders.text_loader import TextLoader +from graphrag_sdk.ingestion.loaders.docx_loader import DocxLoader +from graphrag_sdk.ingestion.loaders.xlsx_loader import 
XlsxLoader +from graphrag_sdk.ingestion.loaders.pptx_loader import PptxLoader +from graphrag_sdk.ingestion.loaders.html_loader import HtmlLoader +from graphrag_sdk.ingestion.loaders.csv_loader import CsvLoader -__all__ = ["LoaderStrategy", "MarkdownLoader", "PdfLoader", "TextLoader"] +__all__ = [ + "LoaderStrategy", + "MarkdownLoader", + "PdfLoader", + "TextLoader", + "DocxLoader", + "XlsxLoader", + "PptxLoader", + "HtmlLoader", + "CsvLoader", +] diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/csv_loader.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/csv_loader.py new file mode 100644 index 00000000..7bb7802d --- /dev/null +++ b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/csv_loader.py @@ -0,0 +1,12 @@ +# GraphRAG SDK — Ingestion: CSV Loader +# Pattern: Strategy + +from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader + + +class CsvLoader(DoclingBaseLoader): + """Load text and structural elements from a CSV file using Docling.""" + + @property + def extension_name(self) -> str: + return "csv" diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docling_base.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docling_base.py new file mode 100644 index 00000000..12d2ad3d --- /dev/null +++ b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docling_base.py @@ -0,0 +1,153 @@ +# GraphRAG SDK — Ingestion: Docling Base Loader +# Pattern: Strategy Base Class + +from __future__ import annotations + +import asyncio +import logging +from pathlib import Path +from typing import Any + +from graphrag_sdk.core.context import Context +from graphrag_sdk.core.exceptions import LoaderError +from graphrag_sdk.core.models import DocumentElement, DocumentInfo, DocumentOutput +from graphrag_sdk.ingestion.loaders.base import LoaderStrategy + +logger = logging.getLogger(__name__) + + +class DoclingBaseLoader(LoaderStrategy): + """Base loader using docling for advanced document parsing. 
+ + Subclasses should define the `extension_name` property. + """ + + def __init__(self, **docling_kwargs: Any) -> None: + """Initialize the loader. + + Args: + **docling_kwargs: Arbitrary keyword arguments passed to + `docling.document_converter.DocumentConverter` (e.g., pipeline_options). + """ + self.docling_kwargs = docling_kwargs + + @property + def extension_name(self) -> str: + return "unknown" + + async def load(self, source: str, ctx: Context) -> DocumentOutput: + ctx.log(f"Loading {self.extension_name.upper()} file via docling: {source}") + # Run synchronous docling extraction in a non-blocking thread + return await asyncio.to_thread(self._load_sync, source) + + def _load_sync(self, source: str) -> DocumentOutput: + path = Path(source) + if not path.exists(): + raise LoaderError(f"File not found: {source}") + + try: + from docling.document_converter import DocumentConverter + from docling.datamodel.document import DocItemLabel + except ImportError: + raise LoaderError( + f"{self.extension_name.upper()} parsing requires 'docling'. 
Install with:\n" + " pip install graphrag-sdk[docling]" + ) + + try: + converter = DocumentConverter(**self.docling_kwargs) + result = converter.convert(source) + doc = result.document + except Exception as exc: + raise LoaderError(f"Docling failed to process {source}: {exc}") from exc + + elements: list[DocumentElement] = [] + current_breadcrumbs: list[tuple[int, str]] = [] + full_text_blocks = [] + + # Map docling hierarchy to GraphRAG DocumentElements + for item, level in doc.iterate_items(): + content = getattr(item, "text", "") + if not content and hasattr(item, "export_to_markdown"): + try: + content = item.export_to_markdown() + except Exception: + pass + + if not content: + continue + + full_text_blocks.append(content) + label = getattr(item, "label", None) + + if label in (DocItemLabel.TITLE, DocItemLabel.SECTION_HEADER): + # Update breadcrumbs + while current_breadcrumbs and current_breadcrumbs[-1][0] >= level: + current_breadcrumbs.pop() + current_breadcrumbs.append((level, content)) + + elements.append( + DocumentElement( + type="header", + level=level, + content=content, + breadcrumbs=[b[1] for b in current_breadcrumbs], + ) + ) + elif label in (DocItemLabel.PARAGRAPH, DocItemLabel.TEXT): + elements.append( + DocumentElement( + type="paragraph", + content=content, + breadcrumbs=[b[1] for b in current_breadcrumbs], + ) + ) + elif label == DocItemLabel.LIST_ITEM: + elements.append( + DocumentElement( + type="list", + content=content, + breadcrumbs=[b[1] for b in current_breadcrumbs], + ) + ) + elif label == DocItemLabel.TABLE: + elements.append( + DocumentElement( + type="table", + content=content, + breadcrumbs=[b[1] for b in current_breadcrumbs], + ) + ) + elif label == DocItemLabel.CODE: + elements.append( + DocumentElement( + type="code", + content=content, + breadcrumbs=[b[1] for b in current_breadcrumbs], + ) + ) + else: + # Default for CAPTION, FOOTNOTE, etc. 
+ elements.append( + DocumentElement( + type="paragraph", + content=content, + breadcrumbs=[b[1] for b in current_breadcrumbs], + metadata={"label": label.value if hasattr(label, "value") else label}, + ) + ) + + full_text = "\n\n".join(full_text_blocks) + + return DocumentOutput( + text=full_text, + document_info=DocumentInfo( + path=str(path), + metadata={ + "size_bytes": path.stat().st_size, + "loader": self.extension_name, + "suffix": path.suffix, + }, + ), + elements=elements, + ) diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docx_loader.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docx_loader.py new file mode 100644 index 00000000..731a5149 --- /dev/null +++ b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docx_loader.py @@ -0,0 +1,12 @@ +# GraphRAG SDK — Ingestion: DOCX Loader +# Pattern: Strategy + +from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader + + +class DocxLoader(DoclingBaseLoader): + """Load text and structural elements from a DOCX file using Docling.""" + + @property + def extension_name(self) -> str: + return "docx" diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/html_loader.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/html_loader.py new file mode 100644 index 00000000..906b50a9 --- /dev/null +++ b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/html_loader.py @@ -0,0 +1,12 @@ +# GraphRAG SDK — Ingestion: HTML Loader +# Pattern: Strategy + +from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader + + +class HtmlLoader(DoclingBaseLoader): + """Load text and structural elements from an HTML/XHTML file using Docling.""" + + @property + def extension_name(self) -> str: + return "html" diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/pptx_loader.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/pptx_loader.py new file mode 100644 index 00000000..a2a6fcb0 --- /dev/null +++ b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/pptx_loader.py @@ -0,0 +1,12 @@ 
+# GraphRAG SDK — Ingestion: PPTX Loader +# Pattern: Strategy + +from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader + + +class PptxLoader(DoclingBaseLoader): + """Load text and structural elements from a PPTX file using Docling.""" + + @property + def extension_name(self) -> str: + return "pptx" diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/xlsx_loader.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/xlsx_loader.py new file mode 100644 index 00000000..3b09e9b8 --- /dev/null +++ b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/xlsx_loader.py @@ -0,0 +1,12 @@ +# GraphRAG SDK — Ingestion: XLSX Loader +# Pattern: Strategy + +from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader + + +class XlsxLoader(DoclingBaseLoader): + """Load text and structural elements from an XLSX file using Docling.""" + + @property + def extension_name(self) -> str: + return "xlsx" diff --git a/graphrag_sdk/tests/test_docling_loaders.py b/graphrag_sdk/tests/test_docling_loaders.py new file mode 100644 index 00000000..5aee0f66 --- /dev/null +++ b/graphrag_sdk/tests/test_docling_loaders.py @@ -0,0 +1,123 @@ +import pytest +from unittest.mock import MagicMock, patch +from graphrag_sdk.core.exceptions import LoaderError +from graphrag_sdk.core.models import DocumentOutput, DocumentInfo, DocumentElement +from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader + +class TestDoclingBaseLoader: + """Tests for DoclingBaseLoader and its derived loaders.""" + + def test_import_error_wrapped_in_loader_error(self, ctx, tmp_path, monkeypatch): + """Verify that ImportError when docling is missing is wrapped in LoaderError.""" + file = tmp_path / "test.docx" + file.write_text("dummy content") + + loader = DoclingBaseLoader() + loader.extension_name = "docx" + + # Mocking the import to raise ImportError + with patch("builtins.__import__", side_effect=lambda name, *args, **kwargs: + (exec('raise ImportError("module not found")') if 
name == "docling.document_converter" else None)): + with pytest.raises(LoaderError, match=r"DOCX parsing requires 'docling'"): + import asyncio + asyncio.run(loader.load(str(file), ctx)) + + async def test_label_mapping_and_metadata_preservation(self, ctx, tmp_path, monkeypatch): + """Verify mapping of DocItemLabel and preservation of labels in metadata for fallback cases.""" + from docling.datamodel.document import DocItemLabel + + file = tmp_path / "test.docx" + file.write_text("dummy content") + + # Mock Docling's result structure + mock_item_header = MagicMock() + mock_item_header.label = DocItemLabel.SECTION_HEADER + mock_item_header.text = "Header 1" + + mock_item_para = MagicMock() + mock_item_para.label = DocItemLabel.PARAGRAPH + mock_item_para.text = "Paragraph 1" + + mock_item_footnote = MagicMock() + mock_item_footnote.label = DocItemLabel.FOOTNOTE + mock_item_footnote.text = "Footnote content" + + mock_doc = MagicMock() + mock_doc.iterate_items.return_value = [ + (mock_item_header, 1), + (mock_item_para, 2), + (mock_item_footnote, 2), + ] + + mock_converter = MagicMock() + mock_converter.convert.return_value.document = mock_doc + + monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) + + loader = DoclingBaseLoader() + loader.extension_name = "docx" + result = await loader.load(str(file), ctx) + + elements = result.elements + assert len(elements) == 3 + assert elements[0].type == "header" + assert elements[0].content == "Header 1" + assert elements[1].type == "paragraph" + assert elements[1].content == "Paragraph 1" + # Check fallback and metadata preservation + assert elements[2].type == "paragraph" + assert elements[2].content == "Footnote content" + assert elements[2].metadata["label"] == DocItemLabel.FOOTNOTE.value + + async def test_breadcrumbs_construction(self, ctx, tmp_path, monkeypatch): + """Verify the breadcrumbs are built correctly following the header hierarchy.""" + file = tmp_path / 
"test.docx" + file.write_text("dummy content") + + from docling.datamodel.document import DocItemLabel + + # Hierarchy: H1 -> H2 -> P -> H2 (new) -> P + items = [ + (MagicMock(label=DocItemLabel.SECTION_HEADER, text="Root"), 1), + (MagicMock(label=DocItemLabel.SECTION_HEADER, text="Child"), 2), + (MagicMock(label=DocItemLabel.PARAGRAPH, text="Text 1"), 3), + (MagicMock(label=DocItemLabel.SECTION_HEADER, text="Sibling"), 2), + (MagicMock(label=DocItemLabel.PARAGRAPH, text="Text 2"), 3), + ] + # Fix texts since we use MagicMocks + for item, _ in items: + item.text = item.text if hasattr(item, 'text') else "" # not needed due to initialization above + + # Redefining properly with actual text + mock_items = [] + mock_items.append((MagicMock(label=DocItemLabel.SECTION_HEADER, text="Root"), 1)) + mock_items.append((MagicMock(label=DocItemLabel.SECTION_HEADER, text="Child"), 2)) + mock_items.append((MagicMock(label=DocItemLabel.PARAGRAPH, text="Text 1"), 3)) + mock_items.append((MagicMock(label=DocItemLabel.SECTION_HEADER, text="Sibling"), 2)) + mock_items.append((MagicMock(label=DocItemLabel.PARAGRAPH, text="Text 2"), 3)) + + mock_doc = MagicMock() + mock_doc.iterate_items.return_value = mock_items + mock_converter = MagicMock() + mock_converter.convert.return_value.document = mock_doc + monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) + + loader = DoclingBaseLoader() + loader.extension_name = "docx" + result = await loader.load(str(file), ctx) + + elements = result.elements + assert elements[0].breadcrumbs == ["Root"] + assert elements[1].breadcrumbs == ["Root", "Child"] + assert elements[2].breadcrumbs == ["Root", "Child"] + assert elements[3].breadcrumbs == ["Root", "Sibling"] + assert elements[4].breadcrumbs == ["Root", "Sibling"] + + @pytest.mark.skip(reason="Requires actual docling installation and sample files") + async def test_real_file_loading(self, ctx, tmp_path): + """Carga de arquivos reais ( Integration 
test ).""" + from graphrag_sdk.ingestion.loaders.docx_loader import DocxLoader + from graphrag_sdk.ingestion.loaders.xlsx_loader import XlsxLoader + + # This would require actual bytes of docx/xlsx etc. + pass From 6631d8e09cf5bd60f454f814684e7c5523f7da79 Mon Sep 17 00:00:00 2001 From: Gal Shubeli Date: Thu, 14 May 2026 16:24:35 +0300 Subject: [PATCH 2/8] feat(ingestion): default to SentenceTokenCapChunking in ingest()/update() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes the default chunker that ``GraphRAG.ingest()`` and ``GraphRAG.update()`` fall back to when the caller doesn't pass an explicit ``chunker=``. Was ``FixedSizeChunking()``; now ``SentenceTokenCapChunking()`` (sentence-aware, max_tokens=512, overlap_sentences=2 — the strategy's own defaults). Why --- ``FixedSizeChunking`` splits on a hard character window with no awareness of sentence, word, or paragraph boundaries. When the window cuts through an entity name, the per-chunk LLM extractor produces a stub entity for the fragment (``"Wayne Enterprises"`` → ``"Wayne En"`` in chunk N plus unparsable text in chunk N+1). These stubs never merge with their full forms during resolution because their embeddings differ enough that LLMVerifiedResolution scores them below the soft threshold. This silently inflates cypher counts and pollutes "which X" lists. The strategy that surfaced this — ``CypherFirstAggregationStrategy`` — was hitting a 6/7 ceiling on the internal aggregation benchmark with one question failing because of these stubs. Switching to ``SentenceTokenCapChunking`` cleared the benchmark to 7/7 stable across three runs, and the post-ingest graph state went from 11-14 organization nodes (including ``Glo`` / ``Initech System`` / ``Wayne En``) to exactly 10 clean orgs, and from 66-80 ``Person`` nodes (with ``Carla`` / ``Carla Okafor`` duplicates) to exactly 56 distinct persons — matching the corpus. 
A side benefit: sentence-aware chunks with 2-sentence overlap almost always keep a person's first mention in the same chunk as their later short-form references, so per-chunk FastCoref now binds ``Carla → Carla Okafor`` reliably. That eliminates the short-form-duplicate class too, not just the truncation stubs. Compatibility ------------- ``FixedSizeChunking`` remains exported and fully supported — callers who explicitly pass ``chunker=FixedSizeChunking()`` get unchanged behavior. Existing tests (748 passed, 24 skipped) pass without modification: no test in the suite asserts on chunk count or content shape from the default chunker, so switching defaults doesn't break the suite. Callers who relied on the previous default and want to keep it should pass ``chunker=FixedSizeChunking()`` explicitly. The docstrings call out the new default and reference ``FixedSizeChunking`` as the opt-in character-window alternative. Co-Authored-By: Claude Opus 4.7 (1M context) --- graphrag_sdk/src/graphrag_sdk/api/main.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/graphrag_sdk/src/graphrag_sdk/api/main.py b/graphrag_sdk/src/graphrag_sdk/api/main.py index e20c90c9..395cea1a 100644 --- a/graphrag_sdk/src/graphrag_sdk/api/main.py +++ b/graphrag_sdk/src/graphrag_sdk/api/main.py @@ -31,7 +31,9 @@ ) from graphrag_sdk.core.providers import Embedder, LLMInterface from graphrag_sdk.ingestion.chunking_strategies.base import ChunkingStrategy -from graphrag_sdk.ingestion.chunking_strategies.fixed_size import FixedSizeChunking +from graphrag_sdk.ingestion.chunking_strategies.sentence_token_cap import ( + SentenceTokenCapChunking, +) from graphrag_sdk.ingestion.extraction_strategies.base import ExtractionStrategy from graphrag_sdk.ingestion.extraction_strategies.graph_extraction import GraphExtraction from graphrag_sdk.ingestion.loaders.base import LoaderStrategy @@ -325,7 +327,10 @@ async def ingest( Uses sensible defaults for any unspecified strategy: - 
Loader: auto-detected from file extension (PDF or text) - - Chunker: FixedSizeChunking(chunk_size=1000) + - Chunker: SentenceTokenCapChunking(max_tokens=512, overlap_sentences=2) + — sentence-aware, never splits entity names at chunk boundaries. + Override with ``chunker=FixedSizeChunking(...)`` if you need + character-window chunking. - Extractor: GraphExtraction with configured LLM - Resolver: ExactMatchResolution @@ -534,7 +539,7 @@ async def _ingest_single( pipeline = IngestionPipeline( loader=loader or TextLoader(), - chunker=chunker or FixedSizeChunking(), + chunker=chunker or SentenceTokenCapChunking(), extractor=extractor or self._default_extractor(), resolver=resolver or ExactMatchResolution(), graph_store=self._graph_store, @@ -1025,7 +1030,7 @@ async def update( pipeline = IngestionPipeline( loader=loader or TextLoader(), # unused (text is provided below) - chunker=chunker or FixedSizeChunking(), + chunker=chunker or SentenceTokenCapChunking(), extractor=extractor or self._default_extractor(), resolver=resolver or ExactMatchResolution(), graph_store=self._graph_store, @@ -1286,7 +1291,7 @@ async def apply_changes( to ``ingest()`` and ``update()``). Defaults to per-extension auto-selection. ``deleted`` ignores this. chunker: Override the chunking strategy for ``added``/``modified``. - Defaults to ``FixedSizeChunking``. ``deleted`` ignores this. + Defaults to ``SentenceTokenCapChunking``. ``deleted`` ignores this. extractor: Override the entity-extraction strategy for ``added``/``modified``. ``deleted`` ignores this. 
resolver: Override the resolution strategy for ``added``/ From 9bfbab41de762bac172c809408aaa3758925801d Mon Sep 17 00:00:00 2001 From: drr00t Date: Mon, 18 May 2026 21:17:23 -0300 Subject: [PATCH 3/8] test(loader): more tests for docling-base loaders --- graphrag_sdk/src/graphrag_sdk/api/main.py | 8 +- .../ingestion/loaders/__init__.py | 8 +- .../ingestion/loaders/docling_base.py | 9 +- graphrag_sdk/tests/test_docling_loaders.py | 112 +++++++++++++----- 4 files changed, 96 insertions(+), 41 deletions(-) diff --git a/graphrag_sdk/src/graphrag_sdk/api/main.py b/graphrag_sdk/src/graphrag_sdk/api/main.py index 395cea1a..b267e380 100644 --- a/graphrag_sdk/src/graphrag_sdk/api/main.py +++ b/graphrag_sdk/src/graphrag_sdk/api/main.py @@ -37,14 +37,14 @@ from graphrag_sdk.ingestion.extraction_strategies.base import ExtractionStrategy from graphrag_sdk.ingestion.extraction_strategies.graph_extraction import GraphExtraction from graphrag_sdk.ingestion.loaders.base import LoaderStrategy +from graphrag_sdk.ingestion.loaders.csv_loader import CsvLoader +from graphrag_sdk.ingestion.loaders.docx_loader import DocxLoader +from graphrag_sdk.ingestion.loaders.html_loader import HtmlLoader from graphrag_sdk.ingestion.loaders.markdown_loader import MarkdownLoader from graphrag_sdk.ingestion.loaders.pdf_loader import PdfLoader +from graphrag_sdk.ingestion.loaders.pptx_loader import PptxLoader from graphrag_sdk.ingestion.loaders.text_loader import TextLoader -from graphrag_sdk.ingestion.loaders.docx_loader import DocxLoader from graphrag_sdk.ingestion.loaders.xlsx_loader import XlsxLoader -from graphrag_sdk.ingestion.loaders.pptx_loader import PptxLoader -from graphrag_sdk.ingestion.loaders.html_loader import HtmlLoader -from graphrag_sdk.ingestion.loaders.csv_loader import CsvLoader from graphrag_sdk.ingestion.pipeline import IngestionPipeline from graphrag_sdk.ingestion.resolution_strategies.base import ResolutionStrategy from graphrag_sdk.ingestion.resolution_strategies.exact_match 
import ExactMatchResolution diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/__init__.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/__init__.py index 8be32cbf..01001905 100644 --- a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/__init__.py +++ b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/__init__.py @@ -1,14 +1,14 @@ # GraphRAG SDK — Ingestion: Loaders from graphrag_sdk.ingestion.loaders.base import LoaderStrategy +from graphrag_sdk.ingestion.loaders.csv_loader import CsvLoader +from graphrag_sdk.ingestion.loaders.docx_loader import DocxLoader +from graphrag_sdk.ingestion.loaders.html_loader import HtmlLoader from graphrag_sdk.ingestion.loaders.markdown_loader import MarkdownLoader from graphrag_sdk.ingestion.loaders.pdf_loader import PdfLoader +from graphrag_sdk.ingestion.loaders.pptx_loader import PptxLoader from graphrag_sdk.ingestion.loaders.text_loader import TextLoader -from graphrag_sdk.ingestion.loaders.docx_loader import DocxLoader from graphrag_sdk.ingestion.loaders.xlsx_loader import XlsxLoader -from graphrag_sdk.ingestion.loaders.pptx_loader import PptxLoader -from graphrag_sdk.ingestion.loaders.html_loader import HtmlLoader -from graphrag_sdk.ingestion.loaders.csv_loader import CsvLoader __all__ = [ "LoaderStrategy", diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docling_base.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docling_base.py index 12d2ad3d..e03b314a 100644 --- a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docling_base.py +++ b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docling_base.py @@ -18,7 +18,7 @@ class DoclingBaseLoader(LoaderStrategy): """Base loader using docling for advanced document parsing. - + Subclasses should define the `extension_name` property. """ @@ -26,8 +26,9 @@ def __init__(self, **docling_kwargs: Any) -> None: """Initialize the loader. 
Args: - **docling_kwargs: Arbitrary keyword arguments passed to - `docling.document_converter.DocumentConverter` (e.g., pipeline_options). + **docling_kwargs: Arbitrary keyword arguments passed to + `docling.document_converter.DocumentConverter` (e.g., + pipeline_options). """ self.docling_kwargs = docling_kwargs @@ -46,8 +47,8 @@ def _load_sync(self, source: str) -> DocumentOutput: raise LoaderError(f"File not found: {source}") try: - from docling.document_converter import DocumentConverter from docling.datamodel.document import DocItemLabel + from docling.document_converter import DocumentConverter except ImportError: raise LoaderError( f"{self.extension_name.upper()} parsing requires 'docling'. Install with:\n" diff --git a/graphrag_sdk/tests/test_docling_loaders.py b/graphrag_sdk/tests/test_docling_loaders.py index 5aee0f66..6ad75aa8 100644 --- a/graphrag_sdk/tests/test_docling_loaders.py +++ b/graphrag_sdk/tests/test_docling_loaders.py @@ -4,23 +4,28 @@ from graphrag_sdk.core.models import DocumentOutput, DocumentInfo, DocumentElement from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader +class MockDocxLoader(DoclingBaseLoader): + @property + def extension_name(self) -> str: + return "docx" + class TestDoclingBaseLoader: """Tests for DoclingBaseLoader and its derived loaders.""" - def test_import_error_wrapped_in_loader_error(self, ctx, tmp_path, monkeypatch): + @pytest.mark.asyncio + async def test_import_error_wrapped_in_loader_error(self, ctx, tmp_path, monkeypatch): """Verify that ImportError when docling is missing is wrapped in LoaderError.""" file = tmp_path / "test.docx" file.write_text("dummy content") - loader = DoclingBaseLoader() - loader.extension_name = "docx" + loader = MockDocxLoader() # Mocking the import to raise ImportError - with patch("builtins.__import__", side_effect=lambda name, *args, **kwargs: + with patch("builtins.__import__", side_effect=lambda name, *args, **kwargs: (exec('raise ImportError("module not 
found")') if name == "docling.document_converter" else None)): with pytest.raises(LoaderError, match=r"DOCX parsing requires 'docling'"): - import asyncio - asyncio.run(loader.load(str(file), ctx)) + await loader.load(str(file), ctx) + async def test_label_mapping_and_metadata_preservation(self, ctx, tmp_path, monkeypatch): """Verify mapping of DocItemLabel and preservation of labels in metadata for fallback cases.""" @@ -54,8 +59,7 @@ async def test_label_mapping_and_metadata_preservation(self, ctx, tmp_path, monk monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) - loader = DoclingBaseLoader() - loader.extension_name = "docx" + loader = MockDocxLoader() result = await loader.load(str(file), ctx) elements = result.elements @@ -77,18 +81,6 @@ async def test_breadcrumbs_construction(self, ctx, tmp_path, monkeypatch): from docling.datamodel.document import DocItemLabel # Hierarchy: H1 -> H2 -> P -> H2 (new) -> P - items = [ - (MagicMock(label=DocItemLabel.SECTION_HEADER, text="Root"), 1), - (MagicMock(label=DocItemLabel.SECTION_HEADER, text="Child"), 2), - (MagicMock(label=DocItemLabel.PARAGRAPH, text="Text 1"), 3), - (MagicMock(label=DocItemLabel.SECTION_HEADER, text="Sibling"), 2), - (MagicMock(label=DocItemLabel.PARAGRAPH, text="Text 2"), 3), - ] - # Fix texts since we use MagicMocks - for item, _ in items: - item.text = item.text if hasattr(item, 'text') else "" # not needed due to initialization above - - # Redefining properly with actual text mock_items = [] mock_items.append((MagicMock(label=DocItemLabel.SECTION_HEADER, text="Root"), 1)) mock_items.append((MagicMock(label=DocItemLabel.SECTION_HEADER, text="Child"), 2)) @@ -102,8 +94,7 @@ async def test_breadcrumbs_construction(self, ctx, tmp_path, monkeypatch): mock_converter.convert.return_value.document = mock_doc monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) - loader = DoclingBaseLoader() - 
loader.extension_name = "docx" + loader = MockDocxLoader() result = await loader.load(str(file), ctx) elements = result.elements @@ -113,11 +104,74 @@ async def test_breadcrumbs_construction(self, ctx, tmp_path, monkeypatch): assert elements[3].breadcrumbs == ["Root", "Sibling"] assert elements[4].breadcrumbs == ["Root", "Sibling"] - @pytest.mark.skip(reason="Requires actual docling installation and sample files") - async def test_real_file_loading(self, ctx, tmp_path): - """Carga de arquivos reais ( Integration test ).""" - from graphrag_sdk.ingestion.loaders.docx_loader import DocxLoader - from graphrag_sdk.ingestion.loaders.xlsx_loader import XlsxLoader + async def test_file_not_found(self, ctx): + """Verify that LoaderError is raised when the file does not exist.""" + loader = MockDocxLoader() + with pytest.raises(LoaderError, match="File not found"): + await loader.load("/non/existent/path.docx", ctx) + + async def test_docling_conversion_failure(self, ctx, tmp_path, monkeypatch): + """Verify that exceptions during docling conversion are wrapped in LoaderError.""" + file = tmp_path / "test.docx" + file.write_text("dummy content") + + mock_converter = MagicMock() + mock_converter.convert.side_effect = Exception("Conversion failed") + monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) + + loader = MockDocxLoader() + with pytest.raises(LoaderError, match="Docling failed to process"): + await loader.load(str(file), ctx) + + async def test_export_to_markdown_fallback(self, ctx, tmp_path, monkeypatch): + """Verify fallback to export_to_markdown when text attribute is empty.""" + file = tmp_path / "test.docx" + file.write_text("dummy content") + + from docling.datamodel.document import DocItemLabel + + mock_item = MagicMock() + mock_item.label = DocItemLabel.PARAGRAPH + mock_item.text = "" + mock_item.export_to_markdown.return_value = "Fallback Markdown Content" - # This would require actual bytes of docx/xlsx etc. 
- pass + mock_doc = MagicMock() + mock_doc.iterate_items.return_value = [(mock_item, 1)] + + mock_converter = MagicMock() + mock_converter.convert.return_value.document = mock_doc + monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) + + loader = MockDocxLoader() + result = await loader.load(str(file), ctx) + + assert len(result.elements) == 1 + assert result.elements[0].content == "Fallback Markdown Content" + + async def test_specialized_element_types(self, ctx, tmp_path, monkeypatch): + """Verify mapping of list, table, and code elements.""" + file = tmp_path / "test.docx" + file.write_text("dummy content") + + from docling.datamodel.document import DocItemLabel + + mock_items = [ + (MagicMock(label=DocItemLabel.LIST_ITEM, text="List item 1"), 1), + (MagicMock(label=DocItemLabel.TABLE, text="Table content"), 1), + (MagicMock(label=DocItemLabel.CODE, text="print('hello')"), 1), + ] + + mock_doc = MagicMock() + mock_doc.iterate_items.return_value = mock_items + + mock_converter = MagicMock() + mock_converter.convert.return_value.document = mock_doc + monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) + + loader = MockDocxLoader() + result = await loader.load(str(file), ctx) + + elements = result.elements + assert elements[0].type == "list" + assert elements[1].type == "table" + assert elements[2].type == "code" From 857f870b838b721e078a15266c9b3537dced18c6 Mon Sep 17 00:00:00 2001 From: drr00t Date: Mon, 18 May 2026 21:48:22 -0300 Subject: [PATCH 4/8] fix(issues): from coderabbitai review --- graphrag_sdk/pyproject.toml | 2 +- graphrag_sdk/src/graphrag_sdk/api/main.py | 2 ++ graphrag_sdk/tests/test_docling_loaders.py | 10 ++++++++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/graphrag_sdk/pyproject.toml b/graphrag_sdk/pyproject.toml index 042dff15..f5ad2fae 100644 --- a/graphrag_sdk/pyproject.toml +++ b/graphrag_sdk/pyproject.toml @@ -64,7 +64,7 @@ 
all = [ "sentence-transformers>=2.0", "pypdf>=6.9.2", "litellm>=1.83.0,<2.0", - "docling>=2.0.0", + "docling>=2.91.0", ] dev = [ "pytest>=8.0", diff --git a/graphrag_sdk/src/graphrag_sdk/api/main.py b/graphrag_sdk/src/graphrag_sdk/api/main.py index b267e380..ba1f46d0 100644 --- a/graphrag_sdk/src/graphrag_sdk/api/main.py +++ b/graphrag_sdk/src/graphrag_sdk/api/main.py @@ -327,6 +327,8 @@ async def ingest( Uses sensible defaults for any unspecified strategy: - Loader: auto-detected from file extension (PDF or text) + - Loader: auto-detected from file extension + (PDF, Markdown, DOCX, XLSX, PPTX, HTML/XHTML, CSV, or text fallback) - Chunker: SentenceTokenCapChunking(max_tokens=512, overlap_sentences=2) — sentence-aware, never splits entity names at chunk boundaries. Override with ``chunker=FixedSizeChunking(...)`` if you need diff --git a/graphrag_sdk/tests/test_docling_loaders.py b/graphrag_sdk/tests/test_docling_loaders.py index 6ad75aa8..caeadc6f 100644 --- a/graphrag_sdk/tests/test_docling_loaders.py +++ b/graphrag_sdk/tests/test_docling_loaders.py @@ -21,8 +21,14 @@ async def test_import_error_wrapped_in_loader_error(self, ctx, tmp_path, monkeyp loader = MockDocxLoader() # Mocking the import to raise ImportError - with patch("builtins.__import__", side_effect=lambda name, *args, **kwargs: - (exec('raise ImportError("module not found")') if name == "docling.document_converter" else None)): + real_import = __import__ + + def _import(name, *args, **kwargs): + if name == "docling.document_converter": + raise ImportError("module not found") + return real_import(name, *args, **kwargs) + + with patch("builtins.__import__", side_effect=_import): with pytest.raises(LoaderError, match=r"DOCX parsing requires 'docling'"): await loader.load(str(file), ctx) From 4af2f8c19a542bcd71e70d71c5f4c5be50e254e2 Mon Sep 17 00:00:00 2001 From: drr00t Date: Mon, 18 May 2026 21:52:31 -0300 Subject: [PATCH 5/8] fix(conflict): need to be updated --- 
graphrag_sdk/src/graphrag_sdk/api/main.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/graphrag_sdk/src/graphrag_sdk/api/main.py b/graphrag_sdk/src/graphrag_sdk/api/main.py index ba1f46d0..b267e380 100644 --- a/graphrag_sdk/src/graphrag_sdk/api/main.py +++ b/graphrag_sdk/src/graphrag_sdk/api/main.py @@ -327,8 +327,6 @@ async def ingest( Uses sensible defaults for any unspecified strategy: - Loader: auto-detected from file extension (PDF or text) - - Loader: auto-detected from file extension - (PDF, Markdown, DOCX, XLSX, PPTX, HTML/XHTML, CSV, or text fallback) - Chunker: SentenceTokenCapChunking(max_tokens=512, overlap_sentences=2) — sentence-aware, never splits entity names at chunk boundaries. Override with ``chunker=FixedSizeChunking(...)`` if you need From 7969df9b825a39d4c2b3f96ba4ae400885bd41a0 Mon Sep 17 00:00:00 2001 From: Gal Shubeli Date: Mon, 18 May 2026 11:08:46 +0300 Subject: [PATCH 6/8] fix(retrieval): rank MENTIONED_IN chunks by cosine in MultiPath Path C MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Path C in retrieve_chunks used `COLLECT(c)[..3]` with no ORDER BY, so hub entities (which can be MENTIONED_IN hundreds of chunks) returned an arbitrary 3 — almost never including the chunks most relevant to the current query. Add an ORDER BY on `vec.cosineDistance(c.embedding, query_vector)` before the COLLECT so per-entity chunk selection is query-aware. 
Refs #258 Co-Authored-By: Claude Opus 4.7 (1M context) --- .../retrieval/strategies/chunk_retrieval.py | 12 ++++- .../tests/test_multi_path_retrieval.py | 49 +++++++++++++++++++ 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/graphrag_sdk/src/graphrag_sdk/retrieval/strategies/chunk_retrieval.py b/graphrag_sdk/src/graphrag_sdk/retrieval/strategies/chunk_retrieval.py index 3f42fcaa..e71e4455 100644 --- a/graphrag_sdk/src/graphrag_sdk/retrieval/strategies/chunk_retrieval.py +++ b/graphrag_sdk/src/graphrag_sdk/retrieval/strategies/chunk_retrieval.py @@ -51,17 +51,25 @@ def _add(cid: str, text: str, source: str) -> None: except Exception as exc: logger.debug("Chunk vector search failed: %s", exc) - # Path C: MENTIONED_IN — 3 chunks per entity (batched UNWIND) + # Path C: MENTIONED_IN — top-3 chunks per entity, ranked by cosine + # distance to the query embedding. Hub entities (e.g. the main + # product name) can be MENTIONED_IN hundreds of chunks; the previous + # COLLECT(c)[..3] picked an arbitrary 3, almost never including the + # chunks most relevant to the current query. Ranking by cosine here + # surfaces the chunks closest to the query intent. 
eids_mention = [eid for eid, _ in entity_list[:15]] if eids_mention: try: result = await graph_store.query_raw( "UNWIND $eids AS eid " "MATCH (e:__Entity__ {id: eid})-[:MENTIONED_IN]->(c:Chunk) " + "WHERE c.embedding IS NOT NULL " + "WITH eid, c, vec.cosineDistance(c.embedding, vecf32($qv)) AS dist " + "ORDER BY eid, dist ASC " "WITH eid, COLLECT(c)[..3] AS chunks " "UNWIND chunks AS c " "RETURN eid, c.id AS id, c.text AS text", - {"eids": eids_mention}, + {"eids": eids_mention, "qv": query_vector}, ) for row in result.result_set: cid = row[1] diff --git a/graphrag_sdk/tests/test_multi_path_retrieval.py b/graphrag_sdk/tests/test_multi_path_retrieval.py index 53092946..b535730e 100644 --- a/graphrag_sdk/tests/test_multi_path_retrieval.py +++ b/graphrag_sdk/tests/test_multi_path_retrieval.py @@ -233,6 +233,55 @@ async def capture_query(cypher, params=None): ] assert len(twohop_chunk_queries) >= 1 + async def test_mentioned_in_ranks_chunks_by_cosine(self, mp_graph_store, mp_vector_store, mp_embedder, mp_llm): + """The MENTIONED_IN path must rank chunks per entity by cosine + distance to the query embedding — not pick an arbitrary 3. + + Hub entities can be MENTIONED_IN hundreds of chunks; arbitrary + selection almost never surfaces the chunks relevant to the + current query (regression: see PR referencing issue #258). + """ + # Seed entity discovery so Path C runs (mirrors the setup in + # ``test_mentioned_in_and_2hop_chunk_paths``). 
+ mp_vector_store.search_relationships = AsyncMock(return_value=[ + {"src_name": "Alice", "type": "WORKS_AT", "tgt_name": "Acme", "fact": "engineer", "score": 0.9}, + ]) + + captured: list[tuple[str, dict]] = [] + + async def capture_query(cypher, params=None): + captured.append((cypher, params or {})) + result = MagicMock() + result.result_set = [] + return result + + mp_graph_store.query_raw = AsyncMock(side_effect=capture_query) + + s = MultiPathRetrieval( + graph_store=mp_graph_store, + vector_store=mp_vector_store, + embedder=mp_embedder, + llm=mp_llm, + ) + await s.search("Who is Alice?") + + # Find the direct MENTIONED_IN query (entity -> chunk, not 2-hop) + direct_mention = [ + (q, p) for q, p in captured + if "MENTIONED_IN" in q and "Chunk" in q and "neighbor" not in q.lower() + ] + assert direct_mention, "expected at least one direct MENTIONED_IN chunk query" + + cypher, params = direct_mention[0] + # The fix: rank by cosine distance to the query vector before + # COLLECT, so per-entity chunk selection is query-relevant. 
+ assert "vec.cosineDistance" in cypher, ( + "MENTIONED_IN chunk query must rank by cosine distance to the " + "query vector (regression of issue #258)" + ) + assert "ORDER BY" in cypher, "expected ORDER BY to make COLLECT[..3] meaningful" + assert "qv" in params, "query vector must be passed as a parameter" + async def test_format_produces_sections(self, mp_graph_store, mp_vector_store, mp_embedder, mp_llm): """Output should include structured sections when data is available.""" mp_vector_store.search_relationships = AsyncMock(return_value=[ From 531d29c98fde9e0e90e0a6d2f748db177db2e08b Mon Sep 17 00:00:00 2001 From: drr00t Date: Thu, 21 May 2026 16:51:45 -0300 Subject: [PATCH 7/8] test: refactor docling loader tests to use local mock enumerations instead of actual imports --- graphrag_sdk/tests/test_docling_loaders.py | 264 ++++++++++++++------- 1 file changed, 174 insertions(+), 90 deletions(-) diff --git a/graphrag_sdk/tests/test_docling_loaders.py b/graphrag_sdk/tests/test_docling_loaders.py index caeadc6f..dedc4261 100644 --- a/graphrag_sdk/tests/test_docling_loaders.py +++ b/graphrag_sdk/tests/test_docling_loaders.py @@ -12,7 +12,6 @@ def extension_name(self) -> str: class TestDoclingBaseLoader: """Tests for DoclingBaseLoader and its derived loaders.""" - @pytest.mark.asyncio async def test_import_error_wrapped_in_loader_error(self, ctx, tmp_path, monkeypatch): """Verify that ImportError when docling is missing is wrapped in LoaderError.""" file = tmp_path / "test.docx" @@ -35,80 +34,138 @@ def _import(name, *args, **kwargs): async def test_label_mapping_and_metadata_preservation(self, ctx, tmp_path, monkeypatch): """Verify mapping of DocItemLabel and preservation of labels in metadata for fallback cases.""" - from docling.datamodel.document import DocItemLabel - - file = tmp_path / "test.docx" - file.write_text("dummy content") - - # Mock Docling's result structure - mock_item_header = MagicMock() - mock_item_header.label = DocItemLabel.SECTION_HEADER - 
mock_item_header.text = "Header 1" + class MockLabel: + def __init__(self, val): self.value = val + def __eq__(self, other): + if isinstance(other, type) and hasattr(other, "SECTION_HEADER"): # Handle the Enum check + return False # Not a direct match + return isinstance(other, MockLabel) and self.value == other.value - mock_item_para = MagicMock() - mock_item_para.label = DocItemLabel.PARAGRAPH - mock_item_para.text = "Paragraph 1" + class LabelEnum: + SECTION_HEADER = MockLabel("section_header") + PARAGRAPH = MockLabel("paragraph") + FOOTNOTE = MockLabel("footnote") + TITLE = MockLabel("title") + TEXT = MockLabel("text") + LIST_ITEM = MockLabel("list_item") + TABLE = MockLabel("table") + CODE = MockLabel("code") - mock_item_footnote = MagicMock() - mock_item_footnote.label = DocItemLabel.FOOTNOTE - mock_item_footnote.text = "Footnote content" + # The loader does 'from docling.datamodel.document import DocItemLabel' inside _load_sync + # We need to mock the module and the import process. 
+ import sys + mock_docling_datamodel = MagicMock() + mock_docling_datamodel.document = MagicMock() - mock_doc = MagicMock() - mock_doc.iterate_items.return_value = [ - (mock_item_header, 1), - (mock_item_para, 2), - (mock_item_footnote, 2), - ] - - mock_converter = MagicMock() - mock_converter.convert.return_value.document = mock_doc - - monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) + # We create a fake module structure + mock_document_module = MagicMock() + mock_document_module.DocItemLabel = LabelEnum + + # Patching the import mechanism + real_import = __import__ + def mock_import(name, *args, **kwargs): + if name == "docling.datamodel.document": + return mock_document_module + return real_import(name, *args, **kwargs) + + with patch("builtins.__import__", side_effect=mock_import): + file = tmp_path / "test.docx" + file.write_text("dummy content") + + # Mock Docling's result structure + mock_item_header = MagicMock() + mock_item_header.label = LabelEnum.SECTION_HEADER + mock_item_header.text = "Header 1" + + mock_item_para = MagicMock() + mock_item_para.label = LabelEnum.PARAGRAPH + mock_item_para.text = "Paragraph 1" + + mock_item_footnote = MagicMock() + mock_item_footnote.label = LabelEnum.FOOTNOTE + mock_item_footnote.text = "Footnote content" + + mock_doc = MagicMock() + mock_doc.iterate_items.return_value = [ + (mock_item_header, 1), + (mock_item_para, 2), + (mock_item_footnote, 2), + ] + + mock_converter = MagicMock() + mock_converter.convert.return_value.document = mock_doc + monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) + + loader = MockDocxLoader() + result = await loader.load(str(file), ctx) + + elements = result.elements + assert len(elements) == 3 + assert elements[0].type == "header" + assert elements[0].content == "Header 1" + assert elements[1].type == "paragraph" + assert elements[1].content == "Paragraph 1" + # Check fallback and 
metadata preservation + assert elements[2].type == "paragraph" + assert elements[2].content == "Footnote content" + assert elements[2].metadata["label"] == LabelEnum.FOOTNOTE.value - loader = MockDocxLoader() - result = await loader.load(str(file), ctx) - elements = result.elements - assert len(elements) == 3 - assert elements[0].type == "header" - assert elements[0].content == "Header 1" - assert elements[1].type == "paragraph" - assert elements[1].content == "Paragraph 1" - # Check fallback and metadata preservation - assert elements[2].type == "paragraph" - assert elements[2].content == "Footnote content" - assert elements[2].metadata["label"] == DocItemLabel.FOOTNOTE.value async def test_breadcrumbs_construction(self, ctx, tmp_path, monkeypatch): """Verify the breadcrumbs are built correctly following the header hierarchy.""" - file = tmp_path / "test.docx" - file.write_text("dummy content") - - from docling.datamodel.document import DocItemLabel - - # Hierarchy: H1 -> H2 -> P -> H2 (new) -> P - mock_items = [] - mock_items.append((MagicMock(label=DocItemLabel.SECTION_HEADER, text="Root"), 1)) - mock_items.append((MagicMock(label=DocItemLabel.SECTION_HEADER, text="Child"), 2)) - mock_items.append((MagicMock(label=DocItemLabel.PARAGRAPH, text="Text 1"), 3)) - mock_items.append((MagicMock(label=DocItemLabel.SECTION_HEADER, text="Sibling"), 2)) - mock_items.append((MagicMock(label=DocItemLabel.PARAGRAPH, text="Text 2"), 3)) - - mock_doc = MagicMock() - mock_doc.iterate_items.return_value = mock_items - mock_converter = MagicMock() - mock_converter.convert.return_value.document = mock_doc - monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) - - loader = MockDocxLoader() - result = await loader.load(str(file), ctx) + class MockLabel: + def __init__(self, val): self.value = val + def __eq__(self, other): + return isinstance(other, MockLabel) and self.value == other.value + + class LabelEnum: + SECTION_HEADER = 
MockLabel("section_header") + PARAGRAPH = MockLabel("paragraph") + TITLE = MockLabel("title") + TEXT = MockLabel("text") + LIST_ITEM = MockLabel("list_item") + TABLE = MockLabel("table") + CODE = MockLabel("code") + + import sys + mock_document_module = MagicMock() + mock_document_module.DocItemLabel = LabelEnum + + real_import = __import__ + def mock_import(name, *args, **kwargs): + if name == "docling.datamodel.document": + return mock_document_module + return real_import(name, *args, **kwargs) + + with patch("builtins.__import__", side_effect=mock_import): + file = tmp_path / "test.docx" + file.write_text("dummy content") + + # Hierarchy: H1 -> H2 -> P -> H2 (new) -> P + mock_items = [] + mock_items.append((MagicMock(label=LabelEnum.SECTION_HEADER, text="Root"), 1)) + mock_items.append((MagicMock(label=LabelEnum.SECTION_HEADER, text="Child"), 2)) + mock_items.append((MagicMock(label=LabelEnum.PARAGRAPH, text="Text 1"), 3)) + mock_items.append((MagicMock(label=LabelEnum.SECTION_HEADER, text="Sibling"), 2)) + mock_items.append((MagicMock(label=LabelEnum.PARAGRAPH, text="Text 2"), 3)) + + mock_doc = MagicMock() + mock_doc.iterate_items.return_value = mock_items + mock_converter = MagicMock() + mock_converter.convert.return_value.document = mock_doc + monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) + + loader = MockDocxLoader() + result = await loader.load(str(file), ctx) + + elements = result.elements + assert elements[0].breadcrumbs == ["Root"] + assert elements[1].breadcrumbs == ["Root", "Child"] + assert elements[2].breadcrumbs == ["Root", "Child"] + assert elements[3].breadcrumbs == ["Root", "Sibling"] + assert elements[4].breadcrumbs == ["Root", "Sibling"] - elements = result.elements - assert elements[0].breadcrumbs == ["Root"] - assert elements[1].breadcrumbs == ["Root", "Child"] - assert elements[2].breadcrumbs == ["Root", "Child"] - assert elements[3].breadcrumbs == ["Root", "Sibling"] - assert 
elements[4].breadcrumbs == ["Root", "Sibling"] async def test_file_not_found(self, ctx): """Verify that LoaderError is raised when the file does not exist.""" @@ -131,11 +188,16 @@ async def test_docling_conversion_failure(self, ctx, tmp_path, monkeypatch): async def test_export_to_markdown_fallback(self, ctx, tmp_path, monkeypatch): """Verify fallback to export_to_markdown when text attribute is empty.""" + # Mock DocItemLabel to avoid importing docling + class MockLabel: + def __init__(self, val): self.value = val + class LabelEnum: + PARAGRAPH = MockLabel("paragraph") + DocItemLabel = LabelEnum + file = tmp_path / "test.docx" file.write_text("dummy content") - from docling.datamodel.document import DocItemLabel - mock_item = MagicMock() mock_item.label = DocItemLabel.PARAGRAPH mock_item.text = "" @@ -156,28 +218,50 @@ async def test_export_to_markdown_fallback(self, ctx, tmp_path, monkeypatch): async def test_specialized_element_types(self, ctx, tmp_path, monkeypatch): """Verify mapping of list, table, and code elements.""" - file = tmp_path / "test.docx" - file.write_text("dummy content") - - from docling.datamodel.document import DocItemLabel - - mock_items = [ - (MagicMock(label=DocItemLabel.LIST_ITEM, text="List item 1"), 1), - (MagicMock(label=DocItemLabel.TABLE, text="Table content"), 1), - (MagicMock(label=DocItemLabel.CODE, text="print('hello')"), 1), - ] + class MockLabel: + def __init__(self, val): self.value = val + def __eq__(self, other): return isinstance(other, MockLabel) and self.value == other.value + class LabelEnum: + SECTION_HEADER = MockLabel("section_header") + PARAGRAPH = MockLabel("paragraph") + TITLE = MockLabel("title") + TEXT = MockLabel("text") + LIST_ITEM = MockLabel("list_item") + TABLE = MockLabel("table") + CODE = MockLabel("code") - mock_doc = MagicMock() - mock_doc.iterate_items.return_value = mock_items + import sys + mock_document_module = MagicMock() + mock_document_module.DocItemLabel = LabelEnum - mock_converter = 
MagicMock() - mock_converter.convert.return_value.document = mock_doc - monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) - - loader = MockDocxLoader() - result = await loader.load(str(file), ctx) + real_import = __import__ + def mock_import(name, *args, **kwargs): + if name == "docling.datamodel.document": + return mock_document_module + return real_import(name, *args, **kwargs) + + with patch("builtins.__import__", side_effect=mock_import): + file = tmp_path / "test.docx" + file.write_text("dummy content") + + mock_items = [ + (MagicMock(label=LabelEnum.LIST_ITEM, text="List item 1"), 1), + (MagicMock(label=LabelEnum.TABLE, text="Table content"), 1), + (MagicMock(label=LabelEnum.CODE, text="print('hello')"), 1), + ] + + mock_doc = MagicMock() + mock_doc.iterate_items.return_value = mock_items + + mock_converter = MagicMock() + mock_converter.convert.return_value.document = mock_doc + monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) + + loader = MockDocxLoader() + result = await loader.load(str(file), ctx) + + elements = result.elements + assert elements[0].type == "list" + assert elements[1].type == "table" + assert elements[2].type == "code" - elements = result.elements - assert elements[0].type == "list" - assert elements[1].type == "table" - assert elements[2].type == "code" From a5ecc4ee21b66b68dc91517840a3b6598d65b195 Mon Sep 17 00:00:00 2001 From: drr00t Date: Thu, 21 May 2026 21:25:03 -0300 Subject: [PATCH 8/8] test: implement robust sys.modules mocking for docling in unit tests and add debug helpers --- graphrag_sdk/dummy_path | 0 graphrag_sdk/pyproject.toml | 6 + graphrag_sdk/test_debug.py | 28 ++ graphrag_sdk/test_monkeypatch.py | 19 ++ graphrag_sdk/test_monkeypatch2.py | 35 +++ graphrag_sdk/test_monkeypatch3.py | 35 +++ graphrag_sdk/tests/test_docling_loaders.py | 309 +++++++++------------ test_import_mock.py | 15 + test_mock_import.py | 15 + 
test_patch_dict.py | 5 + test_sys_modules.py | 6 + 11 files changed, 292 insertions(+), 181 deletions(-) create mode 100644 graphrag_sdk/dummy_path create mode 100644 graphrag_sdk/test_debug.py create mode 100644 graphrag_sdk/test_monkeypatch.py create mode 100644 graphrag_sdk/test_monkeypatch2.py create mode 100644 graphrag_sdk/test_monkeypatch3.py create mode 100644 test_import_mock.py create mode 100644 test_mock_import.py create mode 100644 test_patch_dict.py create mode 100644 test_sys_modules.py diff --git a/graphrag_sdk/dummy_path b/graphrag_sdk/dummy_path new file mode 100644 index 00000000..e69de29b diff --git a/graphrag_sdk/pyproject.toml b/graphrag_sdk/pyproject.toml index f5ad2fae..46a70f36 100644 --- a/graphrag_sdk/pyproject.toml +++ b/graphrag_sdk/pyproject.toml @@ -99,3 +99,9 @@ plugins = ["pydantic.mypy"] [tool.pytest.ini_options] asyncio_mode = "auto" testpaths = ["tests"] +filterwarnings = [ + "ignore:builtin type SwigPyObject has no __module__ attribute:DeprecationWarning", + "ignore:.*hf_xet.download_files\\(\\) is deprecated.*:DeprecationWarning", + "ignore:.*`torch.jit.script` is deprecated.*:DeprecationWarning", + "ignore:.*The `resume_download` argument is deprecated.*:UserWarning" +] diff --git a/graphrag_sdk/test_debug.py b/graphrag_sdk/test_debug.py new file mode 100644 index 00000000..ee4d839f --- /dev/null +++ b/graphrag_sdk/test_debug.py @@ -0,0 +1,28 @@ +import asyncio +from unittest.mock import MagicMock +from graphrag_sdk.core.context import Context +from tests.test_docling_loaders import MockDocxLoader, LabelEnum + +loader = MockDocxLoader() +mock_items = [ + (MagicMock(label=LabelEnum.LIST_ITEM, text="List item 1"), 1), + (MagicMock(label=LabelEnum.TABLE, text="Table content"), 1), + (MagicMock(label=LabelEnum.CODE, text="print('hello')"), 1), +] + +mock_doc = MagicMock() +mock_doc.iterate_items.return_value = mock_items + +mock_converter = MagicMock() +mock_converter.convert.return_value.document = mock_doc + +# Force the 
monkeypatch manually +import sys +sys.modules["docling"] = MagicMock() +sys.modules["docling.datamodel"] = MagicMock() +sys.modules["docling.datamodel.document"] = MagicMock(DocItemLabel=LabelEnum) +sys.modules["docling.document_converter"] = MagicMock(DocumentConverter=MagicMock(return_value=mock_converter)) + +ctx = Context() +result = loader._load_sync("dummy_path") +print(result.elements) diff --git a/graphrag_sdk/test_monkeypatch.py b/graphrag_sdk/test_monkeypatch.py new file mode 100644 index 00000000..4dd8ca2b --- /dev/null +++ b/graphrag_sdk/test_monkeypatch.py @@ -0,0 +1,19 @@ +import sys, asyncio +from unittest.mock import patch, MagicMock + +async def main(): + mock_docling = MagicMock() + # mock_docling.__path__ = [] # Let's see without this + modules = { + 'docling': mock_docling, + 'docling.datamodel': MagicMock(), + 'docling.datamodel.document': MagicMock() + } + with patch.dict('sys.modules', modules): + def worker(): + from docling.datamodel.document import DocItemLabel + return 'success' + res = await asyncio.to_thread(worker) + print(res) + +asyncio.run(main()) diff --git a/graphrag_sdk/test_monkeypatch2.py b/graphrag_sdk/test_monkeypatch2.py new file mode 100644 index 00000000..e9c35d70 --- /dev/null +++ b/graphrag_sdk/test_monkeypatch2.py @@ -0,0 +1,35 @@ +import sys, asyncio, pytest +from unittest.mock import patch, MagicMock + +async def load_sync(): + from docling.datamodel.document import DocItemLabel + return "success" + +async def test_first(): + real_import = __import__ + def _import(name, *args, **kwargs): + if name == "docling.datamodel.document": + raise ImportError("module not found") + return real_import(name, *args, **kwargs) + + with patch("builtins.__import__", side_effect=_import): + try: + await asyncio.to_thread(load_sync) + except Exception as e: + pass # catch the mocked exception + +async def test_second(): + res = await asyncio.to_thread(load_sync) + print("Second test:", res) + +async def main(): + modules = { + 
'docling': MagicMock(), + 'docling.datamodel': MagicMock(), + 'docling.datamodel.document': MagicMock() + } + with patch.dict('sys.modules', modules): + await test_first() + await test_second() + +asyncio.run(main()) diff --git a/graphrag_sdk/test_monkeypatch3.py b/graphrag_sdk/test_monkeypatch3.py new file mode 100644 index 00000000..03bd0823 --- /dev/null +++ b/graphrag_sdk/test_monkeypatch3.py @@ -0,0 +1,35 @@ +import sys, asyncio +from unittest.mock import patch, MagicMock + +def load_sync(): + from docling.datamodel.document import DocItemLabel + return "success" + +async def test_first(): + real_import = __import__ + def _import(name, *args, **kwargs): + if name == "docling.datamodel.document": + raise ImportError("module not found") + return real_import(name, *args, **kwargs) + + with patch("builtins.__import__", side_effect=_import): + try: + await asyncio.to_thread(load_sync) + except Exception as e: + pass # catch the mocked exception + +async def test_second(): + res = await asyncio.to_thread(load_sync) + print("Second test:", res) + +async def main(): + modules = { + 'docling': MagicMock(), + 'docling.datamodel': MagicMock(), + 'docling.datamodel.document': MagicMock() + } + with patch.dict('sys.modules', modules): + await test_first() + await test_second() + +asyncio.run(main()) diff --git a/graphrag_sdk/tests/test_docling_loaders.py b/graphrag_sdk/tests/test_docling_loaders.py index dedc4261..da8022c7 100644 --- a/graphrag_sdk/tests/test_docling_loaders.py +++ b/graphrag_sdk/tests/test_docling_loaders.py @@ -1,4 +1,5 @@ import pytest +import sys from unittest.mock import MagicMock, patch from graphrag_sdk.core.exceptions import LoaderError from graphrag_sdk.core.models import DocumentOutput, DocumentInfo, DocumentElement @@ -9,10 +10,52 @@ class MockDocxLoader(DoclingBaseLoader): def extension_name(self) -> str: return "docx" +class MockLabel: + def __init__(self, val): self.value = val + def __eq__(self, other): + if isinstance(other, type) and 
hasattr(other, "SECTION_HEADER"): + return False # Not a direct match + return isinstance(other, MockLabel) and self.value == other.value + +class LabelEnum: + SECTION_HEADER = MockLabel("section_header") + PARAGRAPH = MockLabel("paragraph") + FOOTNOTE = MockLabel("footnote") + TITLE = MockLabel("title") + TEXT = MockLabel("text") + LIST_ITEM = MockLabel("list_item") + TABLE = MockLabel("table") + CODE = MockLabel("code") + class TestDoclingBaseLoader: """Tests for DoclingBaseLoader and its derived loaders.""" - async def test_import_error_wrapped_in_loader_error(self, ctx, tmp_path, monkeypatch): + @pytest.fixture(autouse=True) + def mock_docling_modules(self): + """Mock the docling module namespace in sys.modules.""" + mock_datamodel = MagicMock() + mock_datamodel.DocItemLabel = LabelEnum + + mock_converter_mod = MagicMock() + mock_converter_mod.DocumentConverter = MagicMock() + + mock_docling = MagicMock() + mock_docling.__path__ = [] + + mock_datamodel_pkg = MagicMock() + mock_datamodel_pkg.__path__ = [] + + modules = { + "docling": mock_docling, + "docling.datamodel": mock_datamodel_pkg, + "docling.datamodel.document": mock_datamodel, + "docling.document_converter": mock_converter_mod, + } + + with patch.dict("sys.modules", modules): + yield + + async def test_import_error_wrapped_in_loader_error(self, ctx, tmp_path): """Verify that ImportError when docling is missing is wrapped in LoaderError.""" file = tmp_path / "test.docx" file.write_text("dummy content") @@ -21,7 +64,6 @@ async def test_import_error_wrapped_in_loader_error(self, ctx, tmp_path, monkeyp # Mocking the import to raise ImportError real_import = __import__ - def _import(name, *args, **kwargs): if name == "docling.document_converter": raise ImportError("module not found") @@ -31,141 +73,77 @@ def _import(name, *args, **kwargs): with pytest.raises(LoaderError, match=r"DOCX parsing requires 'docling'"): await loader.load(str(file), ctx) - - async def 
test_label_mapping_and_metadata_preservation(self, ctx, tmp_path, monkeypatch): + async def test_label_mapping_and_metadata_preservation(self, ctx, tmp_path): """Verify mapping of DocItemLabel and preservation of labels in metadata for fallback cases.""" - class MockLabel: - def __init__(self, val): self.value = val - def __eq__(self, other): - if isinstance(other, type) and hasattr(other, "SECTION_HEADER"): # Handle the Enum check - return False # Not a direct match - return isinstance(other, MockLabel) and self.value == other.value - - class LabelEnum: - SECTION_HEADER = MockLabel("section_header") - PARAGRAPH = MockLabel("paragraph") - FOOTNOTE = MockLabel("footnote") - TITLE = MockLabel("title") - TEXT = MockLabel("text") - LIST_ITEM = MockLabel("list_item") - TABLE = MockLabel("table") - CODE = MockLabel("code") + file = tmp_path / "test.docx" + file.write_text("dummy content") + + # Mock Docling's result structure + mock_item_header = MagicMock() + mock_item_header.label = LabelEnum.SECTION_HEADER + mock_item_header.text = "Header 1" - # The loader does 'from docling.datamodel.document import DocItemLabel' inside _load_sync - # We need to mock the module and the import process. 
- import sys - mock_docling_datamodel = MagicMock() - mock_docling_datamodel.document = MagicMock() + mock_item_para = MagicMock() + mock_item_para.label = LabelEnum.PARAGRAPH + mock_item_para.text = "Paragraph 1" - # We create a fake module structure - mock_document_module = MagicMock() - mock_document_module.DocItemLabel = LabelEnum + mock_item_footnote = MagicMock() + mock_item_footnote.label = LabelEnum.FOOTNOTE + mock_item_footnote.text = "Footnote content" - # Patching the import mechanism - real_import = __import__ - def mock_import(name, *args, **kwargs): - if name == "docling.datamodel.document": - return mock_document_module - return real_import(name, *args, **kwargs) - - with patch("builtins.__import__", side_effect=mock_import): - file = tmp_path / "test.docx" - file.write_text("dummy content") - - # Mock Docling's result structure - mock_item_header = MagicMock() - mock_item_header.label = LabelEnum.SECTION_HEADER - mock_item_header.text = "Header 1" - - mock_item_para = MagicMock() - mock_item_para.label = LabelEnum.PARAGRAPH - mock_item_para.text = "Paragraph 1" - - mock_item_footnote = MagicMock() - mock_item_footnote.label = LabelEnum.FOOTNOTE - mock_item_footnote.text = "Footnote content" - - mock_doc = MagicMock() - mock_doc.iterate_items.return_value = [ - (mock_item_header, 1), - (mock_item_para, 2), - (mock_item_footnote, 2), - ] - - mock_converter = MagicMock() - mock_converter.convert.return_value.document = mock_doc - monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) - - loader = MockDocxLoader() - result = await loader.load(str(file), ctx) - - elements = result.elements - assert len(elements) == 3 - assert elements[0].type == "header" - assert elements[0].content == "Header 1" - assert elements[1].type == "paragraph" - assert elements[1].content == "Paragraph 1" - # Check fallback and metadata preservation - assert elements[2].type == "paragraph" - assert elements[2].content == "Footnote 
content" - assert elements[2].metadata["label"] == LabelEnum.FOOTNOTE.value + mock_doc = MagicMock() + mock_doc.iterate_items.return_value = [ + (mock_item_header, 1), + (mock_item_para, 2), + (mock_item_footnote, 2), + ] + mock_converter = MagicMock() + mock_converter.convert.return_value.document = mock_doc + sys.modules["docling.document_converter"].DocumentConverter = lambda **kwargs: mock_converter + loader = MockDocxLoader() + result = await loader.load(str(file), ctx) - async def test_breadcrumbs_construction(self, ctx, tmp_path, monkeypatch): + elements = result.elements + assert len(elements) == 3 + assert elements[0].type == "header" + assert elements[0].content == "Header 1" + assert elements[1].type == "paragraph" + assert elements[1].content == "Paragraph 1" + # Check fallback and metadata preservation + assert elements[2].type == "paragraph" + assert elements[2].content == "Footnote content" + assert elements[2].metadata["label"] == LabelEnum.FOOTNOTE.value + + async def test_breadcrumbs_construction(self, ctx, tmp_path): """Verify the breadcrumbs are built correctly following the header hierarchy.""" - class MockLabel: - def __init__(self, val): self.value = val - def __eq__(self, other): - return isinstance(other, MockLabel) and self.value == other.value - - class LabelEnum: - SECTION_HEADER = MockLabel("section_header") - PARAGRAPH = MockLabel("paragraph") - TITLE = MockLabel("title") - TEXT = MockLabel("text") - LIST_ITEM = MockLabel("list_item") - TABLE = MockLabel("table") - CODE = MockLabel("code") - - import sys - mock_document_module = MagicMock() - mock_document_module.DocItemLabel = LabelEnum - - real_import = __import__ - def mock_import(name, *args, **kwargs): - if name == "docling.datamodel.document": - return mock_document_module - return real_import(name, *args, **kwargs) - - with patch("builtins.__import__", side_effect=mock_import): - file = tmp_path / "test.docx" - file.write_text("dummy content") - - # Hierarchy: H1 -> H2 -> P -> 
H2 (new) -> P - mock_items = [] - mock_items.append((MagicMock(label=LabelEnum.SECTION_HEADER, text="Root"), 1)) - mock_items.append((MagicMock(label=LabelEnum.SECTION_HEADER, text="Child"), 2)) - mock_items.append((MagicMock(label=LabelEnum.PARAGRAPH, text="Text 1"), 3)) - mock_items.append((MagicMock(label=LabelEnum.SECTION_HEADER, text="Sibling"), 2)) - mock_items.append((MagicMock(label=LabelEnum.PARAGRAPH, text="Text 2"), 3)) + file = tmp_path / "test.docx" + file.write_text("dummy content") - mock_doc = MagicMock() - mock_doc.iterate_items.return_value = mock_items - mock_converter = MagicMock() - mock_converter.convert.return_value.document = mock_doc - monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) + # Hierarchy: H1 -> H2 -> P -> H2 (new) -> P + mock_items = [] + mock_items.append((MagicMock(label=LabelEnum.SECTION_HEADER, text="Root"), 1)) + mock_items.append((MagicMock(label=LabelEnum.SECTION_HEADER, text="Child"), 2)) + mock_items.append((MagicMock(label=LabelEnum.PARAGRAPH, text="Text 1"), 3)) + mock_items.append((MagicMock(label=LabelEnum.SECTION_HEADER, text="Sibling"), 2)) + mock_items.append((MagicMock(label=LabelEnum.PARAGRAPH, text="Text 2"), 3)) - loader = MockDocxLoader() - result = await loader.load(str(file), ctx) + mock_doc = MagicMock() + mock_doc.iterate_items.return_value = mock_items + mock_converter = MagicMock() + mock_converter.convert.return_value.document = mock_doc + sys.modules["docling.document_converter"].DocumentConverter = lambda **kwargs: mock_converter - elements = result.elements - assert elements[0].breadcrumbs == ["Root"] - assert elements[1].breadcrumbs == ["Root", "Child"] - assert elements[2].breadcrumbs == ["Root", "Child"] - assert elements[3].breadcrumbs == ["Root", "Sibling"] - assert elements[4].breadcrumbs == ["Root", "Sibling"] + loader = MockDocxLoader() + result = await loader.load(str(file), ctx) + elements = result.elements + assert elements[0].breadcrumbs 
== ["Root"] + assert elements[1].breadcrumbs == ["Root", "Child"] + assert elements[2].breadcrumbs == ["Root", "Child"] + assert elements[3].breadcrumbs == ["Root", "Sibling"] + assert elements[4].breadcrumbs == ["Root", "Sibling"] async def test_file_not_found(self, ctx): """Verify that LoaderError is raised when the file does not exist.""" @@ -173,33 +151,26 @@ async def test_file_not_found(self, ctx): with pytest.raises(LoaderError, match="File not found"): await loader.load("/non/existent/path.docx", ctx) - async def test_docling_conversion_failure(self, ctx, tmp_path, monkeypatch): + async def test_docling_conversion_failure(self, ctx, tmp_path): """Verify that exceptions during docling conversion are wrapped in LoaderError.""" file = tmp_path / "test.docx" file.write_text("dummy content") mock_converter = MagicMock() mock_converter.convert.side_effect = Exception("Conversion failed") - monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) + sys.modules["docling.document_converter"].DocumentConverter = lambda **kwargs: mock_converter loader = MockDocxLoader() with pytest.raises(LoaderError, match="Docling failed to process"): await loader.load(str(file), ctx) - async def test_export_to_markdown_fallback(self, ctx, tmp_path, monkeypatch): + async def test_export_to_markdown_fallback(self, ctx, tmp_path): """Verify fallback to export_to_markdown when text attribute is empty.""" - # Mock DocItemLabel to avoid importing docling - class MockLabel: - def __init__(self, val): self.value = val - class LabelEnum: - PARAGRAPH = MockLabel("paragraph") - DocItemLabel = LabelEnum - file = tmp_path / "test.docx" file.write_text("dummy content") mock_item = MagicMock() - mock_item.label = DocItemLabel.PARAGRAPH + mock_item.label = LabelEnum.PARAGRAPH mock_item.text = "" mock_item.export_to_markdown.return_value = "Fallback Markdown Content" @@ -208,7 +179,7 @@ class LabelEnum: mock_converter = MagicMock() 
mock_converter.convert.return_value.document = mock_doc - monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) + sys.modules["docling.document_converter"].DocumentConverter = lambda **kwargs: mock_converter loader = MockDocxLoader() result = await loader.load(str(file), ctx) @@ -216,52 +187,28 @@ class LabelEnum: assert len(result.elements) == 1 assert result.elements[0].content == "Fallback Markdown Content" - async def test_specialized_element_types(self, ctx, tmp_path, monkeypatch): + async def test_specialized_element_types(self, ctx, tmp_path): """Verify mapping of list, table, and code elements.""" - class MockLabel: - def __init__(self, val): self.value = val - def __eq__(self, other): return isinstance(other, MockLabel) and self.value == other.value - class LabelEnum: - SECTION_HEADER = MockLabel("section_header") - PARAGRAPH = MockLabel("paragraph") - TITLE = MockLabel("title") - TEXT = MockLabel("text") - LIST_ITEM = MockLabel("list_item") - TABLE = MockLabel("table") - CODE = MockLabel("code") - - import sys - mock_document_module = MagicMock() - mock_document_module.DocItemLabel = LabelEnum + file = tmp_path / "test.docx" + file.write_text("dummy content") + + mock_items = [ + (MagicMock(label=LabelEnum.LIST_ITEM, text="List item 1"), 1), + (MagicMock(label=LabelEnum.TABLE, text="Table content"), 1), + (MagicMock(label=LabelEnum.CODE, text="print('hello')"), 1), + ] - real_import = __import__ - def mock_import(name, *args, **kwargs): - if name == "docling.datamodel.document": - return mock_document_module - return real_import(name, *args, **kwargs) + mock_doc = MagicMock() + mock_doc.iterate_items.return_value = mock_items - with patch("builtins.__import__", side_effect=mock_import): - file = tmp_path / "test.docx" - file.write_text("dummy content") - - mock_items = [ - (MagicMock(label=LabelEnum.LIST_ITEM, text="List item 1"), 1), - (MagicMock(label=LabelEnum.TABLE, text="Table content"), 1), - 
(MagicMock(label=LabelEnum.CODE, text="print('hello')"), 1), - ] - - mock_doc = MagicMock() - mock_doc.iterate_items.return_value = mock_items - - mock_converter = MagicMock() - mock_converter.convert.return_value.document = mock_doc - monkeypatch.setattr("docling.document_converter.DocumentConverter", lambda **kwargs: mock_converter) - - loader = MockDocxLoader() - result = await loader.load(str(file), ctx) + mock_converter = MagicMock() + mock_converter.convert.return_value.document = mock_doc + sys.modules["docling.document_converter"].DocumentConverter = lambda **kwargs: mock_converter - elements = result.elements - assert elements[0].type == "list" - assert elements[1].type == "table" - assert elements[2].type == "code" + loader = MockDocxLoader() + result = await loader.load(str(file), ctx) + elements = result.elements + assert elements[0].type == "list" + assert elements[1].type == "table" + assert elements[2].type == "code" diff --git a/test_import_mock.py b/test_import_mock.py new file mode 100644 index 00000000..be131461 --- /dev/null +++ b/test_import_mock.py @@ -0,0 +1,15 @@ +import sys +from unittest.mock import patch, MagicMock + +modules = {"fake": MagicMock()} +with patch.dict("sys.modules", modules): + def _import(name, *args, **kwargs): + if name == "fake": + raise ImportError("module not found") + return __import__(name, *args, **kwargs) + with patch("builtins.__import__", side_effect=_import): + try: + import fake + print("Import succeeded") + except ImportError: + print("ImportError raised") diff --git a/test_mock_import.py b/test_mock_import.py new file mode 100644 index 00000000..681b8260 --- /dev/null +++ b/test_mock_import.py @@ -0,0 +1,15 @@ +import sys +from unittest.mock import MagicMock +class LabelEnum: + LIST_ITEM = "list_item" + +mock_datamodel = MagicMock() +mock_datamodel.DocItemLabel = LabelEnum + +sys.modules["docling"] = MagicMock() +sys.modules["docling.datamodel"] = MagicMock() +sys.modules["docling.datamodel.document"] = 
mock_datamodel + +from docling.datamodel.document import DocItemLabel +print("DocItemLabel is:", DocItemLabel) +print("DocItemLabel.LIST_ITEM is:", getattr(DocItemLabel, "LIST_ITEM", None)) diff --git a/test_patch_dict.py b/test_patch_dict.py new file mode 100644 index 00000000..44bf9000 --- /dev/null +++ b/test_patch_dict.py @@ -0,0 +1,5 @@ +import sys +from unittest.mock import patch, MagicMock +with patch.dict("sys.modules", {"docling": MagicMock(), "docling.document_converter": MagicMock()}): + import docling.document_converter + print("Success patch.dict!") diff --git a/test_sys_modules.py b/test_sys_modules.py new file mode 100644 index 00000000..613805cf --- /dev/null +++ b/test_sys_modules.py @@ -0,0 +1,6 @@ +import sys +from unittest.mock import MagicMock +sys.modules['docling'] = MagicMock() +sys.modules['docling.document_converter'] = MagicMock() +import docling.document_converter +print("Success!")