diff --git a/graphrag_sdk/dummy_path b/graphrag_sdk/dummy_path new file mode 100644 index 00000000..e69de29b diff --git a/graphrag_sdk/pyproject.toml b/graphrag_sdk/pyproject.toml index be8597fd..46a70f36 100644 --- a/graphrag_sdk/pyproject.toml +++ b/graphrag_sdk/pyproject.toml @@ -56,6 +56,7 @@ litellm = ["litellm>=1.83.0,<2.0"] openrouter = ["openai>=1.0,<2.0"] fastcoref = ["fastcoref>=2.0"] spacy = ["spacy>=3.0"] +docling = ["docling>=2.91.0"] all = [ "openai>=1.0,<2.0", "anthropic>=0.20,<1.0", @@ -63,6 +64,7 @@ all = [ "sentence-transformers>=2.0", "pypdf>=6.9.2", "litellm>=1.83.0,<2.0", + "docling>=2.91.0", ] dev = [ "pytest>=8.0", @@ -97,3 +99,9 @@ plugins = ["pydantic.mypy"] [tool.pytest.ini_options] asyncio_mode = "auto" testpaths = ["tests"] +filterwarnings = [ + "ignore:builtin type SwigPyObject has no __module__ attribute:DeprecationWarning", + "ignore:.*hf_xet.download_files\\(\\) is deprecated.*:DeprecationWarning", + "ignore:.*`torch.jit.script` is deprecated.*:DeprecationWarning", + "ignore:.*The `resume_download` argument is deprecated.*:UserWarning" +] diff --git a/graphrag_sdk/src/graphrag_sdk/api/main.py b/graphrag_sdk/src/graphrag_sdk/api/main.py index 684c7a2d..b267e380 100644 --- a/graphrag_sdk/src/graphrag_sdk/api/main.py +++ b/graphrag_sdk/src/graphrag_sdk/api/main.py @@ -37,9 +37,14 @@ from graphrag_sdk.ingestion.extraction_strategies.base import ExtractionStrategy from graphrag_sdk.ingestion.extraction_strategies.graph_extraction import GraphExtraction from graphrag_sdk.ingestion.loaders.base import LoaderStrategy +from graphrag_sdk.ingestion.loaders.csv_loader import CsvLoader +from graphrag_sdk.ingestion.loaders.docx_loader import DocxLoader +from graphrag_sdk.ingestion.loaders.html_loader import HtmlLoader from graphrag_sdk.ingestion.loaders.markdown_loader import MarkdownLoader from graphrag_sdk.ingestion.loaders.pdf_loader import PdfLoader +from graphrag_sdk.ingestion.loaders.pptx_loader import PptxLoader from 
graphrag_sdk.ingestion.loaders.text_loader import TextLoader +from graphrag_sdk.ingestion.loaders.xlsx_loader import XlsxLoader from graphrag_sdk.ingestion.pipeline import IngestionPipeline from graphrag_sdk.ingestion.resolution_strategies.base import ResolutionStrategy from graphrag_sdk.ingestion.resolution_strategies.exact_match import ExactMatchResolution @@ -637,6 +642,16 @@ def _default_loader_for(source: str) -> LoaderStrategy: return PdfLoader() if lower.endswith(".md"): return MarkdownLoader() + if lower.endswith(".docx"): + return DocxLoader() + if lower.endswith(".xlsx"): + return XlsxLoader() + if lower.endswith(".pptx"): + return PptxLoader() + if lower.endswith(".html") or lower.endswith(".xhtml"): + return HtmlLoader() + if lower.endswith(".csv"): + return CsvLoader() return TextLoader() # ── Incremental Updates ───────────────────────────────────── diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/__init__.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/__init__.py index 9a781bca..01001905 100644 --- a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/__init__.py +++ b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/__init__.py @@ -1,8 +1,23 @@ # GraphRAG SDK — Ingestion: Loaders from graphrag_sdk.ingestion.loaders.base import LoaderStrategy +from graphrag_sdk.ingestion.loaders.csv_loader import CsvLoader +from graphrag_sdk.ingestion.loaders.docx_loader import DocxLoader +from graphrag_sdk.ingestion.loaders.html_loader import HtmlLoader from graphrag_sdk.ingestion.loaders.markdown_loader import MarkdownLoader from graphrag_sdk.ingestion.loaders.pdf_loader import PdfLoader +from graphrag_sdk.ingestion.loaders.pptx_loader import PptxLoader from graphrag_sdk.ingestion.loaders.text_loader import TextLoader +from graphrag_sdk.ingestion.loaders.xlsx_loader import XlsxLoader -__all__ = ["LoaderStrategy", "MarkdownLoader", "PdfLoader", "TextLoader"] +__all__ = [ + "LoaderStrategy", + "MarkdownLoader", + "PdfLoader", + "TextLoader", + 
# GraphRAG SDK — Ingestion: Docling Base Loader
# Pattern: Strategy Base Class

from __future__ import annotations

import asyncio
import logging
from pathlib import Path
from typing import Any

from graphrag_sdk.core.context import Context
from graphrag_sdk.core.exceptions import LoaderError
from graphrag_sdk.core.models import DocumentElement, DocumentInfo, DocumentOutput
from graphrag_sdk.ingestion.loaders.base import LoaderStrategy

logger = logging.getLogger(__name__)


class DoclingBaseLoader(LoaderStrategy):
    """Base loader that parses a document with docling.

    The converted document is walked item by item and each item with
    non-empty content becomes one ``DocumentElement``. Header items also
    maintain a breadcrumb trail that is attached to every element.

    Subclasses should override the ``extension_name`` property; its value is
    used in log/error messages and stored in the output metadata.
    """

    def __init__(self, **docling_kwargs: Any) -> None:
        """Initialize the loader.

        Args:
            **docling_kwargs: Arbitrary keyword arguments passed to
                `docling.document_converter.DocumentConverter` (e.g.,
                pipeline_options).
        """
        self.docling_kwargs = docling_kwargs

    @property
    def extension_name(self) -> str:
        """Short format name of the files this loader handles."""
        return "unknown"

    async def load(self, source: str, ctx: Context) -> DocumentOutput:
        """Load ``source`` without blocking the event loop.

        Args:
            source: Path of the file to load.
            ctx: Context used to log the start of the load.

        Returns:
            The ``DocumentOutput`` produced by ``_load_sync``.

        Raises:
            LoaderError: Propagated from ``_load_sync``.
        """
        ctx.log(f"Loading {self.extension_name.upper()} file via docling: {source}")
        # Run synchronous docling extraction in a non-blocking thread
        return await asyncio.to_thread(self._load_sync, source)

    def _load_sync(self, source: str) -> DocumentOutput:
        """Convert ``source`` with docling and map the result to a ``DocumentOutput``.

        Args:
            source: Path of the file to load.

        Returns:
            A ``DocumentOutput`` whose ``text`` is every non-empty item content
            joined by blank lines and whose ``elements`` mirror those items.

        Raises:
            LoaderError: If the file does not exist, docling cannot be
                imported, or the docling conversion raises.
        """
        path = Path(source)
        if not path.exists():
            raise LoaderError(f"File not found: {source}")

        # docling is an optional extra, so it is imported lazily here.
        try:
            from docling.datamodel.document import DocItemLabel
            from docling.document_converter import DocumentConverter
        except ImportError as exc:
            # Chain the cause so the real import failure stays visible.
            raise LoaderError(
                f"{self.extension_name.upper()} parsing requires 'docling'. Install with:\n"
                "  pip install graphrag-sdk[docling]"
            ) from exc

        try:
            converter = DocumentConverter(**self.docling_kwargs)
            result = converter.convert(source)
            doc = result.document
        except Exception as exc:
            raise LoaderError(f"Docling failed to process {source}: {exc}") from exc

        elements, text_blocks = self._map_items(doc, DocItemLabel)

        return DocumentOutput(
            text="\n\n".join(text_blocks),
            document_info=DocumentInfo(
                path=str(path),
                metadata={
                    "size_bytes": path.stat().st_size,
                    "loader": self.extension_name,
                    "suffix": path.suffix,
                },
            ),
            elements=elements,
        )

    @staticmethod
    def _item_content(item: Any) -> Any:
        """Return the item's ``text``, falling back to ``export_to_markdown()``.

        The fallback is best effort: if the export raises, the failure is
        logged at debug level and the (empty) ``text`` value is returned, so
        the caller skips the item.
        """
        content = getattr(item, "text", "")
        if content or not hasattr(item, "export_to_markdown"):
            return content
        try:
            return item.export_to_markdown()
        except Exception:
            logger.debug(
                "export_to_markdown() failed for %s item; skipping it",
                type(item).__name__,
                exc_info=True,
            )
            return content

    def _map_items(
        self, doc: Any, labels: Any
    ) -> tuple[list[DocumentElement], list[str]]:
        """Map the items of a converted docling document to ``DocumentElement``s.

        Args:
            doc: Converted document; ``iterate_items()`` must yield
                ``(item, level)`` pairs.
            labels: The ``DocItemLabel`` namespace the item labels are
                compared against.

        Returns:
            ``(elements, text_blocks)`` in document order. Items without
            content contribute to neither list.
        """
        header_labels = (labels.TITLE, labels.SECTION_HEADER)
        # Matched with ``in`` (equality), not a dict lookup, so labels only
        # need to support ``==``.
        typed_labels = (
            ((labels.PARAGRAPH, labels.TEXT), "paragraph"),
            ((labels.LIST_ITEM,), "list"),
            ((labels.TABLE,), "table"),
            ((labels.CODE,), "code"),
        )

        elements: list[DocumentElement] = []
        text_blocks: list[str] = []
        trail: list[tuple[int, str]] = []  # (level, heading text) of open headers

        for item, level in doc.iterate_items():
            content = self._item_content(item)
            if not content:
                continue

            text_blocks.append(content)
            label = getattr(item, "label", None)

            if label in header_labels:
                # A new header closes every open header at the same or a
                # deeper level before it joins the trail itself.
                while trail and trail[-1][0] >= level:
                    trail.pop()
                trail.append((level, content))
                elements.append(
                    DocumentElement(
                        type="header",
                        level=level,
                        content=content,
                        breadcrumbs=[title for _, title in trail],
                    )
                )
                continue

            breadcrumbs = [title for _, title in trail]
            element_type = next(
                (kind for group, kind in typed_labels if label in group), None
            )
            if element_type is not None:
                elements.append(
                    DocumentElement(
                        type=element_type,
                        content=content,
                        breadcrumbs=breadcrumbs,
                    )
                )
            else:
                # Default for CAPTION, FOOTNOTE, etc.: keep the text as a
                # paragraph and record the original label in the metadata.
                elements.append(
                    DocumentElement(
                        type="paragraph",
                        content=content,
                        breadcrumbs=breadcrumbs,
                        metadata={"label": getattr(label, "value", label)},
                    )
                )

        return elements, text_blocks
b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/html_loader.py @@ -0,0 +1,12 @@ +# GraphRAG SDK — Ingestion: HTML Loader +# Pattern: Strategy + +from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader + + +class HtmlLoader(DoclingBaseLoader): + """Load text and structural elements from an HTML/XHTML file using Docling.""" + + @property + def extension_name(self) -> str: + return "html" diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/pptx_loader.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/pptx_loader.py new file mode 100644 index 00000000..a2a6fcb0 --- /dev/null +++ b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/pptx_loader.py @@ -0,0 +1,12 @@ +# GraphRAG SDK — Ingestion: PPTX Loader +# Pattern: Strategy + +from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader + + +class PptxLoader(DoclingBaseLoader): + """Load text and structural elements from a PPTX file using Docling.""" + + @property + def extension_name(self) -> str: + return "pptx" diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/xlsx_loader.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/xlsx_loader.py new file mode 100644 index 00000000..3b09e9b8 --- /dev/null +++ b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/xlsx_loader.py @@ -0,0 +1,12 @@ +# GraphRAG SDK — Ingestion: XLSX Loader +# Pattern: Strategy + +from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader + + +class XlsxLoader(DoclingBaseLoader): + """Load text and structural elements from an XLSX file using Docling.""" + + @property + def extension_name(self) -> str: + return "xlsx" diff --git a/graphrag_sdk/test_debug.py b/graphrag_sdk/test_debug.py new file mode 100644 index 00000000..ee4d839f --- /dev/null +++ b/graphrag_sdk/test_debug.py @@ -0,0 +1,28 @@ +import asyncio +from unittest.mock import MagicMock +from graphrag_sdk.core.context import Context +from tests.test_docling_loaders import MockDocxLoader, LabelEnum + +loader = MockDocxLoader() 
+mock_items = [ + (MagicMock(label=LabelEnum.LIST_ITEM, text="List item 1"), 1), + (MagicMock(label=LabelEnum.TABLE, text="Table content"), 1), + (MagicMock(label=LabelEnum.CODE, text="print('hello')"), 1), +] + +mock_doc = MagicMock() +mock_doc.iterate_items.return_value = mock_items + +mock_converter = MagicMock() +mock_converter.convert.return_value.document = mock_doc + +# Force the monkeypatch manually +import sys +sys.modules["docling"] = MagicMock() +sys.modules["docling.datamodel"] = MagicMock() +sys.modules["docling.datamodel.document"] = MagicMock(DocItemLabel=LabelEnum) +sys.modules["docling.document_converter"] = MagicMock(DocumentConverter=MagicMock(return_value=mock_converter)) + +ctx = Context() +result = loader._load_sync("dummy_path") +print(result.elements) diff --git a/graphrag_sdk/test_monkeypatch.py b/graphrag_sdk/test_monkeypatch.py new file mode 100644 index 00000000..4dd8ca2b --- /dev/null +++ b/graphrag_sdk/test_monkeypatch.py @@ -0,0 +1,19 @@ +import sys, asyncio +from unittest.mock import patch, MagicMock + +async def main(): + mock_docling = MagicMock() + # mock_docling.__path__ = [] # Let's see without this + modules = { + 'docling': mock_docling, + 'docling.datamodel': MagicMock(), + 'docling.datamodel.document': MagicMock() + } + with patch.dict('sys.modules', modules): + def worker(): + from docling.datamodel.document import DocItemLabel + return 'success' + res = await asyncio.to_thread(worker) + print(res) + +asyncio.run(main()) diff --git a/graphrag_sdk/test_monkeypatch2.py b/graphrag_sdk/test_monkeypatch2.py new file mode 100644 index 00000000..e9c35d70 --- /dev/null +++ b/graphrag_sdk/test_monkeypatch2.py @@ -0,0 +1,35 @@ +import sys, asyncio, pytest +from unittest.mock import patch, MagicMock + +async def load_sync(): + from docling.datamodel.document import DocItemLabel + return "success" + +async def test_first(): + real_import = __import__ + def _import(name, *args, **kwargs): + if name == "docling.datamodel.document": + 
import sys
from unittest.mock import MagicMock, patch

import pytest

from graphrag_sdk.core.exceptions import LoaderError
from graphrag_sdk.core.models import DocumentOutput, DocumentInfo, DocumentElement
from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader


class MockDocxLoader(DoclingBaseLoader):
    """Minimal concrete loader used to exercise ``DoclingBaseLoader``."""

    @property
    def extension_name(self) -> str:
        return "docx"


class MockLabel:
    """Stand-in for a docling label: compares by ``value`` and is hashable."""

    def __init__(self, val):
        self.value = val

    def __eq__(self, other):
        return isinstance(other, MockLabel) and self.value == other.value

    def __hash__(self):
        # Defined together with __eq__ so the stub stays hashable.
        return hash(self.value)


class LabelEnum:
    """Stand-in for ``DocItemLabel`` exposing the labels the loader reads."""

    SECTION_HEADER = MockLabel("section_header")
    PARAGRAPH = MockLabel("paragraph")
    FOOTNOTE = MockLabel("footnote")
    TITLE = MockLabel("title")
    TEXT = MockLabel("text")
    LIST_ITEM = MockLabel("list_item")
    TABLE = MockLabel("table")
    CODE = MockLabel("code")


def _dummy_docx(tmp_path):
    """Create a placeholder file so the loader's existence check passes."""
    file = tmp_path / "test.docx"
    file.write_text("dummy content")
    return file


def _install_converter(converter):
    """Make the mocked ``DocumentConverter(**kwargs)`` return ``converter``."""
    sys.modules["docling.document_converter"].DocumentConverter = (
        lambda **kwargs: converter
    )


def _install_items(items):
    """Install a converter whose document yields ``items`` from ``iterate_items()``."""
    mock_doc = MagicMock()
    mock_doc.iterate_items.return_value = items
    mock_converter = MagicMock()
    mock_converter.convert.return_value.document = mock_doc
    _install_converter(mock_converter)


class TestDoclingBaseLoader:
    """Tests for DoclingBaseLoader and its derived loaders."""

    @pytest.fixture(autouse=True)
    def mock_docling_modules(self):
        """Mock the docling module namespace in sys.modules."""
        mock_datamodel = MagicMock()
        mock_datamodel.DocItemLabel = LabelEnum

        mock_converter_mod = MagicMock()
        mock_converter_mod.DocumentConverter = MagicMock()

        # Packages need a __path__ so submodule imports resolve.
        mock_docling = MagicMock()
        mock_docling.__path__ = []

        mock_datamodel_pkg = MagicMock()
        mock_datamodel_pkg.__path__ = []

        modules = {
            "docling": mock_docling,
            "docling.datamodel": mock_datamodel_pkg,
            "docling.datamodel.document": mock_datamodel,
            "docling.document_converter": mock_converter_mod,
        }

        # patch.dict restores sys.modules afterwards, which also undoes the
        # per-test DocumentConverter replacements.
        with patch.dict("sys.modules", modules):
            yield

    async def test_import_error_wrapped_in_loader_error(self, ctx, tmp_path):
        """Verify that ImportError when docling is missing is wrapped in LoaderError."""
        file = _dummy_docx(tmp_path)
        loader = MockDocxLoader()

        # Mocking the import to raise ImportError
        real_import = __import__

        def _import(name, *args, **kwargs):
            if name == "docling.document_converter":
                raise ImportError("module not found")
            return real_import(name, *args, **kwargs)

        with patch("builtins.__import__", side_effect=_import):
            with pytest.raises(LoaderError, match=r"DOCX parsing requires 'docling'"):
                await loader.load(str(file), ctx)

    async def test_label_mapping_and_metadata_preservation(self, ctx, tmp_path):
        """Verify mapping of DocItemLabel and preservation of labels in metadata for fallback cases."""
        file = _dummy_docx(tmp_path)

        # Mock Docling's result structure
        mock_item_header = MagicMock()
        mock_item_header.label = LabelEnum.SECTION_HEADER
        mock_item_header.text = "Header 1"

        mock_item_para = MagicMock()
        mock_item_para.label = LabelEnum.PARAGRAPH
        mock_item_para.text = "Paragraph 1"

        mock_item_footnote = MagicMock()
        mock_item_footnote.label = LabelEnum.FOOTNOTE
        mock_item_footnote.text = "Footnote content"

        _install_items(
            [
                (mock_item_header, 1),
                (mock_item_para, 2),
                (mock_item_footnote, 2),
            ]
        )

        loader = MockDocxLoader()
        result = await loader.load(str(file), ctx)

        elements = result.elements
        assert len(elements) == 3
        assert elements[0].type == "header"
        assert elements[0].content == "Header 1"
        assert elements[1].type == "paragraph"
        assert elements[1].content == "Paragraph 1"
        # Check fallback and metadata preservation
        assert elements[2].type == "paragraph"
        assert elements[2].content == "Footnote content"
        assert elements[2].metadata["label"] == LabelEnum.FOOTNOTE.value

    async def test_breadcrumbs_construction(self, ctx, tmp_path):
        """Verify the breadcrumbs are built correctly following the header hierarchy."""
        file = _dummy_docx(tmp_path)

        # Hierarchy: H1 -> H2 -> P -> H2 (new) -> P
        _install_items(
            [
                (MagicMock(label=LabelEnum.SECTION_HEADER, text="Root"), 1),
                (MagicMock(label=LabelEnum.SECTION_HEADER, text="Child"), 2),
                (MagicMock(label=LabelEnum.PARAGRAPH, text="Text 1"), 3),
                (MagicMock(label=LabelEnum.SECTION_HEADER, text="Sibling"), 2),
                (MagicMock(label=LabelEnum.PARAGRAPH, text="Text 2"), 3),
            ]
        )

        loader = MockDocxLoader()
        result = await loader.load(str(file), ctx)

        elements = result.elements
        assert elements[0].breadcrumbs == ["Root"]
        assert elements[1].breadcrumbs == ["Root", "Child"]
        assert elements[2].breadcrumbs == ["Root", "Child"]
        assert elements[3].breadcrumbs == ["Root", "Sibling"]
        assert elements[4].breadcrumbs == ["Root", "Sibling"]

    async def test_file_not_found(self, ctx):
        """Verify that LoaderError is raised when the file does not exist."""
        loader = MockDocxLoader()
        with pytest.raises(LoaderError, match="File not found"):
            await loader.load("/non/existent/path.docx", ctx)

    async def test_docling_conversion_failure(self, ctx, tmp_path):
        """Verify that exceptions during docling conversion are wrapped in LoaderError."""
        file = _dummy_docx(tmp_path)

        mock_converter = MagicMock()
        mock_converter.convert.side_effect = Exception("Conversion failed")
        _install_converter(mock_converter)

        loader = MockDocxLoader()
        with pytest.raises(LoaderError, match="Docling failed to process"):
            await loader.load(str(file), ctx)

    async def test_export_to_markdown_fallback(self, ctx, tmp_path):
        """Verify fallback to export_to_markdown when text attribute is empty."""
        file = _dummy_docx(tmp_path)

        mock_item = MagicMock()
        mock_item.label = LabelEnum.PARAGRAPH
        mock_item.text = ""
        mock_item.export_to_markdown.return_value = "Fallback Markdown Content"

        _install_items([(mock_item, 1)])

        loader = MockDocxLoader()
        result = await loader.load(str(file), ctx)

        assert len(result.elements) == 1
        assert result.elements[0].content == "Fallback Markdown Content"

    async def test_specialized_element_types(self, ctx, tmp_path):
        """Verify mapping of list, table, and code elements."""
        file = _dummy_docx(tmp_path)

        _install_items(
            [
                (MagicMock(label=LabelEnum.LIST_ITEM, text="List item 1"), 1),
                (MagicMock(label=LabelEnum.TABLE, text="Table content"), 1),
                (MagicMock(label=LabelEnum.CODE, text="print('hello')"), 1),
            ]
        )

        loader = MockDocxLoader()
        result = await loader.load(str(file), ctx)

        elements = result.elements
        assert elements[0].type == "list"
        assert elements[1].type == "table"
        assert elements[2].type == "code"
ImportError("module not found") + return __import__(name, *args, **kwargs) + with patch("builtins.__import__", side_effect=_import): + try: + import fake + print("Import succeeded") + except ImportError: + print("ImportError raised") diff --git a/test_mock_import.py b/test_mock_import.py new file mode 100644 index 00000000..681b8260 --- /dev/null +++ b/test_mock_import.py @@ -0,0 +1,15 @@ +import sys +from unittest.mock import MagicMock +class LabelEnum: + LIST_ITEM = "list_item" + +mock_datamodel = MagicMock() +mock_datamodel.DocItemLabel = LabelEnum + +sys.modules["docling"] = MagicMock() +sys.modules["docling.datamodel"] = MagicMock() +sys.modules["docling.datamodel.document"] = mock_datamodel + +from docling.datamodel.document import DocItemLabel +print("DocItemLabel is:", DocItemLabel) +print("DocItemLabel.LIST_ITEM is:", getattr(DocItemLabel, "LIST_ITEM", None)) diff --git a/test_patch_dict.py b/test_patch_dict.py new file mode 100644 index 00000000..44bf9000 --- /dev/null +++ b/test_patch_dict.py @@ -0,0 +1,5 @@ +import sys +from unittest.mock import patch, MagicMock +with patch.dict("sys.modules", {"docling": MagicMock(), "docling.document_converter": MagicMock()}): + import docling.document_converter + print("Success patch.dict!") diff --git a/test_sys_modules.py b/test_sys_modules.py new file mode 100644 index 00000000..613805cf --- /dev/null +++ b/test_sys_modules.py @@ -0,0 +1,6 @@ +import sys +from unittest.mock import MagicMock +sys.modules['docling'] = MagicMock() +sys.modules['docling.document_converter'] = MagicMock() +import docling.document_converter +print("Success!")