Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added graphrag_sdk/dummy_path
Empty file.
8 changes: 8 additions & 0 deletions graphrag_sdk/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,15 @@ litellm = ["litellm>=1.83.0,<2.0"]
openrouter = ["openai>=1.0,<2.0"]
fastcoref = ["fastcoref>=2.0"]
spacy = ["spacy>=3.0"]
docling = ["docling>=2.91.0"]
all = [
"openai>=1.0,<2.0",
"anthropic>=0.20,<1.0",
"cohere>=5.0",
"sentence-transformers>=2.0",
"pypdf>=6.9.2",
"litellm>=1.83.0,<2.0",
"docling>=2.91.0",
]
dev = [
"pytest>=8.0",
Expand Down Expand Up @@ -97,3 +99,9 @@ plugins = ["pydantic.mypy"]
[tool.pytest.ini_options]
asyncio_mode = "auto"
testpaths = ["tests"]
filterwarnings = [
"ignore:builtin type SwigPyObject has no __module__ attribute:DeprecationWarning",
"ignore:.*hf_xet.download_files\\(\\) is deprecated.*:DeprecationWarning",
"ignore:.*`torch.jit.script` is deprecated.*:DeprecationWarning",
"ignore:.*The `resume_download` argument is deprecated.*:UserWarning"
]
15 changes: 15 additions & 0 deletions graphrag_sdk/src/graphrag_sdk/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,14 @@
from graphrag_sdk.ingestion.extraction_strategies.base import ExtractionStrategy
from graphrag_sdk.ingestion.extraction_strategies.graph_extraction import GraphExtraction
from graphrag_sdk.ingestion.loaders.base import LoaderStrategy
from graphrag_sdk.ingestion.loaders.csv_loader import CsvLoader
from graphrag_sdk.ingestion.loaders.docx_loader import DocxLoader
from graphrag_sdk.ingestion.loaders.html_loader import HtmlLoader
from graphrag_sdk.ingestion.loaders.markdown_loader import MarkdownLoader
from graphrag_sdk.ingestion.loaders.pdf_loader import PdfLoader
from graphrag_sdk.ingestion.loaders.pptx_loader import PptxLoader
from graphrag_sdk.ingestion.loaders.text_loader import TextLoader
from graphrag_sdk.ingestion.loaders.xlsx_loader import XlsxLoader
from graphrag_sdk.ingestion.pipeline import IngestionPipeline
from graphrag_sdk.ingestion.resolution_strategies.base import ResolutionStrategy
from graphrag_sdk.ingestion.resolution_strategies.exact_match import ExactMatchResolution
Expand Down Expand Up @@ -637,6 +642,16 @@ def _default_loader_for(source: str) -> LoaderStrategy:
return PdfLoader()
if lower.endswith(".md"):
return MarkdownLoader()
if lower.endswith(".docx"):
return DocxLoader()
if lower.endswith(".xlsx"):
return XlsxLoader()
if lower.endswith(".pptx"):
return PptxLoader()
if lower.endswith(".html") or lower.endswith(".xhtml"):
return HtmlLoader()
if lower.endswith(".csv"):
return CsvLoader()
return TextLoader()

# ── Incremental Updates ─────────────────────────────────────
Expand Down
17 changes: 16 additions & 1 deletion graphrag_sdk/src/graphrag_sdk/ingestion/loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,23 @@
# GraphRAG SDK — Ingestion: Loaders

from graphrag_sdk.ingestion.loaders.base import LoaderStrategy
from graphrag_sdk.ingestion.loaders.csv_loader import CsvLoader
from graphrag_sdk.ingestion.loaders.docx_loader import DocxLoader
from graphrag_sdk.ingestion.loaders.html_loader import HtmlLoader
from graphrag_sdk.ingestion.loaders.markdown_loader import MarkdownLoader
from graphrag_sdk.ingestion.loaders.pdf_loader import PdfLoader
from graphrag_sdk.ingestion.loaders.pptx_loader import PptxLoader
from graphrag_sdk.ingestion.loaders.text_loader import TextLoader
from graphrag_sdk.ingestion.loaders.xlsx_loader import XlsxLoader

__all__ = ["LoaderStrategy", "MarkdownLoader", "PdfLoader", "TextLoader"]
__all__ = [
"LoaderStrategy",
"MarkdownLoader",
"PdfLoader",
"TextLoader",
"DocxLoader",
"XlsxLoader",
"PptxLoader",
"HtmlLoader",
"CsvLoader",
]
12 changes: 12 additions & 0 deletions graphrag_sdk/src/graphrag_sdk/ingestion/loaders/csv_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# GraphRAG SDK — Ingestion: CSV Loader
# Pattern: Strategy

from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader


class CsvLoader(DoclingBaseLoader):
    """Docling-backed loader for CSV files.

    Everything except the format tag is inherited from ``DoclingBaseLoader``.
    """

    @property
    def extension_name(self) -> str:
        """Format tag reported by this loader: ``"csv"``."""
        return "csv"
154 changes: 154 additions & 0 deletions graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docling_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# GraphRAG SDK — Ingestion: Docling Base Loader
# Pattern: Strategy Base Class

from __future__ import annotations

import asyncio
import logging
from pathlib import Path
from typing import Any

from graphrag_sdk.core.context import Context
from graphrag_sdk.core.exceptions import LoaderError
from graphrag_sdk.core.models import DocumentElement, DocumentInfo, DocumentOutput
from graphrag_sdk.ingestion.loaders.base import LoaderStrategy

logger = logging.getLogger(__name__)


class DoclingBaseLoader(LoaderStrategy):
    """Base loader using docling for advanced document parsing.

    The source file is converted with
    ``docling.document_converter.DocumentConverter``; every item that yields
    text becomes a ``DocumentElement``. Title and section-header items
    maintain a breadcrumb trail that is attached to each element.

    Subclasses should define the `extension_name` property.
    """

    def __init__(self, **docling_kwargs: Any) -> None:
        """Initialize the loader.

        Args:
            **docling_kwargs: Arbitrary keyword arguments passed to
                `docling.document_converter.DocumentConverter` (e.g.,
                pipeline_options).
        """
        self.docling_kwargs = docling_kwargs

    @property
    def extension_name(self) -> str:
        """Format tag used in log/error messages and in the output metadata."""
        return "unknown"

    async def load(self, source: str, ctx: Context) -> DocumentOutput:
        """Load ``source`` without blocking the event loop.

        Args:
            source: Path of the file to parse.
            ctx: Context used to log the start of the load.

        Returns:
            The parsed document (full text, document info and elements).

        Raises:
            LoaderError: If the file is missing, docling is not installed, or
                docling fails to convert the file.
        """
        ctx.log(f"Loading {self.extension_name.upper()} file via docling: {source}")
        # Run synchronous docling extraction in a non-blocking thread
        return await asyncio.to_thread(self._load_sync, source)

    @staticmethod
    def _item_content(item: Any) -> Any:
        """Return the item's ``text``, falling back to its markdown export."""
        content = getattr(item, "text", "")
        if not content and hasattr(item, "export_to_markdown"):
            try:
                content = item.export_to_markdown()
            except Exception:
                # Best effort: an item that cannot be exported is skipped by
                # the caller, but the failure is recorded instead of vanishing.
                logger.debug(
                    "docling export_to_markdown() failed for %s item; skipping",
                    type(item).__name__,
                    exc_info=True,
                )
        return content

    @staticmethod
    def _trail(breadcrumbs: list[tuple[int, str]]) -> list[str]:
        """Return a fresh list of the heading titles in ``breadcrumbs``."""
        return [title for _, title in breadcrumbs]

    def _load_sync(self, source: str) -> DocumentOutput:
        """Convert ``source`` with docling and map its items to elements.

        Args:
            source: Path of the file to parse.

        Returns:
            A ``DocumentOutput`` whose ``text`` joins every non-empty item
            with blank lines and whose ``elements`` mirror those items.

        Raises:
            LoaderError: If the file does not exist, docling cannot be
                imported, or the conversion raises.
        """
        path = Path(source)
        if not path.exists():
            raise LoaderError(f"File not found: {source}")

        try:
            from docling.datamodel.document import DocItemLabel
            from docling.document_converter import DocumentConverter
        except ImportError as exc:
            # Chain the original error so the real import failure stays visible.
            raise LoaderError(
                f"{self.extension_name.upper()} parsing requires 'docling'. Install with:\n"
                " pip install graphrag-sdk[docling]"
            ) from exc

        try:
            converter = DocumentConverter(**self.docling_kwargs)
            result = converter.convert(source)
            doc = result.document
        except Exception as exc:
            raise LoaderError(f"Docling failed to process {source}: {exc}") from exc

        heading_labels = (DocItemLabel.TITLE, DocItemLabel.SECTION_HEADER)
        # Non-heading labels that map directly onto an element type. Lookup
        # uses ``in`` on tuples so labels keep matching by equality.
        typed_labels = (
            ((DocItemLabel.PARAGRAPH, DocItemLabel.TEXT), "paragraph"),
            ((DocItemLabel.LIST_ITEM,), "list"),
            ((DocItemLabel.TABLE,), "table"),
            ((DocItemLabel.CODE,), "code"),
        )

        elements: list[DocumentElement] = []
        current_breadcrumbs: list[tuple[int, str]] = []
        full_text_blocks = []

        # Map docling hierarchy to GraphRAG DocumentElements
        for item, level in doc.iterate_items():
            content = self._item_content(item)
            if not content:
                continue

            full_text_blocks.append(content)
            label = getattr(item, "label", None)

            if label in heading_labels:
                # A heading closes every open heading at the same or a deeper
                # level before joining the trail itself.
                while current_breadcrumbs and current_breadcrumbs[-1][0] >= level:
                    current_breadcrumbs.pop()
                current_breadcrumbs.append((level, content))
                elements.append(
                    DocumentElement(
                        type="header",
                        level=level,
                        content=content,
                        breadcrumbs=self._trail(current_breadcrumbs),
                    )
                )
                continue

            element_type = next(
                (kind for labels, kind in typed_labels if label in labels),
                None,
            )
            if element_type is not None:
                elements.append(
                    DocumentElement(
                        type=element_type,
                        content=content,
                        breadcrumbs=self._trail(current_breadcrumbs),
                    )
                )
            else:
                # Default for CAPTION, FOOTNOTE, etc.: keep the text as a
                # paragraph and remember the original label in the metadata.
                elements.append(
                    DocumentElement(
                        type="paragraph",
                        content=content,
                        breadcrumbs=self._trail(current_breadcrumbs),
                        metadata={"label": label.value if hasattr(label, "value") else label},
                    )
                )

        full_text = "\n\n".join(full_text_blocks)

        return DocumentOutput(
            text=full_text,
            document_info=DocumentInfo(
                path=str(path),
                metadata={
                    "size_bytes": path.stat().st_size,
                    "loader": self.extension_name,
                    "suffix": path.suffix,
                },
            ),
            elements=elements,
        )
12 changes: 12 additions & 0 deletions graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docx_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# GraphRAG SDK — Ingestion: DOCX Loader
# Pattern: Strategy

from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader


class DocxLoader(DoclingBaseLoader):
    """Docling-backed loader for DOCX files.

    Everything except the format tag is inherited from ``DoclingBaseLoader``.
    """

    @property
    def extension_name(self) -> str:
        """Format tag reported by this loader: ``"docx"``."""
        return "docx"
12 changes: 12 additions & 0 deletions graphrag_sdk/src/graphrag_sdk/ingestion/loaders/html_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# GraphRAG SDK — Ingestion: HTML Loader
# Pattern: Strategy

from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader


class HtmlLoader(DoclingBaseLoader):
    """Docling-backed loader for HTML/XHTML files.

    Everything except the format tag is inherited from ``DoclingBaseLoader``.
    """

    @property
    def extension_name(self) -> str:
        """Format tag reported by this loader: ``"html"``."""
        return "html"
12 changes: 12 additions & 0 deletions graphrag_sdk/src/graphrag_sdk/ingestion/loaders/pptx_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# GraphRAG SDK — Ingestion: PPTX Loader
# Pattern: Strategy

from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader


class PptxLoader(DoclingBaseLoader):
    """Docling-backed loader for PPTX files.

    Everything except the format tag is inherited from ``DoclingBaseLoader``.
    """

    @property
    def extension_name(self) -> str:
        """Format tag reported by this loader: ``"pptx"``."""
        return "pptx"
12 changes: 12 additions & 0 deletions graphrag_sdk/src/graphrag_sdk/ingestion/loaders/xlsx_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# GraphRAG SDK — Ingestion: XLSX Loader
# Pattern: Strategy

from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader


class XlsxLoader(DoclingBaseLoader):
    """Docling-backed loader for XLSX files.

    Everything except the format tag is inherited from ``DoclingBaseLoader``.
    """

    @property
    def extension_name(self) -> str:
        """Format tag reported by this loader: ``"xlsx"``."""
        return "xlsx"
28 changes: 28 additions & 0 deletions graphrag_sdk/test_debug.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Ad-hoc debug script: run ``MockDocxLoader._load_sync`` against stubbed docling modules."""

# NOTE: ``asyncio`` and ``Context`` are not used below; the imports are kept
# so the script's import surface is unchanged.
import asyncio
import sys
from unittest.mock import MagicMock, patch

from graphrag_sdk.core.context import Context
from tests.test_docling_loaders import LabelEnum, MockDocxLoader


def main() -> None:
    """Feed three mocked docling items through the loader and print the elements."""
    loader = MockDocxLoader()
    mock_items = [
        (MagicMock(label=LabelEnum.LIST_ITEM, text="List item 1"), 1),
        (MagicMock(label=LabelEnum.TABLE, text="Table content"), 1),
        (MagicMock(label=LabelEnum.CODE, text="print('hello')"), 1),
    ]

    mock_doc = MagicMock()
    mock_doc.iterate_items.return_value = mock_items

    mock_converter = MagicMock()
    mock_converter.convert.return_value.document = mock_doc

    stub_modules = {
        "docling": MagicMock(),
        "docling.datamodel": MagicMock(),
        "docling.datamodel.document": MagicMock(DocItemLabel=LabelEnum),
        "docling.document_converter": MagicMock(
            DocumentConverter=MagicMock(return_value=mock_converter)
        ),
    }
    # patch.dict restores sys.modules on exit, so the stubs never outlive the
    # call (the previous version overwrote the entries permanently).
    with patch.dict(sys.modules, stub_modules):
        result = loader._load_sync("dummy_path")
    print(result.elements)


# Guarded so that importing this module (the file name matches pytest's
# ``test_*.py`` pattern) does not execute the script.
if __name__ == "__main__":
    main()
Comment thread
drr00t marked this conversation as resolved.
19 changes: 19 additions & 0 deletions graphrag_sdk/test_monkeypatch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import sys, asyncio
from unittest.mock import patch, MagicMock

async def main():
    """Check that stubbed docling modules are importable from a worker thread.

    Stub modules are placed in ``sys.modules`` for the duration of the call,
    a worker started with ``asyncio.to_thread`` imports from them, and the
    worker's return value is printed.
    """
    # The top-level stub is deliberately left without a ``__path__``
    # attribute: the experiment is whether the import works without it.
    mock_docling = MagicMock()
    modules = {
        'docling': mock_docling,
        'docling.datamodel': MagicMock(),
        'docling.datamodel.document': MagicMock(),
    }
    with patch.dict('sys.modules', modules):
        def worker():
            # The import itself is the thing under test; the name is unused.
            from docling.datamodel.document import DocItemLabel  # noqa: F401
            return 'success'

        res = await asyncio.to_thread(worker)
        print(res)


# Guarded so that importing this module (the file name matches pytest's
# ``test_*.py`` pattern) does not start an event loop as a side effect.
if __name__ == "__main__":
    asyncio.run(main())
35 changes: 35 additions & 0 deletions graphrag_sdk/test_monkeypatch2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import sys, asyncio, pytest
from unittest.mock import patch, MagicMock

def load_sync():
    """Import ``DocItemLabel`` from docling and return ``"success"``.

    This must be a plain function: it is handed to ``asyncio.to_thread``,
    which calls it in a worker thread. As a coroutine function the call would
    only create an un-awaited coroutine object and the import would never run.

    Raises:
        ImportError: If ``docling.datamodel.document`` cannot be imported.
    """
    # The import itself is the thing under test; the name is unused.
    from docling.datamodel.document import DocItemLabel  # noqa: F401
    return "success"
Comment thread
drr00t marked this conversation as resolved.

async def test_first():
    """Run ``load_sync`` in a worker thread while the docling import is blocked.

    ``builtins.__import__`` is patched so that importing
    ``docling.datamodel.document`` raises ``ImportError``. That error is the
    only one suppressed; anything else propagates to the caller.
    """
    real_import = __import__

    def _import(name, *args, **kwargs):
        if name == "docling.datamodel.document":
            raise ImportError("module not found")
        return real_import(name, *args, **kwargs)

    with patch("builtins.__import__", side_effect=_import):
        try:
            await asyncio.to_thread(load_sync)
        except ImportError:
            # Expected: this is the error injected by the patched importer.
            pass

async def test_second():
    """Run ``load_sync`` via ``asyncio.to_thread`` with no import patch and print the result."""
    outcome = await asyncio.to_thread(load_sync)
    print("Second test:", outcome)

async def main():
    """Run ``test_first`` then ``test_second`` with docling stubbed in ``sys.modules``.

    The stubs are installed with ``patch.dict`` and therefore removed again
    when the block exits.
    """
    modules = {
        'docling': MagicMock(),
        'docling.datamodel': MagicMock(),
        'docling.datamodel.document': MagicMock(),
    }
    with patch.dict('sys.modules', modules):
        await test_first()
        await test_second()


# Guarded so that importing this module (the file name matches pytest's
# ``test_*.py`` pattern) does not start an event loop as a side effect.
if __name__ == "__main__":
    asyncio.run(main())
Loading
Loading