Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions graphrag_sdk/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,15 @@ litellm = ["litellm>=1.83.0,<2.0"]
openrouter = ["openai>=1.0,<2.0"]
fastcoref = ["fastcoref>=2.0"]
spacy = ["spacy>=3.0"]
docling = ["docling>=2.91.0"]
all = [
"openai>=1.0,<2.0",
"anthropic>=0.20,<1.0",
"cohere>=5.0",
"sentence-transformers>=2.0",
"pypdf>=6.9.2",
"litellm>=1.83.0,<2.0",
"docling>=2.91.0",
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
]
dev = [
"pytest>=8.0",
Expand Down
30 changes: 25 additions & 5 deletions graphrag_sdk/src/graphrag_sdk/api/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,20 @@
)
from graphrag_sdk.core.providers import Embedder, LLMInterface
from graphrag_sdk.ingestion.chunking_strategies.base import ChunkingStrategy
from graphrag_sdk.ingestion.chunking_strategies.fixed_size import FixedSizeChunking
from graphrag_sdk.ingestion.chunking_strategies.sentence_token_cap import (
SentenceTokenCapChunking,
)
from graphrag_sdk.ingestion.extraction_strategies.base import ExtractionStrategy
from graphrag_sdk.ingestion.extraction_strategies.graph_extraction import GraphExtraction
from graphrag_sdk.ingestion.loaders.base import LoaderStrategy
from graphrag_sdk.ingestion.loaders.csv_loader import CsvLoader
from graphrag_sdk.ingestion.loaders.docx_loader import DocxLoader
from graphrag_sdk.ingestion.loaders.html_loader import HtmlLoader
from graphrag_sdk.ingestion.loaders.markdown_loader import MarkdownLoader
from graphrag_sdk.ingestion.loaders.pdf_loader import PdfLoader
from graphrag_sdk.ingestion.loaders.pptx_loader import PptxLoader
from graphrag_sdk.ingestion.loaders.text_loader import TextLoader
from graphrag_sdk.ingestion.loaders.xlsx_loader import XlsxLoader
from graphrag_sdk.ingestion.pipeline import IngestionPipeline
from graphrag_sdk.ingestion.resolution_strategies.base import ResolutionStrategy
from graphrag_sdk.ingestion.resolution_strategies.exact_match import ExactMatchResolution
Expand Down Expand Up @@ -320,7 +327,10 @@ async def ingest(

Uses sensible defaults for any unspecified strategy:
- Loader: auto-detected from file extension (PDF, Markdown, DOCX, XLSX,
PPTX, HTML/XHTML, CSV; anything else falls back to plain text)
- Chunker: FixedSizeChunking(chunk_size=1000)
- Chunker: SentenceTokenCapChunking(max_tokens=512, overlap_sentences=2)
— sentence-aware, never splits entity names at chunk boundaries.
Override with ``chunker=FixedSizeChunking(...)`` if you need
character-window chunking.
Comment thread
drr00t marked this conversation as resolved.
- Extractor: GraphExtraction with configured LLM
- Resolver: ExactMatchResolution

Expand Down Expand Up @@ -529,7 +539,7 @@ async def _ingest_single(

pipeline = IngestionPipeline(
loader=loader or TextLoader(),
chunker=chunker or FixedSizeChunking(),
chunker=chunker or SentenceTokenCapChunking(),
extractor=extractor or self._default_extractor(),
resolver=resolver or ExactMatchResolution(),
graph_store=self._graph_store,
Expand Down Expand Up @@ -632,6 +642,16 @@ def _default_loader_for(source: str) -> LoaderStrategy:
return PdfLoader()
if lower.endswith(".md"):
return MarkdownLoader()
if lower.endswith(".docx"):
return DocxLoader()
if lower.endswith(".xlsx"):
return XlsxLoader()
if lower.endswith(".pptx"):
return PptxLoader()
if lower.endswith(".html") or lower.endswith(".xhtml"):
return HtmlLoader()
if lower.endswith(".csv"):
return CsvLoader()
return TextLoader()

# ── Incremental Updates ─────────────────────────────────────
Expand Down Expand Up @@ -1010,7 +1030,7 @@ async def update(

pipeline = IngestionPipeline(
loader=loader or TextLoader(), # unused (text is provided below)
chunker=chunker or FixedSizeChunking(),
chunker=chunker or SentenceTokenCapChunking(),
extractor=extractor or self._default_extractor(),
resolver=resolver or ExactMatchResolution(),
graph_store=self._graph_store,
Expand Down Expand Up @@ -1271,7 +1291,7 @@ async def apply_changes(
to ``ingest()`` and ``update()``). Defaults to per-extension
auto-selection. ``deleted`` ignores this.
chunker: Override the chunking strategy for ``added``/``modified``.
Defaults to ``FixedSizeChunking``. ``deleted`` ignores this.
Defaults to ``SentenceTokenCapChunking``. ``deleted`` ignores this.
extractor: Override the entity-extraction strategy for
``added``/``modified``. ``deleted`` ignores this.
resolver: Override the resolution strategy for ``added``/
Expand Down
17 changes: 16 additions & 1 deletion graphrag_sdk/src/graphrag_sdk/ingestion/loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,23 @@
# GraphRAG SDK — Ingestion: Loaders

from graphrag_sdk.ingestion.loaders.base import LoaderStrategy
from graphrag_sdk.ingestion.loaders.csv_loader import CsvLoader
from graphrag_sdk.ingestion.loaders.docx_loader import DocxLoader
from graphrag_sdk.ingestion.loaders.html_loader import HtmlLoader
from graphrag_sdk.ingestion.loaders.markdown_loader import MarkdownLoader
from graphrag_sdk.ingestion.loaders.pdf_loader import PdfLoader
from graphrag_sdk.ingestion.loaders.pptx_loader import PptxLoader
from graphrag_sdk.ingestion.loaders.text_loader import TextLoader
from graphrag_sdk.ingestion.loaders.xlsx_loader import XlsxLoader

__all__ = ["LoaderStrategy", "MarkdownLoader", "PdfLoader", "TextLoader"]
# Public API of the loaders package: the base strategy plus every concrete
# loader class imported above.
__all__ = [
"LoaderStrategy",
"MarkdownLoader",
"PdfLoader",
"TextLoader",
"DocxLoader",
"XlsxLoader",
"PptxLoader",
"HtmlLoader",
"CsvLoader",
]
12 changes: 12 additions & 0 deletions graphrag_sdk/src/graphrag_sdk/ingestion/loaders/csv_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# GraphRAG SDK — Ingestion: CSV Loader
# Pattern: Strategy

from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader


class CsvLoader(DoclingBaseLoader):
    """Docling-backed loader for CSV files.

    Defines nothing beyond the format tag; everything else is inherited
    from ``DoclingBaseLoader``.
    """

    @property
    def extension_name(self) -> str:
        """Return the short format tag identifying this loader."""
        return "csv"
154 changes: 154 additions & 0 deletions graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docling_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# GraphRAG SDK — Ingestion: Docling Base Loader
# Pattern: Strategy Base Class

from __future__ import annotations

import asyncio
import logging
from pathlib import Path
from typing import Any

from graphrag_sdk.core.context import Context
from graphrag_sdk.core.exceptions import LoaderError
from graphrag_sdk.core.models import DocumentElement, DocumentInfo, DocumentOutput
from graphrag_sdk.ingestion.loaders.base import LoaderStrategy

logger = logging.getLogger(__name__)


class DoclingBaseLoader(LoaderStrategy):
    """Base loader that parses documents with docling.

    The converted docling document is walked item by item and mapped onto
    ``DocumentElement`` objects (headers, paragraphs, lists, tables, code),
    each tagged with the header breadcrumbs that were active when the item
    was encountered.

    Subclasses should define the `extension_name` property.
    """

    def __init__(self, **docling_kwargs: Any) -> None:
        """Initialize the loader.

        Args:
            **docling_kwargs: Arbitrary keyword arguments passed to
                `docling.document_converter.DocumentConverter` (e.g.,
                pipeline_options).
        """
        self.docling_kwargs = docling_kwargs

    @property
    def extension_name(self) -> str:
        """Short format tag used in log messages and output metadata."""
        return "unknown"

    async def load(self, source: str, ctx: Context) -> DocumentOutput:
        """Load ``source`` without blocking the event loop.

        Args:
            source: Path of the file to parse.
            ctx: Context used to log the start of the load.

        Returns:
            The ``DocumentOutput`` produced by ``_load_sync``.

        Raises:
            LoaderError: Propagated from ``_load_sync``.
        """
        ctx.log(f"Loading {self.extension_name.upper()} file via docling: {source}")
        # Run synchronous docling extraction in a non-blocking thread
        return await asyncio.to_thread(self._load_sync, source)

    @staticmethod
    def _item_content(item: Any) -> str:
        """Return the item's text, falling back to its markdown export.

        The fallback is best-effort: if ``export_to_markdown`` raises, the
        failure is logged at debug level and the (empty) text is returned so
        the caller can skip the item.
        """
        content = getattr(item, "text", "")
        if not content and hasattr(item, "export_to_markdown"):
            try:
                content = item.export_to_markdown()
            except Exception:
                # Deliberately non-fatal: one unexportable item must not
                # abort the whole document, but leave a trace for debugging.
                logger.debug(
                    "export_to_markdown failed for %s item; skipping its content",
                    type(item).__name__,
                    exc_info=True,
                )
        return content

    def _load_sync(self, source: str) -> DocumentOutput:
        """Convert ``source`` with docling and map it to a ``DocumentOutput``.

        Args:
            source: Path of the file to parse.

        Returns:
            A ``DocumentOutput`` whose ``text`` is every non-empty item
            joined by blank lines and whose ``elements`` mirror those items.

        Raises:
            LoaderError: If the file does not exist, docling is not
                installed, or docling fails to convert the file.
        """
        path = Path(source)
        if not path.exists():
            raise LoaderError(f"File not found: {source}")

        # docling is an optional extra, so import it lazily.
        try:
            from docling.datamodel.document import DocItemLabel
            from docling.document_converter import DocumentConverter
        except ImportError as exc:
            raise LoaderError(
                f"{self.extension_name.upper()} parsing requires 'docling'. Install with:\n"
                " pip install graphrag-sdk[docling]"
            ) from exc

        try:
            converter = DocumentConverter(**self.docling_kwargs)
            result = converter.convert(source)
            doc = result.document
        except Exception as exc:
            raise LoaderError(f"Docling failed to process {source}: {exc}") from exc

        header_labels = (DocItemLabel.TITLE, DocItemLabel.SECTION_HEADER)
        # Non-header labels that map directly onto an element type; any other
        # label falls through to a paragraph that records the label.
        body_types = (
            ((DocItemLabel.PARAGRAPH, DocItemLabel.TEXT), "paragraph"),
            ((DocItemLabel.LIST_ITEM,), "list"),
            ((DocItemLabel.TABLE,), "table"),
            ((DocItemLabel.CODE,), "code"),
        )

        elements: list[DocumentElement] = []
        current_breadcrumbs: list[tuple[int, str]] = []
        full_text_blocks: list[str] = []

        # Map docling hierarchy to GraphRAG DocumentElements
        for item, level in doc.iterate_items():
            content = self._item_content(item)
            if not content:
                continue

            full_text_blocks.append(content)
            label = getattr(item, "label", None)

            if label in header_labels:
                # Drop breadcrumbs at the same or a deeper level before
                # pushing this header.
                # NOTE(review): `level` is the value yielded by
                # iterate_items(); confirm it reflects heading depth rather
                # than tree depth, otherwise breadcrumbs will never nest.
                while current_breadcrumbs and current_breadcrumbs[-1][0] >= level:
                    current_breadcrumbs.pop()
                current_breadcrumbs.append((level, content))

                elements.append(
                    DocumentElement(
                        type="header",
                        level=level,
                        content=content,
                        breadcrumbs=[b[1] for b in current_breadcrumbs],
                    )
                )
                continue

            breadcrumbs = [b[1] for b in current_breadcrumbs]
            element_type = next(
                (kind for labels, kind in body_types if label in labels), None
            )
            if element_type is not None:
                elements.append(
                    DocumentElement(
                        type=element_type,
                        content=content,
                        breadcrumbs=breadcrumbs,
                    )
                )
            else:
                # Default for CAPTION, FOOTNOTE, etc.
                elements.append(
                    DocumentElement(
                        type="paragraph",
                        content=content,
                        breadcrumbs=breadcrumbs,
                        metadata={"label": label.value if hasattr(label, "value") else label},
                    )
                )

        full_text = "\n\n".join(full_text_blocks)

        return DocumentOutput(
            text=full_text,
            document_info=DocumentInfo(
                path=str(path),
                metadata={
                    "size_bytes": path.stat().st_size,
                    "loader": self.extension_name,
                    "suffix": path.suffix,
                },
            ),
            elements=elements,
        )
12 changes: 12 additions & 0 deletions graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docx_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# GraphRAG SDK — Ingestion: DOCX Loader
# Pattern: Strategy

from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader


class DocxLoader(DoclingBaseLoader):
    """Docling-backed loader for DOCX files.

    Defines nothing beyond the format tag; everything else is inherited
    from ``DoclingBaseLoader``.
    """

    @property
    def extension_name(self) -> str:
        """Return the short format tag identifying this loader."""
        return "docx"
12 changes: 12 additions & 0 deletions graphrag_sdk/src/graphrag_sdk/ingestion/loaders/html_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# GraphRAG SDK — Ingestion: HTML Loader
# Pattern: Strategy

from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader


class HtmlLoader(DoclingBaseLoader):
    """Docling-backed loader for HTML/XHTML files.

    Defines nothing beyond the format tag; everything else is inherited
    from ``DoclingBaseLoader``.
    """

    @property
    def extension_name(self) -> str:
        """Return the short format tag identifying this loader."""
        return "html"
12 changes: 12 additions & 0 deletions graphrag_sdk/src/graphrag_sdk/ingestion/loaders/pptx_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# GraphRAG SDK — Ingestion: PPTX Loader
# Pattern: Strategy

from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader


class PptxLoader(DoclingBaseLoader):
    """Docling-backed loader for PPTX files.

    Defines nothing beyond the format tag; everything else is inherited
    from ``DoclingBaseLoader``.
    """

    @property
    def extension_name(self) -> str:
        """Return the short format tag identifying this loader."""
        return "pptx"
12 changes: 12 additions & 0 deletions graphrag_sdk/src/graphrag_sdk/ingestion/loaders/xlsx_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# GraphRAG SDK — Ingestion: XLSX Loader
# Pattern: Strategy

from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader


class XlsxLoader(DoclingBaseLoader):
    """Docling-backed loader for XLSX files.

    Defines nothing beyond the format tag; everything else is inherited
    from ``DoclingBaseLoader``.
    """

    @property
    def extension_name(self) -> str:
        """Return the short format tag identifying this loader."""
        return "xlsx"
Loading
Loading