FalkorDB · drr00t · May 17, 2026 · May 14, 2026 · May 19, 2026 · May 19, 2026
diff --git a/graphrag_sdk/pyproject.toml b/graphrag_sdk/pyproject.toml
@@ -56,13 +56,15 @@ litellm = ["litellm>=1.83.0,<2.0"]
 openrouter = ["openai>=1.0,<2.0"]
 fastcoref = ["fastcoref>=2.0"]
 spacy = ["spacy>=3.0"]
+docling = ["docling>=2.91.0"]
 all = [
     "openai>=1.0,<2.0",
     "anthropic>=0.20,<1.0",
     "cohere>=5.0",
     "sentence-transformers>=2.0",
     "pypdf>=6.9.2",
     "litellm>=1.83.0,<2.0",
+    "docling>=2.91.0",
 ]
 dev = [
     "pytest>=8.0",

diff --git a/graphrag_sdk/src/graphrag_sdk/api/main.py b/graphrag_sdk/src/graphrag_sdk/api/main.py
@@ -31,13 +31,20 @@
 )
 from graphrag_sdk.core.providers import Embedder, LLMInterface
 from graphrag_sdk.ingestion.chunking_strategies.base import ChunkingStrategy
-from graphrag_sdk.ingestion.chunking_strategies.fixed_size import FixedSizeChunking
+from graphrag_sdk.ingestion.chunking_strategies.sentence_token_cap import (
+    SentenceTokenCapChunking,
+)
 from graphrag_sdk.ingestion.extraction_strategies.base import ExtractionStrategy
 from graphrag_sdk.ingestion.extraction_strategies.graph_extraction import GraphExtraction
 from graphrag_sdk.ingestion.loaders.base import LoaderStrategy
+from graphrag_sdk.ingestion.loaders.csv_loader import CsvLoader
+from graphrag_sdk.ingestion.loaders.docx_loader import DocxLoader
+from graphrag_sdk.ingestion.loaders.html_loader import HtmlLoader
 from graphrag_sdk.ingestion.loaders.markdown_loader import MarkdownLoader
 from graphrag_sdk.ingestion.loaders.pdf_loader import PdfLoader
+from graphrag_sdk.ingestion.loaders.pptx_loader import PptxLoader
 from graphrag_sdk.ingestion.loaders.text_loader import TextLoader
+from graphrag_sdk.ingestion.loaders.xlsx_loader import XlsxLoader
 from graphrag_sdk.ingestion.pipeline import IngestionPipeline
 from graphrag_sdk.ingestion.resolution_strategies.base import ResolutionStrategy
 from graphrag_sdk.ingestion.resolution_strategies.exact_match import ExactMatchResolution
@@ -320,7 +327,10 @@ async def ingest(
 
         Uses sensible defaults for any unspecified strategy:
-        - Loader: auto-detected from file extension (PDF or text)
+        - Loader: auto-detected from file extension (plain text when unrecognized)
-        - Chunker: FixedSizeChunking(chunk_size=1000)
+        - Chunker: SentenceTokenCapChunking(max_tokens=512, overlap_sentences=2)
+          — sentence-aware, never splits entity names at chunk boundaries.
+          Override with ``chunker=FixedSizeChunking(...)`` if you need
+          character-window chunking.
         - Extractor: GraphExtraction with configured LLM
         - Resolver: ExactMatchResolution
 
@@ -529,7 +539,7 @@ async def _ingest_single(
 
         pipeline = IngestionPipeline(
             loader=loader or TextLoader(),
-            chunker=chunker or FixedSizeChunking(),
+            chunker=chunker or SentenceTokenCapChunking(),
             extractor=extractor or self._default_extractor(),
             resolver=resolver or ExactMatchResolution(),
             graph_store=self._graph_store,
@@ -632,6 +642,16 @@ def _default_loader_for(source: str) -> LoaderStrategy:
             return PdfLoader()
         if lower.endswith(".md"):
             return MarkdownLoader()
+        if lower.endswith(".docx"):
+            return DocxLoader()
+        if lower.endswith(".xlsx"):
+            return XlsxLoader()
+        if lower.endswith(".pptx"):
+            return PptxLoader()
+        if lower.endswith(".html") or lower.endswith(".xhtml"):
+            return HtmlLoader()
+        if lower.endswith(".csv"):
+            return CsvLoader()
         return TextLoader()
 
     # ── Incremental Updates ─────────────────────────────────────
@@ -1010,7 +1030,7 @@ async def update(
 
         pipeline = IngestionPipeline(
             loader=loader or TextLoader(),  # unused (text is provided below)
-            chunker=chunker or FixedSizeChunking(),
+            chunker=chunker or SentenceTokenCapChunking(),
             extractor=extractor or self._default_extractor(),
             resolver=resolver or ExactMatchResolution(),
             graph_store=self._graph_store,
@@ -1271,7 +1291,7 @@ async def apply_changes(
                 to ``ingest()`` and ``update()``). Defaults to per-extension
                 auto-selection. ``deleted`` ignores this.
             chunker: Override the chunking strategy for ``added``/``modified``.
-                Defaults to ``FixedSizeChunking``. ``deleted`` ignores this.
+                Defaults to ``SentenceTokenCapChunking``. ``deleted`` ignores this.
             extractor: Override the entity-extraction strategy for
                 ``added``/``modified``. ``deleted`` ignores this.
             resolver: Override the resolution strategy for ``added``/

diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/__init__.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/__init__.py
@@ -1,8 +1,23 @@
 # GraphRAG SDK — Ingestion: Loaders
 
 from graphrag_sdk.ingestion.loaders.base import LoaderStrategy
+from graphrag_sdk.ingestion.loaders.csv_loader import CsvLoader
+from graphrag_sdk.ingestion.loaders.docx_loader import DocxLoader
+from graphrag_sdk.ingestion.loaders.html_loader import HtmlLoader
 from graphrag_sdk.ingestion.loaders.markdown_loader import MarkdownLoader
 from graphrag_sdk.ingestion.loaders.pdf_loader import PdfLoader
+from graphrag_sdk.ingestion.loaders.pptx_loader import PptxLoader
 from graphrag_sdk.ingestion.loaders.text_loader import TextLoader
+from graphrag_sdk.ingestion.loaders.xlsx_loader import XlsxLoader
 
-__all__ = ["LoaderStrategy", "MarkdownLoader", "PdfLoader", "TextLoader"]
+__all__ = [
+    "LoaderStrategy",
+    "MarkdownLoader",
+    "PdfLoader",
+    "TextLoader",
+    "DocxLoader",
+    "XlsxLoader",
+    "PptxLoader",
+    "HtmlLoader",
+    "CsvLoader",
+]
diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/csv_loader.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/csv_loader.py
@@ -0,0 +1,12 @@
+# GraphRAG SDK — Ingestion: CSV Loader
+# Pattern: Strategy
+
+from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader
+
+
+class CsvLoader(DoclingBaseLoader):
+    """Load CSV files with Docling; all parsing is inherited from ``DoclingBaseLoader``."""
+
+    @property
+    def extension_name(self) -> str:
+        return "csv"  # format tag the base class uses in logs and "loader" metadata
diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docling_base.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docling_base.py
@@ -0,0 +1,176 @@
+# GraphRAG SDK — Ingestion: Docling Base Loader
+# Pattern: Strategy Base Class
+
+from __future__ import annotations
+
+import asyncio
+import logging
+from pathlib import Path
+from typing import Any
+
+from graphrag_sdk.core.context import Context
+from graphrag_sdk.core.exceptions import LoaderError
+from graphrag_sdk.core.models import DocumentElement, DocumentInfo, DocumentOutput
+from graphrag_sdk.ingestion.loaders.base import LoaderStrategy
+
+logger = logging.getLogger(__name__)
+
+
+class DoclingBaseLoader(LoaderStrategy):
+    """Base loader using docling for advanced document parsing.
+
+    Each docling item that yields text becomes a ``DocumentElement``
+    (header, paragraph, list, table or code) tagged with the headings
+    that were open when the item was reached.
+
+    Subclasses should define the `extension_name` property.
+    """
+
+    def __init__(self, **docling_kwargs: Any) -> None:
+        """Initialize the loader.
+
+        Args:
+            **docling_kwargs: Arbitrary keyword arguments passed to
+                `docling.document_converter.DocumentConverter` (e.g.,
+                pipeline_options).
+        """
+        self.docling_kwargs = docling_kwargs
+
+    @property
+    def extension_name(self) -> str:
+        """Short format tag used in log/error messages and output metadata."""
+        return "unknown"
+
+    async def load(self, source: str, ctx: Context) -> DocumentOutput:
+        """Parse ``source`` in a worker thread and return the document.
+
+        Args:
+            source: Path of the file to parse.
+            ctx: Context used to log the start of the load.
+
+        Returns:
+            The document's full text, file info and structural elements.
+
+        Raises:
+            LoaderError: If the file is missing, docling is not installed,
+                or docling fails to convert the file.
+        """
+        ctx.log(f"Loading {self.extension_name.upper()} file via docling: {source}")
+        # Run synchronous docling extraction in a non-blocking thread
+        return await asyncio.to_thread(self._load_sync, source)
+
+    @staticmethod
+    def _item_content(item: Any) -> str:
+        """Return the item's text, falling back to its markdown export."""
+        content = getattr(item, "text", "")
+        if content or not hasattr(item, "export_to_markdown"):
+            return content
+        try:
+            return item.export_to_markdown()
+        except Exception:
+            # Best effort: the caller skips items without content, but log
+            # the failure so silently dropped content can be diagnosed.
+            logger.debug(
+                "Docling could not export %s to markdown; skipping it",
+                type(item).__name__,
+                exc_info=True,
+            )
+            return content
+
+    def _load_sync(self, source: str) -> DocumentOutput:
+        """Convert ``source`` with docling and map its items to elements.
+
+        Raises:
+            LoaderError: If the file does not exist, docling is not
+                installed, or the conversion fails.
+        """
+        path = Path(source)
+        if not path.exists():
+            raise LoaderError(f"File not found: {source}")
+
+        # docling is an optional extra, so it is imported only when needed.
+        try:
+            from docling.datamodel.document import DocItemLabel
+            from docling.document_converter import DocumentConverter
+        except ImportError as exc:
+            raise LoaderError(
+                f"{self.extension_name.upper()} parsing requires 'docling'. Install with:\n"
+                "  pip install graphrag-sdk[docling]"
+            ) from exc
+
+        try:
+            converter = DocumentConverter(**self.docling_kwargs)
+            result = converter.convert(source)
+            doc = result.document
+        except Exception as exc:
+            raise LoaderError(f"Docling failed to process {source}: {exc}") from exc
+
+        header_labels = (DocItemLabel.TITLE, DocItemLabel.SECTION_HEADER)
+        # Labels with a dedicated element type; any other label (CAPTION,
+        # FOOTNOTE, etc.) becomes a paragraph tagged with the label.
+        body_types = {
+            DocItemLabel.PARAGRAPH: "paragraph",
+            DocItemLabel.TEXT: "paragraph",
+            DocItemLabel.LIST_ITEM: "list",
+            DocItemLabel.TABLE: "table",
+            DocItemLabel.CODE: "code",
+        }
+
+        elements: list[DocumentElement] = []
+        current_breadcrumbs: list[tuple[int, str]] = []
+        full_text_blocks: list[str] = []
+
+        # Map docling hierarchy to GraphRAG DocumentElements
+        # NOTE(review): `level` is whatever iterate_items() yields —
+        # presumably tree depth rather than heading rank; confirm.
+        for item, level in doc.iterate_items():
+            content = self._item_content(item)
+            if not content:
+                continue
+
+            full_text_blocks.append(content)
+            label = getattr(item, "label", None)
+            is_header = label in header_labels
+
+            if is_header:
+                # A heading closes every open heading at the same or a
+                # deeper level, then joins the breadcrumb trail itself.
+                while current_breadcrumbs and current_breadcrumbs[-1][0] >= level:
+                    current_breadcrumbs.pop()
+                current_breadcrumbs.append((level, content))
+
+            breadcrumbs = [title for _, title in current_breadcrumbs]
+            if is_header:
+                element = DocumentElement(
+                    type="header",
+                    level=level,
+                    content=content,
+                    breadcrumbs=breadcrumbs,
+                )
+            elif label in body_types:
+                element = DocumentElement(
+                    type=body_types[label],
+                    content=content,
+                    breadcrumbs=breadcrumbs,
+                )
+            else:
+                element = DocumentElement(
+                    type="paragraph",
+                    content=content,
+                    breadcrumbs=breadcrumbs,
+                    metadata={"label": getattr(label, "value", label)},
+                )
+            elements.append(element)
+
+        return DocumentOutput(
+            text="\n\n".join(full_text_blocks),
+            document_info=DocumentInfo(
+                path=str(path),
+                metadata={
+                    "size_bytes": path.stat().st_size,
+                    "loader": self.extension_name,
+                    "suffix": path.suffix,
+                },
+            ),
+            elements=elements,
+        )
diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docx_loader.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/docx_loader.py
@@ -0,0 +1,12 @@
+# GraphRAG SDK — Ingestion: DOCX Loader
+# Pattern: Strategy
+
+from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader
+
+
+class DocxLoader(DoclingBaseLoader):
+    """Load DOCX files with Docling; all parsing is inherited from ``DoclingBaseLoader``."""
+
+    @property
+    def extension_name(self) -> str:
+        return "docx"  # format tag the base class uses in logs and "loader" metadata
diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/html_loader.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/html_loader.py
@@ -0,0 +1,12 @@
+# GraphRAG SDK — Ingestion: HTML Loader
+# Pattern: Strategy
+
+from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader
+
+
+class HtmlLoader(DoclingBaseLoader):
+    """Load HTML/XHTML files with Docling; parsing is inherited from ``DoclingBaseLoader``."""
+
+    @property
+    def extension_name(self) -> str:
+        return "html"  # format tag the base class uses in logs and "loader" metadata
diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/pptx_loader.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/pptx_loader.py
@@ -0,0 +1,12 @@
+# GraphRAG SDK — Ingestion: PPTX Loader
+# Pattern: Strategy
+
+from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader
+
+
+class PptxLoader(DoclingBaseLoader):
+    """Load PPTX files with Docling; all parsing is inherited from ``DoclingBaseLoader``."""
+
+    @property
+    def extension_name(self) -> str:
+        return "pptx"  # format tag the base class uses in logs and "loader" metadata
diff --git a/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/xlsx_loader.py b/graphrag_sdk/src/graphrag_sdk/ingestion/loaders/xlsx_loader.py
@@ -0,0 +1,12 @@
+# GraphRAG SDK — Ingestion: XLSX Loader
+# Pattern: Strategy
+
+from graphrag_sdk.ingestion.loaders.docling_base import DoclingBaseLoader
+
+
+class XlsxLoader(DoclingBaseLoader):
+    """Load XLSX files with Docling; all parsing is inherited from ``DoclingBaseLoader``."""
+
+    @property
+    def extension_name(self) -> str:
+        return "xlsx"  # format tag the base class uses in logs and "loader" metadata