diff --git a/graphrag_sdk/src/graphrag_sdk/api/main.py b/graphrag_sdk/src/graphrag_sdk/api/main.py index 9b2486c..684c7a2 100644 --- a/graphrag_sdk/src/graphrag_sdk/api/main.py +++ b/graphrag_sdk/src/graphrag_sdk/api/main.py @@ -31,7 +31,9 @@ ) from graphrag_sdk.core.providers import Embedder, LLMInterface from graphrag_sdk.ingestion.chunking_strategies.base import ChunkingStrategy -from graphrag_sdk.ingestion.chunking_strategies.fixed_size import FixedSizeChunking +from graphrag_sdk.ingestion.chunking_strategies.sentence_token_cap import ( + SentenceTokenCapChunking, +) from graphrag_sdk.ingestion.extraction_strategies.base import ExtractionStrategy from graphrag_sdk.ingestion.extraction_strategies.graph_extraction import GraphExtraction from graphrag_sdk.ingestion.loaders.base import LoaderStrategy @@ -320,7 +322,10 @@ async def ingest( Uses sensible defaults for any unspecified strategy: - Loader: auto-detected from file extension (PDF or text) - - Chunker: FixedSizeChunking(chunk_size=1000) + - Chunker: SentenceTokenCapChunking(max_tokens=512, overlap_sentences=2) + — sentence-aware, never splits entity names at chunk boundaries. + Override with ``chunker=FixedSizeChunking(...)`` if you need + character-window chunking. - Extractor: GraphExtraction with configured LLM - Resolver: ExactMatchResolution @@ -529,7 +534,7 @@ async def _ingest_single( pipeline = IngestionPipeline( loader=loader or TextLoader(), - chunker=chunker or FixedSizeChunking(), + chunker=chunker or SentenceTokenCapChunking(), extractor=extractor or self._default_extractor(), resolver=resolver or ExactMatchResolution(), graph_store=self._graph_store, @@ -1010,7 +1015,7 @@ async def update( pipeline = IngestionPipeline( loader=loader or TextLoader(), # unused (text is provided below) - chunker=chunker or FixedSizeChunking(), + chunker=chunker or SentenceTokenCapChunking(), extractor=extractor or self._default_extractor(), resolver=resolver or ExactMatchResolution(), graph_store=self._graph_store, @@ -1271,7 +1276,7 @@ async def apply_changes( to ``ingest()`` and ``update()``). Defaults to per-extension auto-selection. ``deleted`` ignores this. chunker: Override the chunking strategy for ``added``/``modified``. - Defaults to ``FixedSizeChunking``. ``deleted`` ignores this. + Defaults to ``SentenceTokenCapChunking``. ``deleted`` ignores this. extractor: Override the entity-extraction strategy for ``added``/``modified``. ``deleted`` ignores this. resolver: Override the resolution strategy for ``added``/