diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/README.md b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/README.md index 095011b77f1..5aa6945597e 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/README.md +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.MarkItDown/README.md @@ -30,7 +30,12 @@ using Microsoft.Extensions.DataIngestion; IngestionDocumentReader reader = new MarkItDownReader(new FileInfo(@"pathToMarkItDown.exe"), extractImages: true); -using IngestionPipeline pipeline = new(reader, CreateChunker(), CreateWriter()); +using IngestionPipeline pipeline = new(CreateChunker(), CreateWriter()); + +await foreach (IngestionResult result in pipeline.ProcessAsync(reader, directory, "*.pdf")) +{ + Console.WriteLine($"Processed '{result.DocumentId}'. Succeeded: {result.Succeeded}"); +} ``` ### Creating a MarkItDownMcpReader for Data Ingestion (MCP Server) @@ -44,7 +49,12 @@ using Microsoft.Extensions.DataIngestion; IngestionDocumentReader reader = new MarkItDownMcpReader(new Uri("http://localhost:3001/mcp")); -using IngestionPipeline pipeline = new(reader, CreateChunker(), CreateWriter()); +using IngestionPipeline pipeline = new(CreateChunker(), CreateWriter()); + +await foreach (IngestionResult result in pipeline.ProcessAsync(reader, directory, "*.*")) +{ + Console.WriteLine($"Processed '{result.DocumentId}'. Succeeded: {result.Succeeded}"); +} ``` The MarkItDown MCP server can be run using Docker: diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/README.md b/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/README.md index c6a2328699c..eca53973de5 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/README.md +++ b/src/Libraries/Microsoft.Extensions.DataIngestion.Markdig/README.md @@ -27,7 +27,12 @@ using Microsoft.Extensions.DataIngestion; IngestionDocumentReader reader = new MarkdownReader(); -using IngestionPipeline pipeline = new(reader, CreateChunker(), CreateWriter()); +using IngestionPipeline pipeline = new(CreateChunker(), CreateWriter()); + +await foreach (IngestionResult result in pipeline.ProcessAsync(reader, directory, "*.md")) +{ + Console.WriteLine($"Processed '{result.DocumentId}'. Succeeded: {result.Succeeded}"); +} ``` ## Feedback & Contributing diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/CHANGELOG.md b/src/Libraries/Microsoft.Extensions.DataIngestion/CHANGELOG.md index a88260e298f..6cf3ea01560 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/CHANGELOG.md +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/CHANGELOG.md @@ -3,6 +3,7 @@ ## 10.1.0-preview.1 - Introduced `SectionChunker` class for treating each document section as a separate entity (https://github.com/dotnet/extensions/pull/7015) +- Extended `IngestionPipeline` with a new `ProcessAsync(IAsyncEnumerable)` overload that enables processing documents without a file system reader. The `IngestionDocumentReader` has been moved from the constructor to a parameter on the file-system-oriented `ProcessAsync` overloads. ## 10.0.0-preview.1 diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/DiagnosticsConstants.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/DiagnosticsConstants.cs index 4251bef6ae3..884628d569a 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/DiagnosticsConstants.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/DiagnosticsConstants.cs @@ -22,6 +22,16 @@ internal static class ProcessFiles internal const string FileCountTagName = "rag.file.count"; } + internal static class ProcessDocuments + { + internal const string ActivityName = "ProcessDocuments"; + } + + internal static class ProcessDocument + { + internal const string ActivityName = "ProcessDocument"; + } + internal static class ProcessSource { internal const string DocumentIdTagName = "rag.document.id"; diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs b/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs index 1eeb94058ee..806843db03e 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/IngestionPipeline.cs @@ -24,7 +24,6 @@ namespace Microsoft.Extensions.DataIngestion; /// The type of the chunk content. public sealed class IngestionPipeline : IDisposable { - private readonly IngestionDocumentReader _reader; private readonly IngestionChunker _chunker; private readonly IngestionChunkWriter _writer; private readonly ActivitySource _activitySource; @@ -33,19 +32,16 @@ public sealed class IngestionPipeline : IDisposable /// /// Initializes a new instance of the class. /// - /// The reader for ingestion documents. /// The chunker to split documents into chunks. /// The writer for processing chunks. /// The options for the ingestion pipeline. /// The logger factory for creating loggers. public IngestionPipeline( - IngestionDocumentReader reader, IngestionChunker chunker, IngestionChunkWriter writer, IngestionPipelineOptions? options = default, ILoggerFactory? loggerFactory = default) { - _reader = Throw.IfNull(reader); _chunker = Throw.IfNull(chunker); _writer = Throw.IfNull(writer); _activitySource = new((options ?? new()).ActivitySourceName); @@ -69,17 +65,58 @@ public void Dispose() /// public IList> ChunkProcessors { get; } = []; + /// + /// Processes the specified documents. + /// + /// The documents to process. + /// The cancellation token for the operation. + /// An async enumerable of ingestion results. + public async IAsyncEnumerable ProcessAsync(IAsyncEnumerable documents, [EnumeratorCancellation] CancellationToken cancellationToken = default) + { + Throw.IfNull(documents); + + using (Activity? rootActivity = _activitySource.StartActivity(ProcessDocuments.ActivityName)) + { + await foreach (IngestionDocument document in documents.WithCancellation(cancellationToken).ConfigureAwait(false)) + { + using (Activity? processDocumentActivity = _activitySource.StartActivity(ProcessDocument.ActivityName, ActivityKind.Internal, parentContext: rootActivity?.Context ?? default)) + { + processDocumentActivity?.SetTag(ProcessSource.DocumentIdTagName, document.Identifier); + _logger?.ReadDocument(document.Identifier); + + IngestionDocument? processed = null; + Exception? failure = null; + try + { + processed = await IngestAsync(document, processDocumentActivity, cancellationToken).ConfigureAwait(false); + } + catch (Exception ex) + { + TraceException(processDocumentActivity, ex); + _logger?.IngestingFailed(ex, document.Identifier); + + failure = ex; + } + + yield return new IngestionResult(document.Identifier, processed, failure); + } + } + } + } + /// /// Processes all files in the specified directory that match the given search pattern and option. /// + /// The reader to use for reading documents from files. /// The directory to process. /// The search pattern for file selection. /// The search option for directory traversal. /// The cancellation token for the operation. /// A task representing the asynchronous operation. - public async IAsyncEnumerable ProcessAsync(DirectoryInfo directory, string searchPattern = "*.*", + public async IAsyncEnumerable ProcessAsync(IngestionDocumentReader reader, DirectoryInfo directory, string searchPattern = "*.*", SearchOption searchOption = SearchOption.TopDirectoryOnly, [EnumeratorCancellation] CancellationToken cancellationToken = default) { + Throw.IfNull(reader); Throw.IfNull(directory); Throw.IfNullOrEmpty(searchPattern); Throw.IfOutOfRange((int)searchOption, (int)SearchOption.TopDirectoryOnly, (int)SearchOption.AllDirectories); @@ -91,7 +128,7 @@ public async IAsyncEnumerable ProcessAsync(DirectoryInfo direct .SetTag(ProcessDirectory.SearchOptionTagName, searchOption.ToString()); _logger?.ProcessingDirectory(directory.FullName, searchPattern, searchOption); - await foreach (var ingestionResult in ProcessAsync(directory.EnumerateFiles(searchPattern, searchOption), rootActivity, cancellationToken).ConfigureAwait(false)) + await foreach (IngestionResult ingestionResult in ProcessFilesAsync(reader, directory.EnumerateFiles(searchPattern, searchOption), rootActivity, cancellationToken).ConfigureAwait(false)) { yield return ingestionResult; } @@ -101,16 +138,18 @@ public async IAsyncEnumerable ProcessAsync(DirectoryInfo direct /// /// Processes the specified files. /// + /// The reader to use for reading documents from files. /// The collection of files to process. /// The cancellation token for the operation. /// A task representing the asynchronous operation. - public async IAsyncEnumerable ProcessAsync(IEnumerable files, [EnumeratorCancellation] CancellationToken cancellationToken = default) + public async IAsyncEnumerable ProcessAsync(IngestionDocumentReader reader, IEnumerable files, [EnumeratorCancellation] CancellationToken cancellationToken = default) { + Throw.IfNull(reader); Throw.IfNull(files); using (Activity? rootActivity = _activitySource.StartActivity(ProcessFiles.ActivityName)) { - await foreach (var ingestionResult in ProcessAsync(files, rootActivity, cancellationToken).ConfigureAwait(false)) + await foreach (IngestionResult ingestionResult in ProcessFilesAsync(reader, files, rootActivity, cancellationToken).ConfigureAwait(false)) { yield return ingestionResult; } @@ -125,7 +164,7 @@ private static void TraceException(Activity? activity, Exception ex) .SetStatus(ActivityStatusCode.Error, ex.Message); } - private async IAsyncEnumerable ProcessAsync(IEnumerable files, Activity? rootActivity, + private async IAsyncEnumerable ProcessFilesAsync(IngestionDocumentReader reader, IEnumerable files, Activity? rootActivity, [EnumeratorCancellation] CancellationToken cancellationToken) { #if NET @@ -143,13 +182,13 @@ private async IAsyncEnumerable ProcessAsync(IEnumerable IngestAsync(IngestionDocument document, Ac } IAsyncEnumerable> chunks = _chunker.ProcessAsync(document, cancellationToken); - foreach (var processor in ChunkProcessors) + foreach (IngestionChunkProcessor processor in ChunkProcessors) { chunks = processor.ProcessAsync(chunks, cancellationToken); } diff --git a/src/Libraries/Microsoft.Extensions.DataIngestion/README.md b/src/Libraries/Microsoft.Extensions.DataIngestion/README.md index 030ac8da43b..9441a5e464f 100644 --- a/src/Libraries/Microsoft.Extensions.DataIngestion/README.md +++ b/src/Libraries/Microsoft.Extensions.DataIngestion/README.md @@ -104,6 +104,43 @@ VectorStoreCollection> collection = using VectorStoreWriter> writer = new(collection); ``` +## Using the ingestion pipeline + +The `IngestionPipeline` orchestrates document reading, chunking, optional processing, and writing. It can accept documents directly or read them from the file system using an `IngestionDocumentReader`. + +### Processing documents from the file system + +Create a pipeline, then call `ProcessAsync` with an `IngestionDocumentReader` and a directory or list of files: + +```csharp +IngestionDocumentReader reader = new MarkdownReader(); + +using IngestionPipeline pipeline = new(CreateChunker(), CreateWriter()); + +await foreach (IngestionResult result in pipeline.ProcessAsync(reader, new DirectoryInfo("docs"), "*.md")) +{ + Console.WriteLine($"Processed '{result.DocumentId}'. Succeeded: {result.Succeeded}"); +} +``` + +### Processing documents without a reader + +You can also supply `IngestionDocument` instances directly, without any file-system dependency: + +```csharp +using IngestionPipeline pipeline = new(CreateChunker(), CreateWriter()); + +IngestionDocument document = new("my-doc-id"); +document.Sections.Add(new IngestionDocumentSection()); +document.Sections[0].Elements.Add(new IngestionDocumentHeader("# Hello")); +document.Sections[0].Elements.Add(new IngestionDocumentParagraph("This content was created in memory.")); + +await foreach (IngestionResult result in pipeline.ProcessAsync(new[] { document }.ToAsyncEnumerable())) +{ + Console.WriteLine($"Processed '{result.DocumentId}'. Succeeded: {result.Succeeded}"); +} +``` + ## Feedback & Contributing We welcome feedback and contributions in [our GitHub repo](https://github.com/dotnet/extensions). diff --git a/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/templates/AIChatWeb-CSharp/AIChatWeb-CSharp.Web/Services/Ingestion/DataIngestor.cs b/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/templates/AIChatWeb-CSharp/AIChatWeb-CSharp.Web/Services/Ingestion/DataIngestor.cs index 76168b6e632..ef30bbc415c 100644 --- a/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/templates/AIChatWeb-CSharp/AIChatWeb-CSharp.Web/Services/Ingestion/DataIngestor.cs +++ b/src/ProjectTemplates/Microsoft.Extensions.AI.Templates/templates/AIChatWeb-CSharp/AIChatWeb-CSharp.Web/Services/Ingestion/DataIngestor.cs @@ -20,12 +20,11 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern) }); using var pipeline = new IngestionPipeline( - reader: new DocumentReader(directory), chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))), writer: writer, loggerFactory: loggerFactory); - await foreach (var result in pipeline.ProcessAsync(directory, searchPattern)) + await foreach (var result in pipeline.ProcessAsync(new DocumentReader(directory), directory, searchPattern)) { logger.LogInformation("Completed processing '{id}'. Succeeded: '{succeeded}'.", result.DocumentId, result.Succeeded); } diff --git a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs index e865ff39d9b..bee1c2bf6f5 100644 --- a/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs +++ b/test/Libraries/Microsoft.Extensions.DataIngestion.Tests/IngestionPipelineTests.cs @@ -90,8 +90,8 @@ public async Task CanProcessDocuments() "chunks", TestEmbeddingGenerator.DimensionCount); using VectorStoreWriter> vectorStoreWriter = new(collection); - using IngestionPipeline pipeline = new(CreateReader(), CreateChunker(), vectorStoreWriter); - List ingestionResults = await pipeline.ProcessAsync(_sampleFiles).ToListAsync(); + using IngestionPipeline pipeline = new(CreateChunker(), vectorStoreWriter); + List ingestionResults = await pipeline.ProcessAsync(CreateReader(), _sampleFiles).ToListAsync(); Assert.Equal(_sampleFiles.Count, ingestionResults.Count); AssertAllIngestionsSucceeded(ingestionResults); @@ -126,10 +126,10 @@ public async Task CanProcessDocumentsInDirectory() "chunks-dir", TestEmbeddingGenerator.DimensionCount); using VectorStoreWriter> vectorStoreWriter = new(collection); - using IngestionPipeline pipeline = new(CreateReader(), CreateChunker(), vectorStoreWriter); + using IngestionPipeline pipeline = new(CreateChunker(), vectorStoreWriter); DirectoryInfo directory = new("TestFiles"); - List ingestionResults = await pipeline.ProcessAsync(directory, "*.md").ToListAsync(); + List ingestionResults = await pipeline.ProcessAsync(CreateReader(), directory, "*.md").ToListAsync(); Assert.Equal(directory.EnumerateFiles("*.md").Count(), ingestionResults.Count); AssertAllIngestionsSucceeded(ingestionResults); @@ -162,10 +162,10 @@ public async Task ChunksCanBeMoreThanJustText() var collection = testVectorStore.GetIngestionRecordCollection, DataContent>( "chunks-img", TestEmbeddingGenerator.DimensionCount); using VectorStoreWriter> vectorStoreWriter = new(collection); - using IngestionPipeline pipeline = new(CreateReader(), new ImageChunker(), vectorStoreWriter); + using IngestionPipeline pipeline = new(new ImageChunker(), vectorStoreWriter); Assert.False(embeddingGenerator.WasCalled); - var ingestionResults = await pipeline.ProcessAsync(_sampleFiles).ToListAsync(); + List ingestionResults = await pipeline.ProcessAsync(CreateReader(), _sampleFiles).ToListAsync(); AssertAllIngestionsSucceeded(ingestionResults); var retrieved = await vectorStoreWriter.VectorStoreCollection @@ -215,10 +215,10 @@ public async Task SingleFailureDoesNotTearDownEntirePipeline() "chunks-fail", TestEmbeddingGenerator.DimensionCount); using VectorStoreWriter> vectorStoreWriter = new(collection); - using IngestionPipeline pipeline = new(failingForFirstReader, CreateChunker(), vectorStoreWriter); + using IngestionPipeline pipeline = new(CreateChunker(), vectorStoreWriter); - await Verify(pipeline.ProcessAsync(_sampleFiles)); - await Verify(pipeline.ProcessAsync(_sampleDirectory)); + await Verify(pipeline.ProcessAsync(failingForFirstReader, _sampleFiles)); + await Verify(pipeline.ProcessAsync(failingForFirstReader, _sampleDirectory)); async Task Verify(IAsyncEnumerable results) { @@ -235,6 +235,35 @@ async Task Verify(IAsyncEnumerable results) } } + [Fact] + public async Task CanProcessDocumentsWithoutReader() + { + TestEmbeddingGenerator embeddingGenerator = new(); + using InMemoryVectorStore testVectorStore = new(new() { EmbeddingGenerator = embeddingGenerator }); + + VectorStoreCollection> collection = + testVectorStore.GetIngestionRecordCollection, string>( + "chunks-direct", TestEmbeddingGenerator.DimensionCount); + using VectorStoreWriter> vectorStoreWriter = new(collection); + + using IngestionPipeline pipeline = new(CreateChunker(), vectorStoreWriter); + + IngestionDocument document = new("doc-1"); + IngestionDocumentSection section = new(); + section.Elements.Add(new IngestionDocumentHeader("# Hello")); + section.Elements.Add(new IngestionDocumentParagraph("This is a test document created without a reader.")); + document.Sections.Add(section); + + List ingestionResults = await pipeline.ProcessAsync(new[] { document }.ToAsyncEnumerable()).ToListAsync(); + + Assert.Single(ingestionResults); + IngestionResult result = ingestionResults[0]; + Assert.Equal("doc-1", result.DocumentId); + Assert.True(result.Succeeded); + Assert.NotNull(result.Document); + Assert.True(embeddingGenerator.WasCalled, "Embedding generator should have been called."); + } + private static IngestionDocumentReader CreateReader() => new MarkdownReader(); private static IngestionChunker CreateChunker() => new HeaderChunker(new(TiktokenTokenizer.CreateForModel("gpt-4"))); diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb.A.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb.A.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs index 61088b1225d..0d2bd9a9aea 100644 --- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb.A.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs +++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb.A.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs @@ -20,12 +20,11 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern) }); using var pipeline = new IngestionPipeline( - reader: new DocumentReader(directory), chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))), writer: writer, loggerFactory: loggerFactory); - await foreach (var result in pipeline.ProcessAsync(directory, searchPattern)) + await foreach (var result in pipeline.ProcessAsync(new DocumentReader(directory), directory, searchPattern)) { logger.LogInformation("Completed processing '{id}'. Succeeded: '{succeeded}'.", result.DocumentId, result.Succeeded); } diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb.A_aoai_aais.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb.A_aoai_aais.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs index 61088b1225d..0d2bd9a9aea 100644 --- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb.A_aoai_aais.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs +++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb.A_aoai_aais.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs @@ -20,12 +20,11 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern) }); using var pipeline = new IngestionPipeline( - reader: new DocumentReader(directory), chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))), writer: writer, loggerFactory: loggerFactory); - await foreach (var result in pipeline.ProcessAsync(directory, searchPattern)) + await foreach (var result in pipeline.ProcessAsync(new DocumentReader(directory), directory, searchPattern)) { logger.LogInformation("Completed processing '{id}'. Succeeded: '{succeeded}'.", result.DocumentId, result.Succeeded); } diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb._defaults.verified/aichatweb/Services/Ingestion/DataIngestor.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb._defaults.verified/aichatweb/Services/Ingestion/DataIngestor.cs index b4675927d47..4268815bdfe 100644 --- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb._defaults.verified/aichatweb/Services/Ingestion/DataIngestor.cs +++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb._defaults.verified/aichatweb/Services/Ingestion/DataIngestor.cs @@ -20,12 +20,11 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern) }); using var pipeline = new IngestionPipeline( - reader: new DocumentReader(directory), chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))), writer: writer, loggerFactory: loggerFactory); - await foreach (var result in pipeline.ProcessAsync(directory, searchPattern)) + await foreach (var result in pipeline.ProcessAsync(new DocumentReader(directory), directory, searchPattern)) { logger.LogInformation("Completed processing '{id}'. Succeeded: '{succeeded}'.", result.DocumentId, result.Succeeded); } diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb.o_q.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb.o_q.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs index 61088b1225d..0d2bd9a9aea 100644 --- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb.o_q.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs +++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb.o_q.verified/aichatweb/aichatweb.Web/Services/Ingestion/DataIngestor.cs @@ -20,12 +20,11 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern) }); using var pipeline = new IngestionPipeline( - reader: new DocumentReader(directory), chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))), writer: writer, loggerFactory: loggerFactory); - await foreach (var result in pipeline.ProcessAsync(directory, searchPattern)) + await foreach (var result in pipeline.ProcessAsync(new DocumentReader(directory), directory, searchPattern)) { logger.LogInformation("Completed processing '{id}'. Succeeded: '{succeeded}'.", result.DocumentId, result.Succeeded); } diff --git a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb.oai_aais.verified/aichatweb/Services/Ingestion/DataIngestor.cs b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb.oai_aais.verified/aichatweb/Services/Ingestion/DataIngestor.cs index b4675927d47..4268815bdfe 100644 --- a/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb.oai_aais.verified/aichatweb/Services/Ingestion/DataIngestor.cs +++ b/test/ProjectTemplates/Microsoft.Extensions.AI.Templates.IntegrationTests/Snapshots/aichatweb/aichatweb.oai_aais.verified/aichatweb/Services/Ingestion/DataIngestor.cs @@ -20,12 +20,11 @@ public async Task IngestDataAsync(DirectoryInfo directory, string searchPattern) }); using var pipeline = new IngestionPipeline( - reader: new DocumentReader(directory), chunker: new SemanticSimilarityChunker(embeddingGenerator, new(TiktokenTokenizer.CreateForModel("gpt-4o"))), writer: writer, loggerFactory: loggerFactory); - await foreach (var result in pipeline.ProcessAsync(directory, searchPattern)) + await foreach (var result in pipeline.ProcessAsync(new DocumentReader(directory), directory, searchPattern)) { logger.LogInformation("Completed processing '{id}'. Succeeded: '{succeeded}'.", result.DocumentId, result.Succeeded); }