From eb65d066fbd9fe1e0fbe98d04cfe6a569c5197ef Mon Sep 17 00:00:00 2001 From: Xiaoyun Zhang Date: Wed, 1 Nov 2023 11:21:44 -0700 Subject: [PATCH] Use AzureEmbedFunctionService from EmbedFunction in PrepDoc (#211) ## Purpose * ... ## Does this introduce a breaking change? ``` [ ] Yes [ ] No ``` ## Pull Request Type What kind of change does this Pull Request introduce? ``` [ ] Bugfix [ ] Feature [ ] Code style update (formatting, local variables) [ ] Refactoring (no functional changes, no api changes) [ ] Documentation content changes [ ] Other... Please describe: ``` ## How to Test * Get the code ``` git clone [repo-address] cd [repo-name] git checkout [branch-name] npm install ``` * Test the code ``` ``` ## What to Check Verify that the following are valid * ... ## Other Information --- .../EmbedFunctions/EmbedFunctions.csproj | 1 + .../EmbedFunctions/EmbeddingFunction.cs | 5 +- app/functions/EmbedFunctions/Program.cs | 25 +- .../Services/AzureSearchEmbedService.cs | 126 ++++-- .../Services/EmbeddingAggregateService.cs | 5 +- app/prepdocs/PrepareDocs/PageDetail.cs | 6 - app/prepdocs/PrepareDocs/PrepareDocs.csproj | 4 + app/prepdocs/PrepareDocs/Program.Clients.cs | 18 + app/prepdocs/PrepareDocs/Program.cs | 420 +----------------- app/prepdocs/PrepareDocs/Section.cs | 8 - 10 files changed, 138 insertions(+), 480 deletions(-) delete mode 100644 app/prepdocs/PrepareDocs/PageDetail.cs delete mode 100644 app/prepdocs/PrepareDocs/Section.cs diff --git a/app/functions/EmbedFunctions/EmbedFunctions.csproj b/app/functions/EmbedFunctions/EmbedFunctions.csproj index 56ed8ab8..0cefdf0e 100644 --- a/app/functions/EmbedFunctions/EmbedFunctions.csproj +++ b/app/functions/EmbedFunctions/EmbedFunctions.csproj @@ -12,6 +12,7 @@ + diff --git a/app/functions/EmbedFunctions/EmbeddingFunction.cs b/app/functions/EmbedFunctions/EmbeddingFunction.cs index ed1ecf7f..cf53bc55 100644 --- a/app/functions/EmbedFunctions/EmbeddingFunction.cs +++ b/app/functions/EmbedFunctions/EmbeddingFunction.cs @@ -12,7 +12,6 @@ public sealed class EmbeddingFunction( public Task EmbedAsync( [BlobTrigger( blobPath: "content/{name}", - Connection = "AzureStorageAccountEndpoint")] Stream blobStream, - string name, - BlobClient client) => embeddingAggregateService.EmbedBlobAsync(client, blobStream, blobName: name); + Connection = "AzureWebJobsStorage")] Stream blobStream, + string name) => embeddingAggregateService.EmbedBlobAsync(blobStream, blobName: name); } diff --git a/app/functions/EmbedFunctions/Program.cs b/app/functions/EmbedFunctions/Program.cs index ed530285..b3c777d1 100644 --- a/app/functions/EmbedFunctions/Program.cs +++ b/app/functions/EmbedFunctions/Program.cs @@ -1,5 +1,8 @@ // Copyright (c) Microsoft. All rights reserved. +using Azure.AI.OpenAI; +using Microsoft.Extensions.DependencyInjection; + var host = new HostBuilder() .ConfigureServices(services => { @@ -36,7 +39,7 @@ uri is not null services.AddSingleton(_ => { var blobServiceClient = new BlobServiceClient( - GetUriFromEnvironment("AZURE_STORAGE_ACCOUNT_ENDPOINT"), + GetUriFromEnvironment("AZURE_STORAGE_BLOB_ENDPOINT"), credential); return blobServiceClient.GetBlobContainerClient("corpus"); @@ -45,10 +48,22 @@ uri is not null services.AddSingleton(); services.AddSingleton(); - services.AddSingleton(); - services.AddSingleton(); - services.AddSingleton(); - services.AddSingleton(); + services.AddSingleton(provider => + { + var searchIndexName = Environment.GetEnvironmentVariable("AZURE_SEARCH_INDEX") ?? throw new ArgumentNullException("AZURE_SEARCH_INDEX is null"); + var embeddingModelName = Environment.GetEnvironmentVariable("AZURE_OPENAI_EMBEDDING_DEPLOYMENT") ?? throw new ArgumentNullException("AZURE_OPENAI_EMBEDDING_DEPLOYMENT is null"); + var openaiEndPoint = Environment.GetEnvironmentVariable("AZURE_OPENAI_ENDPOINT") ?? throw new ArgumentNullException("AZURE_OPENAI_ENDPOINT is null"); + + var openAIClient = new OpenAIClient(new Uri(openaiEndPoint), new DefaultAzureCredential()); + + var searchClient = provider.GetRequiredService(); + var searchIndexClient = provider.GetRequiredService(); + var blobContainerClient = provider.GetRequiredService(); + var documentClient = provider.GetRequiredService(); + var logger = provider.GetRequiredService>(); + + return new AzureSearchEmbedService(openAIClient, embeddingModelName, searchClient, searchIndexName, searchIndexClient, documentClient, blobContainerClient, logger); + }); }) .ConfigureFunctionsWorkerDefaults() .Build(); diff --git a/app/functions/EmbedFunctions/Services/AzureSearchEmbedService.cs b/app/functions/EmbedFunctions/Services/AzureSearchEmbedService.cs index aaa19167..d9ae4169 100644 --- a/app/functions/EmbedFunctions/Services/AzureSearchEmbedService.cs +++ b/app/functions/EmbedFunctions/Services/AzureSearchEmbedService.cs @@ -1,13 +1,20 @@ // Copyright (c) Microsoft. All rights reserved. +using Azure.AI.OpenAI; +using Google.Protobuf.WellKnownTypes; +using Microsoft.Extensions.Options; + namespace EmbedFunctions.Services; -internal sealed partial class AzureSearchEmbedService( +public sealed partial class AzureSearchEmbedService( + OpenAIClient openAIClient, + string embeddingModelName, SearchClient indexSectionClient, + string searchIndexName, SearchIndexClient searchIndexClient, DocumentAnalysisClient documentAnalysisClient, BlobContainerClient corpusContainerClient, - ILogger logger) : IEmbedService + ILogger? logger) : IEmbedService { [GeneratedRegex("[^0-9a-zA-Z_-]")] private static partial Regex MatchInSetRegex(); @@ -16,9 +23,6 @@ public async Task EmbedBlobAsync(Stream blobStream, string blobName) { try { - var searchIndexName = Environment.GetEnvironmentVariable( - "AZURE_SEARCH_INDEX") ?? "gptkbindex"; - await EnsureSearchIndexAsync(searchIndexName); var pageMap = await GetDocumentTextAsync(blobStream, blobName); @@ -41,67 +45,94 @@ public async Task EmbedBlobAsync(Stream blobStream, string blobName) } catch (Exception exception) { - logger.LogError( + logger?.LogError( exception, "Failed to embed blob '{BlobName}'", blobName); return false; } } - private async Task EnsureSearchIndexAsync(string searchIndexName) + public async Task CreateSearchIndexAsync(string searchIndexName) { - var indexNames = searchIndexClient.GetIndexNamesAsync(); - await foreach (var page in indexNames.AsPages()) + string vectorSearchConfigName = "my-vector-config"; + string vectorSearchProfile = "my-vector-profile"; + var index = new SearchIndex(searchIndexName) { - if (page.Values.Any(indexName => indexName == searchIndexName)) + VectorSearch = new() { - logger.LogWarning( - "Search index '{SearchIndexName}' already exists", searchIndexName); - return; + Algorithms = + { + new HnswVectorSearchAlgorithmConfiguration(vectorSearchConfigName) + }, + Profiles = + { + new VectorSearchProfile(vectorSearchProfile, vectorSearchConfigName) } - } - - var index = new SearchIndex(searchIndexName) - { + }, Fields = + { + new SimpleField("id", SearchFieldDataType.String) { IsKey = true }, + new SearchableField("content") { AnalyzerName = LexicalAnalyzerName.EnMicrosoft }, + new SimpleField("category", SearchFieldDataType.String) { IsFacetable = true }, + new SimpleField("sourcepage", SearchFieldDataType.String) { IsFacetable = true }, + new SimpleField("sourcefile", SearchFieldDataType.String) { IsFacetable = true }, + new SearchField("embedding", SearchFieldDataType.Collection(SearchFieldDataType.Single)) { - new SimpleField("id", SearchFieldDataType.String) { IsKey = true }, - new SearchableField("content") { AnalyzerName = "en.microsoft" }, - new SimpleField("category", SearchFieldDataType.String) { IsFacetable = true }, - new SimpleField("sourcepage", SearchFieldDataType.String) { IsFacetable = true }, - new SimpleField("sourcefile", SearchFieldDataType.String) { IsFacetable = true } - }, + VectorSearchDimensions = 1536, + IsSearchable = true, + VectorSearchProfile = vectorSearchProfile, + } + }, SemanticSettings = new SemanticSettings { Configurations = + { + new SemanticConfiguration("default", new PrioritizedFields { - new SemanticConfiguration("default", new PrioritizedFields + ContentFields = { - ContentFields = + new SemanticField { - new SemanticField - { - FieldName = "content" - } + FieldName = "content" } - }) } + }) + } } }; - logger.LogInformation( - "Creating '{searchIndexName}' search index", searchIndexName); + logger?.LogInformation( + "Creating '{searchIndexName}' search index", searchIndexName); await searchIndexClient.CreateIndexAsync(index); } + public async Task EnsureSearchIndexAsync(string searchIndexName) + { + var indexNames = searchIndexClient.GetIndexNamesAsync(); + await foreach (var page in indexNames.AsPages()) + { + if (page.Values.Any(indexName => indexName == searchIndexName)) + { + logger?.LogWarning( + "Search index '{SearchIndexName}' already exists", searchIndexName); + return; + } + } + + await CreateSearchIndexAsync(searchIndexName); + } + private async Task> GetDocumentTextAsync(Stream blobStream, string blobName) { - logger.LogInformation( + logger?.LogInformation( "Extracting text from '{Blob}' using Azure Form Recognizer", blobName); + using var ms = new MemoryStream(); + blobStream.CopyTo(ms); + ms.Position = 0; AnalyzeDocumentOperation operation = documentAnalysisClient.AnalyzeDocument( - WaitUntil.Started, "prebuilt-layout", blobStream); + WaitUntil.Started, "prebuilt-layout", ms); var offset = 0; List pageMap = []; @@ -208,7 +239,7 @@ private async Task UploadCorpusAsync(string corpusBlobName, string text) return; } - logger.LogInformation("Uploading corpus '{CorpusBlobName}'", corpusBlobName); + logger?.LogInformation("Uploading corpus '{CorpusBlobName}'", corpusBlobName); await using var stream = new MemoryStream(Encoding.UTF8.GetBytes(text)); await blobClient.UploadAsync(stream, new BlobHttpHeaders @@ -231,7 +262,7 @@ private IEnumerable
CreateSections( var start = 0; var end = length; - logger.LogInformation("Splitting '{BlobName}' into sections", blobName); + logger?.LogInformation("Splitting '{BlobName}' into sections", blobName); while (start + SectionOverlap < length) { @@ -300,9 +331,9 @@ private IEnumerable
CreateSections( // If the section ends with an unclosed table, we need to start the next section with the table. // If table starts inside SentenceSearchLimit, we ignore it, as that will cause an infinite loop for tables longer than MaxSectionLength // If last table starts inside SectionOverlap, keep overlapping - if (logger.IsEnabled(LogLevel.Warning)) + if (logger?.IsEnabled(LogLevel.Warning) is true) { - logger.LogWarning(""" + logger?.LogWarning(""" Section ends with unclosed table, starting next section with the table at page {Offset} offset {Start} table start {LastTableStart} """, @@ -349,10 +380,10 @@ private static string BlobNameFromFilePage(string blobName, int page = 0) => Pat private async Task IndexSectionsAsync(string searchIndexName, IEnumerable
sections, string blobName) { - var infoLoggingEnabled = logger.IsEnabled(LogLevel.Information); - if (infoLoggingEnabled) + var infoLoggingEnabled = logger?.IsEnabled(LogLevel.Information); + if (infoLoggingEnabled is true) { - logger.LogInformation(""" + logger?.LogInformation(""" Indexing sections from '{BlobName}' into search index '{SearchIndexName}' """, blobName, @@ -363,6 +394,8 @@ Indexing sections from '{BlobName}' into search index '{SearchIndexName}' var batch = new IndexDocumentsBatch(); foreach (var section in sections) { + var embeddings = await openAIClient.GetEmbeddingsAsync(embeddingModelName, new Azure.AI.OpenAI.EmbeddingsOptions(section.Content.Replace('\r', ' '))); + var embedding = embeddings.Value.Data.FirstOrDefault()?.Embedding.ToArray() ?? []; batch.Actions.Add(new IndexDocumentsAction( IndexActionType.MergeOrUpload, new SearchDocument @@ -371,7 +404,8 @@ Indexing sections from '{BlobName}' into search index '{SearchIndexName}' ["content"] = section.Content, ["category"] = section.Category, ["sourcepage"] = section.SourcePage, - ["sourcefile"] = section.SourceFile + ["sourcefile"] = section.SourceFile, + ["embedding"] = embedding, })); iteration++; @@ -380,9 +414,9 @@ Indexing sections from '{BlobName}' into search index '{SearchIndexName}' // Every one thousand documents, batch create. IndexDocumentsResult result = await indexSectionClient.IndexDocumentsAsync(batch); int succeeded = result.Results.Count(r => r.Succeeded); - if (infoLoggingEnabled) + if (infoLoggingEnabled is true) { - logger.LogInformation(""" + logger?.LogInformation(""" Indexed {Count} sections, {Succeeded} succeeded """, batch.Actions.Count, @@ -399,9 +433,9 @@ Indexing sections from '{BlobName}' into search index '{SearchIndexName}' var index = new SearchIndex($"index-{batch.Actions.Count}"); IndexDocumentsResult result = await indexSectionClient.IndexDocumentsAsync(batch); int succeeded = result.Results.Count(r => r.Succeeded); - if (logger.IsEnabled(LogLevel.Information)) + if (logger?.IsEnabled(LogLevel.Information) is true) { - logger.LogInformation(""" + logger?.LogInformation(""" Indexed {Count} sections, {Succeeded} succeeded """, batch.Actions.Count, diff --git a/app/functions/EmbedFunctions/Services/EmbeddingAggregateService.cs b/app/functions/EmbedFunctions/Services/EmbeddingAggregateService.cs index bf325732..b946e5cc 100644 --- a/app/functions/EmbedFunctions/Services/EmbeddingAggregateService.cs +++ b/app/functions/EmbedFunctions/Services/EmbeddingAggregateService.cs @@ -1,12 +1,15 @@ // Copyright (c) Microsoft. All rights reserved. +using System.IO; + namespace EmbedFunctions.Services; public sealed class EmbeddingAggregateService( EmbedServiceFactory embedServiceFactory, + BlobContainerClient client, ILogger logger) { - internal async Task EmbedBlobAsync(BlobClient client, Stream blobStream, string blobName) + internal async Task EmbedBlobAsync(Stream blobStream, string blobName) { try { diff --git a/app/prepdocs/PrepareDocs/PageDetail.cs b/app/prepdocs/PrepareDocs/PageDetail.cs deleted file mode 100644 index f872f0eb..00000000 --- a/app/prepdocs/PrepareDocs/PageDetail.cs +++ /dev/null @@ -1,6 +0,0 @@ -// Copyright (c) Microsoft. All rights reserved. - -internal readonly record struct PageDetail( - int Index, - int Offset, - string Text); diff --git a/app/prepdocs/PrepareDocs/PrepareDocs.csproj b/app/prepdocs/PrepareDocs/PrepareDocs.csproj index ad7294fa..4e4f3ef1 100644 --- a/app/prepdocs/PrepareDocs/PrepareDocs.csproj +++ b/app/prepdocs/PrepareDocs/PrepareDocs.csproj @@ -18,5 +18,9 @@ + + + + diff --git a/app/prepdocs/PrepareDocs/Program.Clients.cs b/app/prepdocs/PrepareDocs/Program.Clients.cs index 3e6b8616..2070a2c4 100644 --- a/app/prepdocs/PrepareDocs/Program.Clients.cs +++ b/app/prepdocs/PrepareDocs/Program.Clients.cs @@ -1,6 +1,9 @@ // Copyright (c) Microsoft. All rights reserved. +using EmbedFunctions.Services; +using Microsoft.Extensions.Logging; + internal static partial class Program { private static BlobContainerClient? s_corpusContainerClient; @@ -16,6 +19,21 @@ internal static partial class Program private static readonly SemaphoreSlim s_searchIndexLock = new(1); private static readonly SemaphoreSlim s_searchLock = new(1); private static readonly SemaphoreSlim s_openAILock = new(1); + private static readonly SemaphoreSlim s_embeddingLock = new(1); + + private static Task GetAzureSearchEmbedService(AppOptions options) => + GetLazyClientAsync(options, s_embeddingLock, async o => + { + var searchIndexClient = await GetSearchIndexClientAsync(o); + var searchClient = await GetSearchClientAsync(o); + var documentClient = await GetFormRecognizerClientAsync(o); + var blobContainerClient = await GetBlobContainerClientAsync(o); + var openAIClient = await GetAzureOpenAIClientAsync(o); + var embeddingModelName = o.EmbeddingModelName ?? throw new ArgumentNullException(nameof(o.EmbeddingModelName)); + var searchIndexName = o.SearchIndexName ?? throw new ArgumentNullException(nameof(o.SearchIndexName)); + + return new AzureSearchEmbedService(openAIClient, embeddingModelName, searchClient, searchIndexName, searchIndexClient, documentClient, blobContainerClient, null); + }); private static Task GetCorpusBlobContainerClientAsync(AppOptions options) => GetLazyClientAsync(options, s_corpusContainerLock, static async o => diff --git a/app/prepdocs/PrepareDocs/Program.cs b/app/prepdocs/PrepareDocs/Program.cs index 8d6f849c..794acd50 100644 --- a/app/prepdocs/PrepareDocs/Program.cs +++ b/app/prepdocs/PrepareDocs/Program.cs @@ -1,6 +1,6 @@ // Copyright (c) Microsoft. All rights reserved. -using Azure.Search.Documents.Indexes.Models; +using EmbedFunctions.Services; s_rootCommand.SetHandler( async (context) => @@ -13,7 +13,10 @@ } else { - await CreateSearchIndexAsync(options); + var searchIndexName = options.SearchIndexName ?? throw new ArgumentNullException(nameof(options.SearchIndexName)); + var embedService = await GetAzureSearchEmbedService(options); + + await embedService.EnsureSearchIndexAsync(options.SearchIndexName); Matcher matcher = new(); matcher.AddInclude(options.Files); @@ -32,12 +35,12 @@ .Select(i => { var fileName = files[i]; - return ProcessSingleFileAsync(options, fileName); + return ProcessSingleFileAsync(options, fileName, embedService); }); await Task.WhenAll(tasks); - static async Task ProcessSingleFileAsync(AppOptions options, string fileName) + static async Task ProcessSingleFileAsync(AppOptions options, string fileName, IEmbedService embedService) { if (options.Verbose) { @@ -57,19 +60,10 @@ static async Task ProcessSingleFileAsync(AppOptions options, string fileName) } await UploadBlobsAsync(options, fileName); - var fileNameWithoutExtension = Path.GetFileNameWithoutExtension(fileName); - var pageMap = await GetDocumentTextAsync(options, fileName); - - // Create corpus from page map and upload to blob - // Corpus name format: fileName-{page}.txt - foreach (var page in pageMap) + using (var stream = File.OpenRead(fileName)) { - var corpusName = $"{fileNameWithoutExtension}-{page.Index}.txt"; - await UploadCorpusAsync(options, corpusName, page.Text); + await embedService.EmbedBlobAsync(stream, fileName); } - - var sections = CreateSections(options, pageMap, fileName); - await IndexSectionsAsync(options, sections, fileName); } } }); @@ -165,99 +159,6 @@ Removing sections from '{fileName ?? "all"}' from search index '{options.SearchI } } -static async ValueTask CreateSearchIndexAsync(AppOptions options) -{ - var indexClient = await GetSearchIndexClientAsync(options); - - var indexNames = indexClient.GetIndexNamesAsync(); - await foreach (var page in indexNames.AsPages()) - { - if (page.Values.Any(indexName => indexName == options.SearchIndexName)) - { - if (options.Verbose) - { - options.Console.WriteLine($"Search index '{options.SearchIndexName}' already exists"); - } - return; - } - } - - string vectorSearchConfigName = "my-vector-config"; - string vectorSearchProfile = "my-vector-profile"; - var index = new SearchIndex(options.SearchIndexName) - { - VectorSearch = new() - { - Algorithms = - { - new HnswVectorSearchAlgorithmConfiguration(vectorSearchConfigName) - }, - Profiles = - { - new VectorSearchProfile(vectorSearchProfile, vectorSearchConfigName) - } - }, - Fields = - { - new SimpleField("id", SearchFieldDataType.String) { IsKey = true }, - new SearchableField("content") { AnalyzerName = LexicalAnalyzerName.EnMicrosoft }, - new SimpleField("category", SearchFieldDataType.String) { IsFacetable = true }, - new SimpleField("sourcepage", SearchFieldDataType.String) { IsFacetable = true }, - new SimpleField("sourcefile", SearchFieldDataType.String) { IsFacetable = true }, - new SearchField("embedding", SearchFieldDataType.Collection(SearchFieldDataType.Single)) - { - VectorSearchDimensions = 1536, - IsSearchable = true, - VectorSearchProfile = vectorSearchProfile, - } - }, - SemanticSettings = new SemanticSettings - { - Configurations = - { - new SemanticConfiguration("default", new PrioritizedFields - { - ContentFields = - { - new SemanticField - { - FieldName = "content" - } - } - }) - } - } - }; - - if (options.Verbose) - { - options.Console.WriteLine($"Creating '{options.SearchIndexName}' search index"); - } - - await indexClient.CreateIndexAsync(index); -} - -static async ValueTask UploadCorpusAsync( - AppOptions options, string corpusName, string content) -{ - var container = await GetCorpusBlobContainerClientAsync(options); - var blobClient = container.GetBlobClient(corpusName); - if (await blobClient.ExistsAsync()) - { - return; - } - if (options.Verbose) - { - options.Console.WriteLine($"Uploading corpus '{corpusName}'"); - } - - await using var stream = new MemoryStream(Encoding.UTF8.GetBytes(content)); - await blobClient.UploadAsync(stream, new BlobHttpHeaders - { - ContentType = "text/plain" - }); -} - static async ValueTask UploadBlobsAsync( AppOptions options, string fileName) { @@ -332,313 +233,10 @@ static string GetContentType(string fileName) }; } -static async ValueTask> GetDocumentTextAsync( - AppOptions options, string filename) -{ - if (options.Verbose) - { - options.Console.WriteLine($"Extracting text from '{filename}' using Azure Form Recognizer"); - } - - await using FileStream stream = File.OpenRead(filename); - - var client = await GetFormRecognizerClientAsync(options); - AnalyzeDocumentOperation operation = client.AnalyzeDocument( - WaitUntil.Started, "prebuilt-layout", stream); - - var offset = 0; - List pageMap = []; - - var results = await operation.WaitForCompletionAsync(); - var pages = results.Value.Pages; - for (var i = 0; i < pages.Count; i++) - { - IReadOnlyList tablesOnPage = - results.Value.Tables.Where(t => t.BoundingRegions[0].PageNumber == i + 1).ToList(); - - // Mark all positions of the table spans in the page - int pageIndex = pages[i].Spans[0].Index; - int pageLength = pages[i].Spans[0].Length; - int[] tableChars = Enumerable.Repeat(-1, pageLength).ToArray(); - for (var tableId = 0; tableId < tablesOnPage.Count; tableId++) - { - foreach (DocumentSpan span in tablesOnPage[tableId].Spans) - { - // Replace all table spans with "tableId" in tableChars array - for (var j = 0; j < span.Length; j++) - { - int index = span.Index - pageIndex + j; - if (index >= 0 && index < pageLength) - { - tableChars[index] = tableId; - } - } - } - } - - // Build page text by replacing characters in table spans with table HTML - StringBuilder pageText = new(); - HashSet addedTables = []; - for (int j = 0; j < tableChars.Length; j++) - { - if (tableChars[j] == -1) - { - pageText.Append(results.Value.Content[pageIndex + j]); - } - else if (!addedTables.Contains(tableChars[j])) - { - pageText.Append(TableToHtml(tablesOnPage[tableChars[j]])); - addedTables.Add(tableChars[j]); - } - } - - pageText.Append(' '); - pageMap.Add(new PageDetail(i, offset, pageText.ToString())); - offset += pageText.Length; - } - - return pageMap.AsReadOnly(); -} - -static IEnumerable
CreateSections( - AppOptions options, IReadOnlyList pageMap, string fileName) -{ - const int MaxSectionLength = 1_000; - const int SentenceSearchLimit = 100; - const int SectionOverlap = 100; - - char[] sentenceEndings = ['.', '!', '?']; - char[] wordBreaks = [',', ';', ':', ' ', '(', ')', '[', ']', '{', '}', '\t', '\n']; - var allText = string.Concat(pageMap.Select(p => p.Text)); - var length = allText.Length; - var start = 0; - var end = length; - - if (options.Verbose) - { - options.Console.WriteLine($"Splitting '{fileName}' into sections"); - } - - while (start + SectionOverlap < length) - { - var lastWord = -1; - end = start + MaxSectionLength; - - if (end > length) - { - end = length; - } - else - { - // Try to find the end of the sentence - while (end < length && (end - start - MaxSectionLength) < SentenceSearchLimit && !sentenceEndings.Contains(allText[end])) - { - if (wordBreaks.Contains(allText[end])) - { - lastWord = end; - } - end++; - } - - if (end < length && !sentenceEndings.Contains(allText[end]) && lastWord > 0) - { - end = lastWord; // Fall back to at least keeping a whole word - } - } - - if (end < length) - { - end++; - } - - // Try to find the start of the sentence or at least a whole word boundary - lastWord = -1; - while (start > 0 && start > end - MaxSectionLength - - (2 * SentenceSearchLimit) && !sentenceEndings.Contains(allText[start])) - { - if (wordBreaks.Contains(allText[start])) - { - lastWord = start; - } - start--; - } - - if (!sentenceEndings.Contains(allText[start]) && lastWord > 0) - { - start = lastWord; - } - if (start > 0) - { - start++; - } - - var sectionText = allText[start..end]; - - yield return new Section( - Id: MatchInSetRegex().Replace($"{fileName}-{start}", "_").TrimStart('_'), - Content: sectionText, - SourcePage: BlobNameFromFilePage(fileName, FindPage(pageMap, start)), - SourceFile: fileName, - Category: options.Category); - - var lastTableStart = sectionText.LastIndexOf(" 2 * SentenceSearchLimit && lastTableStart > sectionText.LastIndexOf(" sections, - string fileName) -{ - if (options.Verbose) - { - options.Console.WriteLine($""" - Indexing sections from '{fileName}' into search index '{options.SearchIndexName}' - """); - } - - var searchClient = await GetSearchClientAsync(options); - var openAIClient = await GetAzureOpenAIClientAsync(options); - - var iteration = 0; - var batch = new IndexDocumentsBatch(); - foreach (var section in sections) - { - var embeddings = await openAIClient.GetEmbeddingsAsync(options.EmbeddingModelName, new Azure.AI.OpenAI.EmbeddingsOptions(section.Content.Replace('\r', ' '))); - var embedding = embeddings.Value.Data.FirstOrDefault()?.Embedding.ToArray() ?? []; - batch.Actions.Add(new IndexDocumentsAction( - IndexActionType.MergeOrUpload, - new SearchDocument - { - ["id"] = section.Id, - ["content"] = section.Content, - ["category"] = section.Category, - ["sourcepage"] = section.SourcePage, - ["sourcefile"] = section.SourceFile, - ["embedding"] = embedding, - })); - - iteration++; - if (iteration % 1_000 is 0) - { - // Every one thousand documents, batch create. - IndexDocumentsResult result = await searchClient.IndexDocumentsAsync(batch); - int succeeded = result.Results.Count(r => r.Succeeded); - if (options.Verbose) - { - options.Console.WriteLine($""" - Indexed {batch.Actions.Count} sections, {succeeded} succeeded - """); - } - - batch = new(); - } - } - - if (batch is { Actions.Count: > 0 }) - { - // Any remaining documents, batch create. - var index = new SearchIndex($"index-{batch.Actions.Count}"); - IndexDocumentsResult result = await searchClient.IndexDocumentsAsync(batch); - int succeeded = result.Results.Count(r => r.Succeeded); - if (options.Verbose) - { - options.Console.WriteLine($""" - Indexed {batch.Actions.Count} sections, {succeeded} succeeded - """); - } - } -} - static string BlobNameFromFilePage(string filename, int page = 0) => Path.GetExtension(filename).ToLower() is ".pdf" ? $"{Path.GetFileNameWithoutExtension(filename)}-{page}.pdf" : Path.GetFileName(filename); -static string TableToHtml(DocumentTable table) -{ - var tableHtml = new StringBuilder(""); - var rows = new List[table.RowCount]; - for (int i = 0; i < table.RowCount; i++) - { - rows[i] = - [ - .. table.Cells.Where(c => c.RowIndex == i) - .OrderBy(c => c.ColumnIndex) - ]; - } - - foreach (var rowCells in rows) - { - tableHtml.Append(""); - foreach (DocumentTableCell cell in rowCells) - { - var tag = (cell.Kind == "columnHeader" || cell.Kind == "rowHeader") ? "th" : "td"; - var cellSpans = string.Empty; - if (cell.ColumnSpan > 1) - { - cellSpans += $" colSpan='{cell.ColumnSpan}'"; - } - - if (cell.RowSpan > 1) - { - cellSpans += $" rowSpan='{cell.RowSpan}'"; - } - - tableHtml.AppendFormat( - "<{0}{1}>{2}", tag, cellSpans, WebUtility.HtmlEncode(cell.Content)); - } - - tableHtml.Append(""); - } - - tableHtml.Append("
"); - - return tableHtml.ToString(); -} - -static int FindPage(IReadOnlyList pageMap, int offset) -{ - var length = pageMap.Count; - for (var i = 0; i < length - 1; i++) - { - if (offset >= pageMap[i].Offset && offset < pageMap[i + 1].Offset) - { - return i; - } - } - - return length - 1; -} - internal static partial class Program { [GeneratedRegex("[^0-9a-zA-Z_-]")] diff --git a/app/prepdocs/PrepareDocs/Section.cs b/app/prepdocs/PrepareDocs/Section.cs deleted file mode 100644 index 0d009fec..00000000 --- a/app/prepdocs/PrepareDocs/Section.cs +++ /dev/null @@ -1,8 +0,0 @@ -// Copyright (c) Microsoft. All rights reserved. - -internal readonly record struct Section( - string Id, - string Content, - string SourcePage, - string SourceFile, - string? Category);