From b78465dbd033f00f1e989229bac793a9d93a4b93 Mon Sep 17 00:00:00 2001 From: Shannon Date: Wed, 18 Sep 2024 09:57:11 -0600 Subject: [PATCH] Sorts docs by id, don't cast, fewer allocations for better perf. Updates benchmarks and adds TODOs --- .../ConcurrentSearchBenchmarks.cs | 215 ++++++++++++++++-- .../Examine.Benchmarks.csproj | 4 +- src/Examine.Benchmarks/Program.cs | 57 +++++ src/Examine.Lucene/Indexing/FullTextType.cs | 1 + .../Indexing/GenericAnalyzerFieldValueType.cs | 1 + src/Examine.Lucene/Indexing/Int32Type.cs | 12 +- src/Examine.Lucene/Indexing/RawStringType.cs | 6 + src/Examine.Lucene/Providers/LuceneIndex.cs | 9 + .../Search/LuceneSearchExecutor.cs | 30 ++- .../Search/LuceneSearchResults.cs | 11 +- 10 files changed, 307 insertions(+), 39 deletions(-) create mode 100644 src/Examine.Benchmarks/Program.cs diff --git a/src/Examine.Benchmarks/ConcurrentSearchBenchmarks.cs b/src/Examine.Benchmarks/ConcurrentSearchBenchmarks.cs index dee842fb9..ed685956b 100644 --- a/src/Examine.Benchmarks/ConcurrentSearchBenchmarks.cs +++ b/src/Examine.Benchmarks/ConcurrentSearchBenchmarks.cs @@ -3,22 +3,25 @@ using System.ComponentModel; using System.IO; using System.Linq; +using System.Runtime.Versioning; using System.Threading; using System.Threading.Tasks; using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Configs; +using BenchmarkDotNet.Jobs; using Examine.Lucene.Search; using Examine.Search; using Examine.Test; using Lucene.Net.Analysis; using Lucene.Net.Analysis.Standard; +using Lucene.Net.Codecs.Lucene46; using Lucene.Net.Index; +using Lucene.Net.Index.Extensions; using Lucene.Net.QueryParsers.Classic; using Lucene.Net.Search; using Lucene.Net.Store; using Lucene.Net.Util; using Microsoft.Extensions.Logging; -using Microsoft.VSDiagnostics; using Directory = Lucene.Net.Store.Directory; [assembly: Config(typeof(MyDefaultConfig))] @@ -151,11 +154,11 @@ After changing to use singleton indexers/managers */ - [ShortRunJob] + [MediumRunJob(RuntimeMoniker.Net80)] 
[ThreadingDiagnoser] [MemoryDiagnoser] - [DotNetCountersDiagnoser] - [CPUUsageDiagnoser] + //[DotNetCountersDiagnoser] + //[CPUUsageDiagnoser] public class ConcurrentSearchBenchmarks : ExamineBaseTest { private readonly StandardAnalyzer _analyzer = new StandardAnalyzer(LuceneInfo.CurrentVersion); @@ -181,9 +184,15 @@ public override void Setup() var tempIndexer = InitializeAndIndexItems(_tempBasePath, _analyzer, out var indexDir); tempIndexer.Dispose(); _indexDir = FSDirectory.Open(indexDir); - _writer = new IndexWriter(_indexDir, new IndexWriterConfig(LuceneVersion.LUCENE_48, _analyzer)); + var writerConfig = new IndexWriterConfig(LuceneVersion.LUCENE_48, _analyzer); + //writerConfig.SetMaxBufferedDocs(1000); + //writerConfig.SetReaderTermsIndexDivisor(4); + //writerConfig.SetOpenMode(OpenMode.APPEND); + //writerConfig.SetReaderPooling(true); + //writerConfig.SetCodec(new Lucene46Codec()); + _writer = new IndexWriter(_indexDir, writerConfig); var trackingWriter = new TrackingIndexWriter(_writer); - _searcherManager = new SearcherManager(trackingWriter.IndexWriter, applyAllDeletes: false, new SearcherFactory()); + _searcherManager = new SearcherManager(trackingWriter.IndexWriter, applyAllDeletes: true, new SearcherFactory()); } [GlobalCleanup] @@ -199,13 +208,13 @@ public override void TearDown() System.IO.Directory.Delete(_tempBasePath, true); } - [Params(1, 15, 30)] + [Params(/*1, 15, */30)] public int ThreadCount { get; set; } - [Params(10, 100, 1000)] + [Params(10/*, 100, 1000*/)] public int MaxResults { get; set; } - [Benchmark] + [Benchmark(Baseline = true)] public async Task ExamineStandard() { var tasks = new List(); @@ -235,7 +244,7 @@ public async Task ExamineStandard() } [Benchmark] - public async Task LuceneSimple() + public async Task LuceneAcquireAlways() { var tasks = new List(); @@ -256,22 +265,195 @@ public async Task LuceneSimple() var topDocsCollector = TopScoreDocCollector.Create(MaxResults, null, true); searcher.Search(query, topDocsCollector); 
+ var topDocs = topDocsCollector.GetTopDocs(0, MaxResults); var totalItemCount = topDocs.TotalHits; - var results = new List(topDocs.ScoreDocs.Length); - for (var i = 0; i < topDocs.ScoreDocs.Length; i++) + var results = new List(topDocs.ScoreDocs.Length); + + foreach (var scoreDoc in topDocs.ScoreDocs) { - var scoreDoc = topDocs.ScoreDocs[i]; - var docId = scoreDoc.Doc; + var docId = scoreDoc.Doc; + var score = scoreDoc.Score; + var shardIndex = scoreDoc.ShardIndex; var doc = searcher.Doc(docId); + var result = LuceneSearchExecutor.CreateSearchResult(doc, score, shardIndex); + results.Add(result); + } + + var maxScore = topDocs.MaxScore; + + // enumerate (forces the result to execute) + var logOutput = "ThreadID: " + Thread.CurrentThread.ManagedThreadId + ", Results: " + string.Join(',', results.Select(x => $"{x.Id}-{x.Values.Count}-{x.Score}").ToArray()); + _logger.LogDebug(logOutput); + })); + } + + foreach (var task in tasks) + { + task.Start(); + } + + await Task.WhenAll(tasks); + } + + [Benchmark] + public async Task LuceneAcquireAlwaysWithLock() + { + var tasks = new List(); + var myLock = new object(); + + for (var i = 0; i < ThreadCount; i++) + { + tasks.Add(new Task(() => + { + lock (myLock) + { + var parser = new QueryParser(LuceneVersion.LUCENE_48, ExamineFieldNames.ItemIdFieldName, new StandardAnalyzer(LuceneVersion.LUCENE_48)); + var query = parser.Parse($"{ExamineFieldNames.CategoryFieldName}:content AND nodeName:location*"); + + // this is like doing Acquire, does it perform the same (it will allocate more) + using var context = _searcherManager.GetContext(); + + var searcher = context.Reference; + + // Don't use this, increasing the max docs substantially decreases performance + //var maxDoc = searcher.IndexReader.MaxDoc; + var topDocsCollector = TopScoreDocCollector.Create(MaxResults, null, true); + + searcher.Search(query, topDocsCollector); + + var topDocs = topDocsCollector.GetTopDocs(0, MaxResults); + + var totalItemCount = topDocs.TotalHits; 
+ + var results = new List(topDocs.ScoreDocs.Length); + + foreach (var scoreDoc in topDocs.ScoreDocs) + { + var docId = scoreDoc.Doc; + var score = scoreDoc.Score; + var shardIndex = scoreDoc.ShardIndex; + var doc = searcher.Doc(docId); + var result = LuceneSearchExecutor.CreateSearchResult(doc, score, shardIndex); + results.Add(result); + } + + var maxScore = topDocs.MaxScore; + + // enumerate (forces the result to execute) + var logOutput = "ThreadID: " + Thread.CurrentThread.ManagedThreadId + ", Results: " + string.Join(',', results.Select(x => $"{x.Id}-{x.Values.Count}-{x.Score}").ToArray()); + _logger.LogDebug(logOutput); + } + })); + } + + foreach (var task in tasks) + { + task.Start(); + } + + await Task.WhenAll(tasks); + } + + [Benchmark] + public async Task LuceneAcquireOnce() + { + var tasks = new List(); + + var searcher = _searcherManager.Acquire(); + + try + { + for (var i = 0; i < ThreadCount; i++) + { + tasks.Add(new Task(() => + { + var parser = new QueryParser(LuceneVersion.LUCENE_48, ExamineFieldNames.ItemIdFieldName, new StandardAnalyzer(LuceneVersion.LUCENE_48)); + var query = parser.Parse($"{ExamineFieldNames.CategoryFieldName}:content AND nodeName:location*"); + + // Don't use this, increasing the max docs substantially decreases performance + //var maxDoc = searcher.IndexReader.MaxDoc; + var topDocsCollector = TopScoreDocCollector.Create(MaxResults, null, true); + + searcher.Search(query, topDocsCollector); + var topDocs = topDocsCollector.GetTopDocs(0, MaxResults); + + var totalItemCount = topDocs.TotalHits; + + var results = new List(topDocs.ScoreDocs.Length); + for (var i = 0; i < topDocs.ScoreDocs.Length; i++) + { + var scoreDoc = topDocs.ScoreDocs[i]; + var docId = scoreDoc.Doc; + var doc = searcher.Doc(docId); + var score = scoreDoc.Score; + var shardIndex = scoreDoc.ShardIndex; + var result = LuceneSearchExecutor.CreateSearchResult(doc, score, shardIndex); + results.Add(result); + } + + var maxScore = topDocs.MaxScore; + + // enumerate 
(forces the result to execute) + var logOutput = "ThreadID: " + Thread.CurrentThread.ManagedThreadId + ", Results: " + string.Join(',', results.Select(x => $"{x.Id}-{x.Values.Count}-{x.Score}").ToArray()); + _logger.LogDebug(logOutput); + })); + } + + foreach (var task in tasks) + { + task.Start(); + } + + await Task.WhenAll(tasks); + } + finally + { + _searcherManager.Release(searcher); + } + } + + [Benchmark] + public async Task LuceneSortedDocIds() + { + var tasks = new List(); + + for (var i = 0; i < ThreadCount; i++) + { + tasks.Add(new Task(() => + { + var parser = new QueryParser(LuceneVersion.LUCENE_48, ExamineFieldNames.ItemIdFieldName, new StandardAnalyzer(LuceneVersion.LUCENE_48)); + var query = parser.Parse($"{ExamineFieldNames.CategoryFieldName}:content AND nodeName:location*"); + + // this is like doing Acquire, does it perform the same (it will allocate more) + using var context = _searcherManager.GetContext(); + + var searcher = context.Reference; + + // Don't use this, increasing the max docs substantially decreases performance + //var maxDoc = searcher.IndexReader.MaxDoc; + var topDocsCollector = TopScoreDocCollector.Create(MaxResults, null, true); + + searcher.Search(query, topDocsCollector); + + var topDocs = topDocsCollector.GetTopDocs(0, MaxResults); + + var totalItemCount = topDocs.TotalHits; + + var results = new List(topDocs.ScoreDocs.Length); + + foreach (var scoreDoc in topDocs.ScoreDocs.OrderBy(x => x.Doc)) + { + var docId = scoreDoc.Doc; var score = scoreDoc.Score; var shardIndex = scoreDoc.ShardIndex; + var doc = searcher.Doc(docId); var result = LuceneSearchExecutor.CreateSearchResult(doc, score, shardIndex); results.Add(result); } - var searchAfterOptions = LuceneSearchExecutor.GetSearchAfterOptions(topDocs); + var maxScore = topDocs.MaxScore; // enumerate (forces the result to execute) @@ -288,9 +470,10 @@ public async Task LuceneSimple() await Task.WhenAll(tasks); } +#if RELEASE protected override ILoggerFactory 
CreateLoggerFactory() => Microsoft.Extensions.Logging.LoggerFactory.Create(builder => builder.AddConsole().SetMinimumLevel(LogLevel.Information)); - +#endif private TestIndex InitializeAndIndexItems( string tempBasePath, Analyzer analyzer, diff --git a/src/Examine.Benchmarks/Examine.Benchmarks.csproj b/src/Examine.Benchmarks/Examine.Benchmarks.csproj index 20c1afb5f..f21527d9e 100644 --- a/src/Examine.Benchmarks/Examine.Benchmarks.csproj +++ b/src/Examine.Benchmarks/Examine.Benchmarks.csproj @@ -6,6 +6,7 @@ enable false false + Exe @@ -17,9 +18,6 @@ - - - diff --git a/src/Examine.Benchmarks/Program.cs b/src/Examine.Benchmarks/Program.cs new file mode 100644 index 000000000..444eeea32 --- /dev/null +++ b/src/Examine.Benchmarks/Program.cs @@ -0,0 +1,57 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using BenchmarkDotNet.Running; +using Microsoft.Diagnostics.Tracing.Parsers.Kernel; + +namespace Examine.Benchmarks +{ + public class Program + { + public static async Task Main(string[] args) + { +#if RELEASE + // Benchmark your function here. + BenchmarkSwitcher.FromAssembly(typeof(Program).Assembly).Run(args); +#else + var bench = new ConcurrentSearchBenchmarks(); + try + { + bench.Setup(); + await Threads100(bench); + await Threads1(bench); + } + finally + { + bench.TearDown(); + } + +#endif + // Call your function here. 
+ } + + private static async Task Threads100(ConcurrentSearchBenchmarks bench) + { + bench.ThreadCount = 50; + bench.MaxResults = 10; + + for (var i = 0; i < 100; i++) + { + await bench.ExamineStandard(); + } + } + + private static async Task Threads1(ConcurrentSearchBenchmarks bench) + { + bench.ThreadCount = 1; + bench.MaxResults = 10; + + for (var i = 0; i < 100; i++) + { + await bench.ExamineStandard(); + } + } + } +} diff --git a/src/Examine.Lucene/Indexing/FullTextType.cs b/src/Examine.Lucene/Indexing/FullTextType.cs index 67d274587..d38229f15 100644 --- a/src/Examine.Lucene/Indexing/FullTextType.cs +++ b/src/Examine.Lucene/Indexing/FullTextType.cs @@ -56,6 +56,7 @@ protected override void AddSingleValue(Document doc, object value) if (_sortable) { //to be sortable it cannot be analyzed so we have to make a different field + // TODO: Investigate https://lucene.apache.org/core/4_3_0/core/org/apache/lucene/document/SortedDocValuesField.html doc.Add(new StringField( ExamineFieldNames.SortedFieldNamePrefix + FieldName, str, diff --git a/src/Examine.Lucene/Indexing/GenericAnalyzerFieldValueType.cs b/src/Examine.Lucene/Indexing/GenericAnalyzerFieldValueType.cs index f5b261d81..d2a5328b6 100644 --- a/src/Examine.Lucene/Indexing/GenericAnalyzerFieldValueType.cs +++ b/src/Examine.Lucene/Indexing/GenericAnalyzerFieldValueType.cs @@ -38,6 +38,7 @@ protected override void AddSingleValue(Document doc, object value) if (_sortable) { //to be sortable it cannot be analyzed so we have to make a different field + // TODO: Investigate https://lucene.apache.org/core/4_3_0/core/org/apache/lucene/document/SortedDocValuesField.html doc.Add(new StringField( ExamineFieldNames.SortedFieldNamePrefix + FieldName, str, diff --git a/src/Examine.Lucene/Indexing/Int32Type.cs b/src/Examine.Lucene/Indexing/Int32Type.cs index f3c42ae74..cedb7991c 100644 --- a/src/Examine.Lucene/Indexing/Int32Type.cs +++ b/src/Examine.Lucene/Indexing/Int32Type.cs @@ -7,9 +7,12 @@ namespace 
Examine.Lucene.Indexing { public class Int32Type : IndexFieldRangeValueType { + private readonly string _docValuesFieldName; + public Int32Type(string fieldName, ILoggerFactory logger, bool store = true) : base(fieldName, logger, store) { + _docValuesFieldName = "dv_" + fieldName; } /// @@ -22,7 +25,14 @@ protected override void AddSingleValue(Document doc, object value) if (!TryConvert(value, out int parsedVal)) return; - doc.Add(new Int32Field(FieldName,parsedVal, Store ? Field.Store.YES : Field.Store.NO));; + // TODO: We can use this for better scoring/sorting performance + // https://stackoverflow.com/a/44953624/694494 + // https://lucene.apache.org/core/7_4_0/core/org/apache/lucene/document/NumericDocValuesField.html + //var dvField = new NumericDocValuesField(_docValuesFieldName, 0); + //dvField.SetInt32Value(parsedVal); + //doc.Add(dvField); + + doc.Add(new Int32Field(FieldName, parsedVal, Store ? Field.Store.YES : Field.Store.NO)); } public override Query GetQuery(string query) diff --git a/src/Examine.Lucene/Indexing/RawStringType.cs b/src/Examine.Lucene/Indexing/RawStringType.cs index 68ebf412e..448c90cf0 100644 --- a/src/Examine.Lucene/Indexing/RawStringType.cs +++ b/src/Examine.Lucene/Indexing/RawStringType.cs @@ -30,6 +30,12 @@ protected override void AddSingleValue(Document doc, object value) switch (value) { case IIndexableField f: + // https://lucene.apache.org/core/4_3_0/core/org/apache/lucene/index/IndexableField.html + // BinaryDocValuesField, ByteDocValuesField, DerefBytesDocValuesField, DoubleDocValuesField, DoubleField, + // Field, FloatDocValuesField, FloatField, IntDocValuesField, IntField, LongDocValuesField, LongField, + // NumericDocValuesField, PackedLongDocValuesField, ShortDocValuesField, SortedBytesDocValuesField, + // SortedDocValuesField, SortedSetDocValuesField, StoredField, StraightBytesDocValuesField, StringField, TextField + // https://solr.apache.org/guide/6_6/docvalues.html doc.Add(f); break; case TokenStream ts: diff --git 
a/src/Examine.Lucene/Providers/LuceneIndex.cs b/src/Examine.Lucene/Providers/LuceneIndex.cs index 500fae7b8..08ebc5008 100644 --- a/src/Examine.Lucene/Providers/LuceneIndex.cs +++ b/src/Examine.Lucene/Providers/LuceneIndex.cs @@ -4,6 +4,7 @@ using System.Diagnostics; using System.IO; using System.Linq; +using System.Runtime.Intrinsics.X86; using System.Threading; using System.Threading.Tasks; using Examine.Lucene.Directories; @@ -17,6 +18,7 @@ using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; using static Lucene.Net.Index.IndexWriter; +using static Lucene.Net.Store.Lock; using Directory = Lucene.Net.Store.Directory; namespace Examine.Lucene.Providers @@ -1116,6 +1118,13 @@ private bool ProcessIndexQueueItem(IndexOperation op) return false; } + // TODO: We can re-use the same document object to save a lot of GC! + // https://cwiki.apache.org/confluence/display/lucene/ImproveIndexingSpeed + // Re-use Document and Field instances + // As of Lucene 2.3 there are new setValue(...) methods that allow you to change the value of a Field. This allows you to re-use a single Field instance across many added documents, which can save substantial GC cost. + // It's best to create a single Document instance, then add multiple Field instances to it, but hold onto these Field instances and re-use them by changing their values for each added document. For example you might have an idField, bodyField, nameField, storedField1, etc. After the document is added, you then directly change the Field values (idField.setValue(...), etc), and then re-add your Document instance. + // Note that you cannot re-use a single Field instance within a Document, and, you should not change a Field's value until the Document containing that Field has been added to the index. See Field for details. 
+ var d = new Document(); AddDocument(d, indexingNodeDataArgs.ValueSet); diff --git a/src/Examine.Lucene/Search/LuceneSearchExecutor.cs b/src/Examine.Lucene/Search/LuceneSearchExecutor.cs index 8288159f7..7c873cf77 100644 --- a/src/Examine.Lucene/Search/LuceneSearchExecutor.cs +++ b/src/Examine.Lucene/Search/LuceneSearchExecutor.cs @@ -126,13 +126,17 @@ public ISearchResults Execute() var totalItemCount = topDocs.TotalHits; - var results = new List(topDocs.ScoreDocs.Length); - for (int i = 0; i < topDocs.ScoreDocs.Length; i++) + var results = new List(topDocs.ScoreDocs.Length); + + // Order by Doc Id for improved perf! + // See https://cwiki.apache.org/confluence/display/lucene/ImproveSearchingSpeed + foreach (var scoreDoc in topDocs.ScoreDocs.OrderBy(x => x.Doc)) { - var result = GetSearchResult(i, topDocs, searcher.IndexSearcher); + var result = GetSearchResult(scoreDoc, topDocs, searcher.IndexSearcher); results.Add(result); } - var searchAfterOptions = GetSearchAfterOptions(topDocs); + + var searchAfterOptions = scoreDocAfter != null ? GetSearchAfterOptions(topDocs) : null; float maxScore = topDocs.MaxScore; return new LuceneSearchResults(results, totalItemCount, maxScore, searchAfterOptions); @@ -174,21 +178,12 @@ internal static SearchAfterOptions GetSearchAfterOptions(TopDocs topDocs) return new SearchAfterOptions(scoreDoc.Doc, scoreDoc.Score, new object[0], scoreDoc.ShardIndex); } } + return null; } - private LuceneSearchResult GetSearchResult(int index, TopDocs topDocs, IndexSearcher luceneSearcher) + private LuceneSearchResult GetSearchResult(ScoreDoc scoreDoc, TopDocs topDocs, IndexSearcher luceneSearcher) { - // I have seen IndexOutOfRangeException here which is strange as this is only called in one place - // and from that one place "i" is always less than the size of this collection. 
- // but we'll error check here anyways - if (topDocs?.ScoreDocs.Length < index) - { - return null; - } - - var scoreDoc = topDocs.ScoreDocs[index]; - var docId = scoreDoc.Doc; Document doc; if (_fieldsToLoad != null) @@ -199,6 +194,7 @@ private LuceneSearchResult GetSearchResult(int index, TopDocs topDocs, IndexSear { doc = luceneSearcher.Doc(docId); } + var score = scoreDoc.Score; var shardIndex = scoreDoc.ShardIndex; var result = CreateSearchResult(doc, score, shardIndex); @@ -222,12 +218,12 @@ internal static LuceneSearchResult CreateSearchResult(Document doc, float score, var searchResult = new LuceneSearchResult(id, score, () => { - //we can use lucene to find out the fields which have been stored for this particular document + //we can use Lucene to find out the fields which have been stored for this particular document var fields = doc.Fields; var resultVals = new Dictionary>(); - foreach (var field in fields.Cast()) + foreach (var field in fields) { var fieldName = field.Name; var values = doc.GetValues(fieldName); diff --git a/src/Examine.Lucene/Search/LuceneSearchResults.cs b/src/Examine.Lucene/Search/LuceneSearchResults.cs index 6ddc01a44..f945f5403 100644 --- a/src/Examine.Lucene/Search/LuceneSearchResults.cs +++ b/src/Examine.Lucene/Search/LuceneSearchResults.cs @@ -2,6 +2,8 @@ using System.Collections; using System.Collections.Generic; +#nullable enable + namespace Examine.Lucene.Search { public class LuceneSearchResults : ILuceneSearchResults @@ -10,7 +12,11 @@ public class LuceneSearchResults : ILuceneSearchResults private readonly IReadOnlyCollection _results; - public LuceneSearchResults(IReadOnlyCollection results, int totalItemCount,float maxScore, SearchAfterOptions searchAfterOptions) + public LuceneSearchResults( + IReadOnlyCollection results, + int totalItemCount, + float maxScore, + SearchAfterOptions? 
searchAfterOptions) { _results = results; TotalItemCount = totalItemCount; @@ -26,9 +32,10 @@ public LuceneSearchResults(IReadOnlyCollection results, int total /// public float MaxScore { get; } - public SearchAfterOptions SearchAfter { get; } + public SearchAfterOptions? SearchAfter { get; } public IEnumerator GetEnumerator() => _results.GetEnumerator(); + IEnumerator IEnumerable.GetEnumerator() => GetEnumerator(); } }