From fcc412e370f3975f386b9d74710348bd7b736103 Mon Sep 17 00:00:00 2001 From: Shad Storhaug Date: Thu, 9 Jan 2025 02:09:39 +0700 Subject: [PATCH] Lucene.Net.Index.Term::ToString(): Optimized writing UTF8 string on target frameworks that support System.Text.Unicode.Utf8. Added tests to verify fallback is working. --- Directory.Build.targets | 7 +++ src/Lucene.Net.Tests/Index/TestTerm.cs | 60 ++++++++++++++++++- .../Support/TestApiConsistency.cs | 2 +- src/Lucene.Net/Index/Term.cs | 46 ++++++++++++++ 4 files changed, 113 insertions(+), 2 deletions(-) diff --git a/Directory.Build.targets b/Directory.Build.targets index fd71ab0554..1dc7daa031 100644 --- a/Directory.Build.targets +++ b/Directory.Build.targets @@ -37,6 +37,13 @@ + + + + $(DefineConstants);FEATURE_UTF8_TOUTF16 + + + diff --git a/src/Lucene.Net.Tests/Index/TestTerm.cs b/src/Lucene.Net.Tests/Index/TestTerm.cs index 425670dcd6..577a3781e7 100644 --- a/src/Lucene.Net.Tests/Index/TestTerm.cs +++ b/src/Lucene.Net.Tests/Index/TestTerm.cs @@ -1,3 +1,5 @@ +using Lucene.Net.Attributes; +using Lucene.Net.Util; using NUnit.Framework; using Assert = Lucene.Net.TestFramework.Assert; @@ -39,5 +41,61 @@ public virtual void TestEquals() Assert.IsFalse(@base.Equals(differentText)); Assert.IsFalse(@base.Equals(differentType)); } + + [Test, LuceneNetSpecific] + public void TestToString_ValidUtf8Data() + { + // Arrange + var validUtf8 = new byte[] { 0x48, 0x65, 0x6C, 0x6C, 0x6F }; // "Hello" + var bytesRef = new BytesRef(validUtf8, 0, validUtf8.Length); + + // Act + string result = Term.ToString(bytesRef); + + // Assert + Assert.AreEqual("Hello", result); + } + + [Test, LuceneNetSpecific] + public void TestToString_InvalidUtf8Data() + { + // Arrange + var invalidUtf8 = new byte[] { 0xC3, 0x28 }; // Invalid UTF-8 sequence + var bytesRef = new BytesRef(invalidUtf8, 0, invalidUtf8.Length); + + // Act + string result = Term.ToString(bytesRef); + + // Assert + Assert.AreEqual("[c3 28]", result); // Should match BytesRef.ToString() 
+ } + + [Test, LuceneNetSpecific] + public void TestToString_Utf8WithBom() + { + // Arrange + var utf8WithBom = new byte[] { 0xEF, 0xBB, 0xBF, 0x48, 0x69 }; // BOM + "Hi" + var bytesRef = new BytesRef(utf8WithBom, 0, utf8WithBom.Length); + + // Act + string result = Term.ToString(bytesRef); + + // Assert + Assert.AreEqual("\uFEFFHi", result); // BOM is preserved in the string + } + + [Test, LuceneNetSpecific] + public void TestToString_Utf8WithoutBom() + { + // Arrange + var utf8WithoutBom = new byte[] { 0x48, 0x69 }; // "Hi" + var bytesRef = new BytesRef(utf8WithoutBom, 0, utf8WithoutBom.Length); + + // Act + string result = Term.ToString(bytesRef); + + // Assert + Assert.AreEqual("Hi", result); + } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Tests/Support/TestApiConsistency.cs b/src/Lucene.Net.Tests/Support/TestApiConsistency.cs index f565676ac7..04b96b91bc 100644 --- a/src/Lucene.Net.Tests/Support/TestApiConsistency.cs +++ b/src/Lucene.Net.Tests/Support/TestApiConsistency.cs @@ -38,7 +38,7 @@ public override void TestProtectedFieldNames(Type typeFromTargetAssembly) [TestCase(typeof(Lucene.Net.Analysis.Analyzer))] public override void TestPrivateFieldNames(Type typeFromTargetAssembly) { - base.TestPrivateFieldNames(typeFromTargetAssembly, @"^Lucene\.Net\.Support\.(?:ConcurrentHashSet|PlatformHelper|DateTimeOffsetUtil|Arrays|IO\.FileSupport)|^Lucene\.ExceptionExtensions|^Lucene\.Net\.Util\.Constants\.MaxStackByteLimit|^Lucene\.Net\.Search\.TopDocs\.ShardByteSize|^Lucene\.Net\.Store\.BaseDirectory\.(?:True|False)"); + base.TestPrivateFieldNames(typeFromTargetAssembly, @"^Lucene\.Net\.Support\.(?:ConcurrentHashSet|PlatformHelper|DateTimeOffsetUtil|Arrays|IO\.FileSupport)|^Lucene\.ExceptionExtensions|^Lucene\.Net\.Util\.Constants\.MaxStackByteLimit|^Lucene\.Net\.Search\.TopDocs\.ShardByteSize|^Lucene\.Net\.Store\.BaseDirectory\.(?:True|False)|CharStackBufferSize$"); } [Test, LuceneNetSpecific] diff --git a/src/Lucene.Net/Index/Term.cs 
b/src/Lucene.Net/Index/Term.cs index 2de523a20a..6930fa5430 100644 --- a/src/Lucene.Net/Index/Term.cs +++ b/src/Lucene.Net/Index/Term.cs @@ -1,7 +1,9 @@ using J2N.Text; using Lucene.Net.Support; +using Lucene.Net.Support.Buffers; using Lucene.Net.Support.Text; using System; +using System.Buffers; using System.Text; namespace Lucene.Net.Index @@ -35,6 +37,8 @@ namespace Lucene.Net.Index /// </summary> public sealed class Term : IComparable<Term>, IEquatable<Term> // LUCENENET specific - class implements IEquatable<Term> { + private const int CharStackBufferSize = 64; + /// <summary> /// Constructs a <see cref="Term"/> with the given field and bytes. /// Note that a null field or null bytes value results in undefined @@ -85,12 +89,52 @@ public Term(string fld) /// </summary> public string Text => ToString(Bytes); // LUCENENET: Changed to a property. While this calls a method internally, its expected usage is that it will return a deterministic value. +#nullable enable /// <summary> /// Returns human-readable form of the term text. If the term is not unicode, /// the raw bytes will be printed instead. /// </summary> public static string ToString(BytesRef termText) { + if (termText is null) + throw new ArgumentNullException(nameof(termText)); // LUCENENET: Added guard clause +#if FEATURE_UTF8_TOUTF16 + // View the relevant portion of the byte array + ReadOnlySpan<byte> utf8Span = new ReadOnlySpan<byte>(termText.Bytes, termText.Offset, termText.Length); + + // Allocate a buffer for the maximum possible UTF-16 output + int maxChars = utf8Span.Length; // Worst case: 1 byte -> 1 char (ASCII) + char[]? arrayToReturnToPool = null; + + Span<char> charBuffer = maxChars > CharStackBufferSize + ? 
(arrayToReturnToPool = ArrayPool<char>.Shared.Rent(maxChars)) + : stackalloc char[CharStackBufferSize]; + try + { + // Decode the UTF-8 bytes to UTF-16 chars + OperationStatus status = System.Text.Unicode.Utf8.ToUtf16( + utf8Span, + charBuffer, + out int bytesConsumed, + out int charsWritten, + replaceInvalidSequences: false); // Causes OperationStatus.InvalidData to occur rather than replace + + // NOTE: We handle OperationStatus.InvalidData below in the fallback path. + if (status == OperationStatus.Done) + { + // Successfully decoded the UTF-8 input + return charBuffer.Slice(0, charsWritten).ToString(); + } + } + finally + { + // Return the buffer to the pool + ArrayPool<char>.Shared.ReturnIfNotNull(arrayToReturnToPool); + } + + // Fallback to the default string representation if decoding fails + return termText.ToString(); +#else // the term might not be text, but usually is. so we make a best effort Encoding decoder = StandardCharsets.UTF_8.WithDecoderExceptionFallback(); try @@ -101,7 +145,9 @@ public static string ToString(BytesRef termText) { return termText.ToString(); } +#endif } +#nullable restore /// <summary> /// Returns the bytes of this term.