Skip to content

Commit

Permalink
Lucene.Net.Index.Term::ToString(): Optimized writing UTF8 string on t…
Browse files Browse the repository at this point in the history
…arget frameworks that support System.Text.Unicode.Utf8. Added tests to verify fallback is working.
  • Loading branch information
NightOwl888 committed Jan 8, 2025
1 parent f7a9186 commit fcc412e
Show file tree
Hide file tree
Showing 4 changed files with 113 additions and 2 deletions.
7 changes: 7 additions & 0 deletions Directory.Build.targets
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@

</PropertyGroup>

<!-- Features in .NET 8.x and .NET 9.x only -->
<PropertyGroup Condition=" $(TargetFramework.StartsWith('net8.')) Or $(TargetFramework.StartsWith('net9.')) ">

<DefineConstants>$(DefineConstants);FEATURE_UTF8_TOUTF16</DefineConstants>

</PropertyGroup>

<!-- Features in .NET 6.x, .NET 7.x, .NET 8.x, and .NET 9.x only -->
<PropertyGroup Condition=" $(TargetFramework.StartsWith('net6.')) Or $(TargetFramework.StartsWith('net7.')) Or $(TargetFramework.StartsWith('net8.')) Or $(TargetFramework.StartsWith('net9.')) ">

Expand Down
60 changes: 59 additions & 1 deletion src/Lucene.Net.Tests/Index/TestTerm.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
using Lucene.Net.Attributes;
using Lucene.Net.Util;
using NUnit.Framework;
using Assert = Lucene.Net.TestFramework.Assert;

Expand Down Expand Up @@ -39,5 +41,61 @@ public virtual void TestEquals()
Assert.IsFalse(@base.Equals(differentText));
Assert.IsFalse(@base.Equals(differentType));
}

[Test, LuceneNetSpecific]
public void TestToString_ValidUtf8Data()
{
    // The ASCII letters of "Hello"; ASCII is a strict subset of UTF-8,
    // so the term text is expected to decode to the same characters.
    byte[] helloBytes = { 0x48, 0x65, 0x6C, 0x6C, 0x6F };
    BytesRef termText = new BytesRef(helloBytes, 0, helloBytes.Length);

    string actual = Term.ToString(termText);

    Assert.AreEqual("Hello", actual);
}

[Test, LuceneNetSpecific]
public void TestToString_InvalidUtf8Data()
{
    // 0xC3 opens a two-byte UTF-8 sequence, but 0x28 is not a continuation
    // byte, so the pair cannot be decoded as text.
    byte[] malformedBytes = { 0xC3, 0x28 };
    BytesRef termText = new BytesRef(malformedBytes, 0, malformedBytes.Length);

    string actual = Term.ToString(termText);

    // Undecodable input is expected to come back in the raw-bytes form,
    // i.e. the same text BytesRef.ToString() produces.
    Assert.AreEqual("[c3 28]", actual);
}

[Test, LuceneNetSpecific]
public void TestToString_Utf8WithBom()
{
    // The UTF-8 byte order mark (EF BB BF) followed by the letters "Hi".
    byte[] bomPrefixedBytes = { 0xEF, 0xBB, 0xBF, 0x48, 0x69 };
    BytesRef termText = new BytesRef(bomPrefixedBytes, 0, bomPrefixedBytes.Length);

    string actual = Term.ToString(termText);

    // The BOM must not be stripped: it is expected to surface as U+FEFF
    // at the start of the decoded string.
    Assert.AreEqual("\uFEFFHi", actual);
}

[Test, LuceneNetSpecific]
public void TestToString_Utf8WithoutBom()
{
    // The letters "Hi" with no leading byte order mark.
    byte[] plainBytes = { 0x48, 0x69 };
    BytesRef termText = new BytesRef(plainBytes, 0, plainBytes.Length);

    string actual = Term.ToString(termText);

    Assert.AreEqual("Hi", actual);
}
}
}
}
2 changes: 1 addition & 1 deletion src/Lucene.Net.Tests/Support/TestApiConsistency.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public override void TestProtectedFieldNames(Type typeFromTargetAssembly)
[TestCase(typeof(Lucene.Net.Analysis.Analyzer))]
public override void TestPrivateFieldNames(Type typeFromTargetAssembly)
{
base.TestPrivateFieldNames(typeFromTargetAssembly, @"^Lucene\.Net\.Support\.(?:ConcurrentHashSet|PlatformHelper|DateTimeOffsetUtil|Arrays|IO\.FileSupport)|^Lucene\.ExceptionExtensions|^Lucene\.Net\.Util\.Constants\.MaxStackByteLimit|^Lucene\.Net\.Search\.TopDocs\.ShardByteSize|^Lucene\.Net\.Store\.BaseDirectory\.(?:True|False)");
base.TestPrivateFieldNames(typeFromTargetAssembly, @"^Lucene\.Net\.Support\.(?:ConcurrentHashSet|PlatformHelper|DateTimeOffsetUtil|Arrays|IO\.FileSupport)|^Lucene\.ExceptionExtensions|^Lucene\.Net\.Util\.Constants\.MaxStackByteLimit|^Lucene\.Net\.Search\.TopDocs\.ShardByteSize|^Lucene\.Net\.Store\.BaseDirectory\.(?:True|False)|CharStackBufferSize$");
}

[Test, LuceneNetSpecific]
Expand Down
46 changes: 46 additions & 0 deletions src/Lucene.Net/Index/Term.cs
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
using J2N.Text;
using Lucene.Net.Support;
using Lucene.Net.Support.Buffers;
using Lucene.Net.Support.Text;
using System;
using System.Buffers;
using System.Text;

namespace Lucene.Net.Index
Expand Down Expand Up @@ -35,6 +37,8 @@ namespace Lucene.Net.Index
/// </summary>
public sealed class Term : IComparable<Term>, IEquatable<Term> // LUCENENET specific - class implements IEquatable<T>
{
private const int CharStackBufferSize = 64;

/// <summary>
/// Constructs a <see cref="Term"/> with the given field and bytes.
/// <para/>Note that a null field or null bytes value results in undefined
Expand Down Expand Up @@ -85,12 +89,52 @@ public Term(string fld)
/// </summary>
public string Text => ToString(Bytes); // LUCENENET: Changed to a property. While this calls a method internally, its expected usage is that it will return a deterministic value.

#nullable enable
/// <summary>
/// Returns human-readable form of the term text. If the term is not unicode,
/// the raw bytes will be printed instead.
/// </summary>
public static string ToString(BytesRef termText)
{
if (termText is null)
throw new ArgumentNullException(nameof(termText)); // LUCENENET: Added guard clause
#if FEATURE_UTF8_TOUTF16
// View the relevant portion of the byte array
ReadOnlySpan<byte> utf8Span = new ReadOnlySpan<byte>(termText.Bytes, termText.Offset, termText.Length);

// Allocate a buffer for the maximum possible UTF-16 output
int maxChars = utf8Span.Length; // Worst case: 1 byte -> 1 char (ASCII)
char[]? arrayToReturnToPool = null;

Span<char> charBuffer = maxChars > CharStackBufferSize
? (arrayToReturnToPool = ArrayPool<char>.Shared.Rent(maxChars))
: stackalloc char[CharStackBufferSize];
try
{
// Decode the UTF-8 bytes to UTF-16 chars
OperationStatus status = System.Text.Unicode.Utf8.ToUtf16(
utf8Span,
charBuffer,
out int bytesConsumed,
out int charsWritten,
replaceInvalidSequences: false); // Causes OperationStatus.InvalidData to occur rather than replace

// NOTE: We handle OperationStatus.InvalidData below in the fallback path.
if (status == OperationStatus.Done)
{
// Successfully decoded the UTF-8 input
return charBuffer.Slice(0, charsWritten).ToString();
}
}
finally
{
// Return the buffer to the pool
ArrayPool<char>.Shared.ReturnIfNotNull(arrayToReturnToPool);
}

// Fallback to the default string representation if decoding fails
return termText.ToString();
#else
// the term might not be text, but usually is. so we make a best effort
Encoding decoder = StandardCharsets.UTF_8.WithDecoderExceptionFallback();
try
Expand All @@ -101,7 +145,9 @@ public static string ToString(BytesRef termText)
{
return termText.ToString();
}
#endif
}
#nullable restore

/// <summary>
/// Returns the bytes of this term.
Expand Down

0 comments on commit fcc412e

Please sign in to comment.