diff --git a/Directory.Build.targets b/Directory.Build.targets
index fd71ab0554..1dc7daa031 100644
--- a/Directory.Build.targets
+++ b/Directory.Build.targets
@@ -37,6 +37,13 @@
+
+
+
+ <DefineConstants>$(DefineConstants);FEATURE_UTF8_TOUTF16</DefineConstants>
+
+
+
diff --git a/src/Lucene.Net.Tests/Index/TestTerm.cs b/src/Lucene.Net.Tests/Index/TestTerm.cs
index 425670dcd6..577a3781e7 100644
--- a/src/Lucene.Net.Tests/Index/TestTerm.cs
+++ b/src/Lucene.Net.Tests/Index/TestTerm.cs
@@ -1,3 +1,5 @@
+using Lucene.Net.Attributes;
+using Lucene.Net.Util;
using NUnit.Framework;
using Assert = Lucene.Net.TestFramework.Assert;
@@ -39,5 +41,61 @@ public virtual void TestEquals()
Assert.IsFalse(@base.Equals(differentText));
Assert.IsFalse(@base.Equals(differentType));
}
+
+ [Test, LuceneNetSpecific]
+ public void TestToString_ValidUtf8Data()
+ {
+     // Five ASCII bytes spelling "Hello"; ASCII is also well-formed UTF-8.
+     byte[] helloBytes = { 0x48, 0x65, 0x6C, 0x6C, 0x6F };
+     BytesRef term = new BytesRef(helloBytes, 0, helloBytes.Length);
+
+     // Convert the term bytes to their display form.
+     string text = Term.ToString(term);
+
+     // Well-formed input comes back as the decoded text.
+     Assert.AreEqual("Hello", text);
+ }
+
+ [Test, LuceneNetSpecific]
+ public void TestToString_InvalidUtf8Data()
+ {
+     // 0xC3 opens a two-byte sequence, but 0x28 is not a continuation byte.
+     byte[] malformed = { 0xC3, 0x28 };
+     BytesRef term = new BytesRef(malformed, 0, malformed.Length);
+
+     // These bytes cannot be decoded as UTF-8 text.
+     string text = Term.ToString(term);
+
+     // Expect the bracketed hex dump, the same form BytesRef.ToString() gives.
+     Assert.AreEqual("[c3 28]", text);
+ }
+
+ [Test, LuceneNetSpecific]
+ public void TestToString_Utf8WithBom()
+ {
+     // EF BB BF is the UTF-8 encoding of U+FEFF (the byte order mark); "Hi" follows it.
+     byte[] bomThenHi = { 0xEF, 0xBB, 0xBF, 0x48, 0x69 };
+     BytesRef term = new BytesRef(bomThenHi, 0, bomThenHi.Length);
+
+     // Convert the term bytes to their display form.
+     string text = Term.ToString(term);
+
+     // The BOM is not stripped: it shows up as a leading U+FEFF char.
+     Assert.AreEqual("\uFEFFHi", text);
+ }
+
+ [Test, LuceneNetSpecific]
+ public void TestToString_Utf8WithoutBom()
+ {
+     // Just the two bytes of "Hi", with no byte order mark in front.
+     byte[] hiBytes = { 0x48, 0x69 };
+     BytesRef term = new BytesRef(hiBytes, 0, hiBytes.Length);
+
+     // Convert the term bytes to their display form.
+     string text = Term.ToString(term);
+
+     // Nothing is prepended to the decoded text.
+     Assert.AreEqual("Hi", text);
+ }
}
-}
\ No newline at end of file
+}
diff --git a/src/Lucene.Net.Tests/Support/TestApiConsistency.cs b/src/Lucene.Net.Tests/Support/TestApiConsistency.cs
index f565676ac7..04b96b91bc 100644
--- a/src/Lucene.Net.Tests/Support/TestApiConsistency.cs
+++ b/src/Lucene.Net.Tests/Support/TestApiConsistency.cs
@@ -38,7 +38,7 @@ public override void TestProtectedFieldNames(Type typeFromTargetAssembly)
[TestCase(typeof(Lucene.Net.Analysis.Analyzer))]
public override void TestPrivateFieldNames(Type typeFromTargetAssembly)
{
- base.TestPrivateFieldNames(typeFromTargetAssembly, @"^Lucene\.Net\.Support\.(?:ConcurrentHashSet|PlatformHelper|DateTimeOffsetUtil|Arrays|IO\.FileSupport)|^Lucene\.ExceptionExtensions|^Lucene\.Net\.Util\.Constants\.MaxStackByteLimit|^Lucene\.Net\.Search\.TopDocs\.ShardByteSize|^Lucene\.Net\.Store\.BaseDirectory\.(?:True|False)");
+ base.TestPrivateFieldNames(typeFromTargetAssembly, @"^Lucene\.Net\.Support\.(?:ConcurrentHashSet|PlatformHelper|DateTimeOffsetUtil|Arrays|IO\.FileSupport)|^Lucene\.ExceptionExtensions|^Lucene\.Net\.Util\.Constants\.MaxStackByteLimit|^Lucene\.Net\.Search\.TopDocs\.ShardByteSize|^Lucene\.Net\.Store\.BaseDirectory\.(?:True|False)|CharStackBufferSize$");
}
[Test, LuceneNetSpecific]
diff --git a/src/Lucene.Net/Index/Term.cs b/src/Lucene.Net/Index/Term.cs
index 2de523a20a..6930fa5430 100644
--- a/src/Lucene.Net/Index/Term.cs
+++ b/src/Lucene.Net/Index/Term.cs
@@ -1,7 +1,9 @@
using J2N.Text;
using Lucene.Net.Support;
+using Lucene.Net.Support.Buffers;
using Lucene.Net.Support.Text;
using System;
+using System.Buffers;
using System.Text;
namespace Lucene.Net.Index
@@ -35,6 +37,8 @@ namespace Lucene.Net.Index
///
public sealed class Term : IComparable, IEquatable // LUCENENET specific - class implements IEquatable
{
+ private const int CharStackBufferSize = 64; // Size (in chars) of the stackalloc buffer used by ToString(BytesRef); terms needing more chars rent from ArrayPool instead
+
/// <summary>
/// Constructs a <see cref="Term"/> with the given field and bytes.
/// Note that a null field or null bytes value results in undefined
@@ -85,12 +89,52 @@ public Term(string fld)
///
public string Text => ToString(Bytes); // LUCENENET: Changed to a property. While this calls a method internally, its expected usage is that it will return a deterministic value.
+#nullable enable
/// <summary>
/// Returns human-readable form of the term text. If the term is not unicode,
/// the raw bytes will be printed instead.
/// </summary>
public static string ToString(BytesRef termText)
{
+     if (termText is null)
+         throw new ArgumentNullException(nameof(termText)); // LUCENENET: Added guard clause
+#if FEATURE_UTF8_TOUTF16
+     // View only the bytes that belong to this term (Offset/Length window into the backing array)
+     ReadOnlySpan<byte> utf8Span = new ReadOnlySpan<byte>(termText.Bytes, termText.Offset, termText.Length);
+
+     // Size the destination so that decoding can never run out of room
+     int maxChars = utf8Span.Length; // UTF-16 never needs more chars than UTF-8 has bytes; worst case is ASCII (1 byte -> 1 char)
+     char[]? arrayToReturnToPool = null;
+
+     Span<char> charBuffer = maxChars > CharStackBufferSize
+         ? (arrayToReturnToPool = ArrayPool<char>.Shared.Rent(maxChars))
+         : stackalloc char[CharStackBufferSize];
+     try
+     {
+         // Strictly decode the UTF-8 bytes to UTF-16 chars
+         OperationStatus status = System.Text.Unicode.Utf8.ToUtf16(
+             utf8Span,
+             charBuffer,
+             out _, // bytes consumed is not needed: anything other than Done takes the fallback below
+             out int charsWritten,
+             replaceInvalidSequences: false); // Causes OperationStatus.InvalidData to occur rather than replace
+
+         // NOTE: Any status other than Done (e.g. InvalidData) is handled by the fallback path below.
+         if (status == OperationStatus.Done)
+         {
+             // Whole input was well-formed UTF-8; only the first charsWritten chars are meaningful
+             return charBuffer.Slice(0, charsWritten).ToString();
+         }
+     }
+     finally
+     {
+         // Hand any rented buffer back to the pool; the finally block covers every exit path
+         ArrayPool<char>.Shared.ReturnIfNotNull(arrayToReturnToPool);
+     }
+
+     // Fallback to the default string representation if decoding fails
+     return termText.ToString();
+#else
    // the term might not be text, but usually is. so we make a best effort
    Encoding decoder = StandardCharsets.UTF_8.WithDecoderExceptionFallback();
    try
@@ -101,7 +145,9 @@ public static string ToString(BytesRef termText)
    {
        return termText.ToString();
    }
+#endif
}
+#nullable restore
/// <summary>
/// Returns the bytes of this term.