Skip to content

Commit

Permalink
Remove offset/length parameters from Span-based methods, #1024
Browse files Browse the repository at this point in the history
  • Loading branch information
paulirwin committed Dec 17, 2024
1 parent 0afee05 commit 95966c7
Showing 1 changed file with 60 additions and 138 deletions.
198 changes: 60 additions & 138 deletions src/Lucene.Net/Util/UnicodeUtil.cs
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ public static void UTF16toUTF8(ReadOnlySpan<char> source, BytesRef result)

while (i < end)
{
int code = (int)source[i++];
var code = (int)source[i++];

if (code < 0x80)
{
Expand Down Expand Up @@ -227,100 +227,7 @@ public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef r
if (source is null)
throw new ArgumentNullException(nameof(source));

UTF16toUTF8(source.AsSpan(), offset, length, result);
}

/// <summary>
/// Encode characters from a <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="char"/>) <paramref name="source"/>, starting at
/// <paramref name="offset"/> for <paramref name="length"/> chars. After encoding, <c>result.Offset</c> will always be 0.
/// </summary>
/// <exception cref="ArgumentNullException"><paramref name="result"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="offset"/> or <paramref name="length"/> is less than zero.
/// <para/>
/// -or-
/// <para/>
/// <paramref name="offset"/> and <paramref name="length"/> refer to a location outside of <paramref name="source"/>.
/// </exception>
/// <remarks>
/// LUCENENET specific overload.
/// </remarks>
// TODO: broken if incoming result.offset != 0
public static void UTF16toUTF8(ReadOnlySpan<char> source, int offset, int length, BytesRef result)
{
// LUCENENET: Added guard clauses
if (result is null)
throw new ArgumentNullException(nameof(result));
if (offset < 0)
throw new ArgumentOutOfRangeException(nameof(offset), $"{nameof(offset)} must not be negative.");
if (length < 0)
throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative.");
if (offset > source.Length - length) // Checks for int overflow
throw new ArgumentOutOfRangeException(nameof(length),
$"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}.");

int upto = 0;
int i = offset;
int end = offset + length;
var @out = result.Bytes;
// Pre-allocate for worst case 4-for-1
int maxLen = length * 4;
if (@out.Length < maxLen)
{
@out = result.Bytes = new byte[maxLen];
}

result.Offset = 0;

while (i < end)
{
int code = (int)source[i++];

if (code < 0x80)
{
@out[upto++] = (byte)code;
}
else if (code < 0x800)
{
@out[upto++] = (byte)(0xC0 | (code >> 6));
@out[upto++] = (byte)(0x80 | (code & 0x3F));
}
else if (code < 0xD800 || code > 0xDFFF)
{
@out[upto++] = (byte)(0xE0 | (code >> 12));
@out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F));
@out[upto++] = (byte)(0x80 | (code & 0x3F));
}
else
{
// surrogate pair
// confirm valid high surrogate
if (code < 0xDC00 && i < end)
{
var utf32 = (int)source[i];
// confirm valid low surrogate and write pair
if (utf32 >= 0xDC00 && utf32 <= 0xDFFF)
{
utf32 = (code << 10) + utf32 + SURROGATE_OFFSET;
i++;
@out[upto++] = (byte)(0xF0 | (utf32 >> 18));
@out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F));
@out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F));
@out[upto++] = (byte)(0x80 | (utf32 & 0x3F));
continue;
}
}

// replace unpaired surrogate or out-of-order low surrogate
// with substitution character
@out[upto++] = 0xEF;
@out[upto++] = 0xBF;
@out[upto++] = 0xBD;
}
}

//assert matches(source, offset, length, out, upto);
result.Length = upto;
UTF16toUTF8(source.AsSpan(offset, length), result);
}

/// <summary>
Expand Down Expand Up @@ -599,8 +506,8 @@ public static bool ValidUTF16String(ICharSequence s)
return true;
}

public static bool
ValidUTF16String(string s) // LUCENENET specific overload because string doesn't implement ICharSequence
// LUCENENET specific overload because string doesn't implement ICharSequence
public static bool ValidUTF16String(string s)
{
int size = s.Length;
for (int i = 0; i < size; i++)
Expand Down Expand Up @@ -638,9 +545,8 @@ public static bool
return true;
}

public static bool
ValidUTF16String(
StringBuilder s) // LUCENENET specific overload because StringBuilder doesn't implement ICharSequence
// LUCENENET specific overload because StringBuilder doesn't implement ICharSequence
public static bool ValidUTF16String(StringBuilder s)
{
int size = s.Length;
for (int i = 0; i < size; i++)
Expand Down Expand Up @@ -725,8 +631,8 @@ public static bool ValidUTF16String(ReadOnlySpan<char> s, int size)
* means illegal prefix. see RFC 2279 for details */
internal static readonly int[] utf8CodeLength = LoadUTF8CodeLength();

private static int[]
LoadUTF8CodeLength() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
// LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
private static int[] LoadUTF8CodeLength()
{
const int v = int.MinValue;
return new int[]
Expand Down Expand Up @@ -947,7 +853,7 @@ public static string NewString(ReadOnlySpan<int> codePoints, int offset, int cou
// instead proactively resizes the array instead of relying on exceptions + copy operations
public static char[] ToCharArray(int[] codePoints, int offset, int count)
{
return ToCharArray(codePoints.AsSpan(), offset, count);
return ToCharArray(codePoints.AsSpan(offset), count);
}

/// <summary>
Expand All @@ -956,13 +862,12 @@ public static char[] ToCharArray(int[] codePoints, int offset, int count)
/// LUCENENET specific.
/// </summary>
/// <param name="codePoints"> The code span. </param>
/// <param name="offset"> The start of the text in the code point span. </param>
/// <param name="count"> The number of code points. </param>
/// <returns> a char array representing the first <paramref name="count"/> code points of <paramref name="codePoints"/>. </returns>
// LUCENENET NOTE: This code was originally in the NewString() method (above).
// It has been refactored from the original to remove the exception throw/catch; it now
// sizes the array up front rather than relying on exceptions + copy operations
public static char[] ToCharArray(ReadOnlySpan<int> codePoints, int offset, int count)
public static char[] ToCharArray(ReadOnlySpan<int> codePoints, int count)
{
if (count < 0)
{
Expand All @@ -977,7 +882,7 @@ public static char[] ToCharArray(ReadOnlySpan<int> codePoints, int offset, int c
if (count > countThreshold)
{
arrayLength = 0;
for (int r = offset, e = offset + count; r < e; ++r)
for (int r = 0; r < count; ++r)
{
arrayLength += codePoints[r] < 0x010000 ? 1 : 2;
}
Expand All @@ -990,7 +895,7 @@ public static char[] ToCharArray(ReadOnlySpan<int> codePoints, int offset, int c
// It is now safe to assume we have enough space for all of the characters.
char[] chars = new char[arrayLength];
int w = 0;
for (int r = offset, e = offset + count; r < e; ++r)
for (int r = 0; r < count; ++r)
{
int cp = codePoints[r];
if (cp < 0 || cp > 0x10ffff)
Expand Down Expand Up @@ -1064,10 +969,11 @@ public static string ToHexString(string s)
/// can result in an <see cref="IndexOutOfRangeException"/> if invalid UTF-8 is passed).
/// Explicit checks for valid UTF-8 are not performed.
/// </summary>
/// <seealso cref="UTF8toUTF16(ReadOnlySpan{byte}, CharsRef)"/>
// TODO: broken if chars.offset != 0
public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars)
{
UTF8toUTF16(utf8.AsSpan(), offset, length, chars);
UTF8toUTF16(utf8.AsSpan(offset, length), chars);
}

/// <summary>
Expand All @@ -1082,33 +988,34 @@ public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef cha
/// LUCENENET specific overload.
/// </remarks>
// TODO: broken if chars.offset != 0
public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int length, CharsRef chars)
public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, CharsRef chars)
{
int out_offset = chars.Offset = 0;
char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, length);
int limit = offset + length;
while (offset < limit)
char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, utf8.Length);
int i = 0;

while (i < utf8.Length)
{
int b = utf8[offset++] & 0xff;
int b = utf8[i++] & 0xff;
if (b < 0xc0)
{
if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80);
@out[out_offset++] = (char)b;
}
else if (b < 0xe0)
{
@out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f));
@out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f));
}
else if (b < 0xf0)
{
@out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f));
offset += 2;
@out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f));
i += 2;
}
else
{
if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b);
int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f);
offset += 3;
int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f);
i += 3;
if (ch < UNI_MAX_BMP)
{
@out[out_offset++] = (char)ch;
Expand All @@ -1132,55 +1039,70 @@ public static void UTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int length,
/// <remarks>
/// LUCENENET specific: This overload takes a byte[] with an offset and length and forwards that slice to the <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="byte"/>) overload.
/// </remarks>
public static bool TryUTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int length, [NotNullWhen(true)] out CharsRef? chars)
/// <seealso cref="UTF8toUTF16(ReadOnlySpan{byte}, CharsRef)"/>
public static bool TryUTF8toUTF16(byte[] utf8, int offset, int length, [NotNullWhen(true)] out CharsRef? chars)
{
CharsRef result = new CharsRef(length);
return TryUTF8toUTF16(utf8.AsSpan(offset, length), out chars);
}

/// <summary>
/// Tries to interpret the given byte span as UTF-8 and convert to UTF-16, providing the result in a new <see cref="CharsRef"/>.
/// <para/>
/// NOTE: Explicit checks for valid UTF-8 are not performed.
/// </summary>
/// <remarks>
/// LUCENENET specific: This method uses <see cref="ReadOnlySpan{T}"/> (with generic type argument <see cref="byte"/>) instead of byte[].
/// </remarks>
public static bool TryUTF8toUTF16(ReadOnlySpan<byte> utf8, [NotNullWhen(true)] out CharsRef? chars)
{
CharsRef result = new CharsRef(utf8.Length);
int out_offset = 0;
char[] @out = result.Chars;
int limit = offset + length;
while (offset < limit)
int i = 0;

while (i < utf8.Length)
{
if (utf8.Length <= offset)
if (utf8.Length <= i)
{
chars = null;
return false;
}

int b = utf8[offset++] & 0xff;
int b = utf8[i++] & 0xff;
if (b < 0xc0)
{
if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80);
@out[out_offset++] = (char)b;
}
else if (b < 0xe0)
{
if (utf8.Length <= offset)
if (utf8.Length <= i)
{
chars = null;
return false;
}
@out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f));
@out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f));
}
else if (b < 0xf0)
{
if (utf8.Length <= offset + 1)
if (utf8.Length <= i + 1)
{
chars = null;
return false;
}
@out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f));
offset += 2;
@out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f));
i += 2;
}
else
{
if (utf8.Length <= offset + 2)
if (utf8.Length <= i + 2)
{
chars = null;
return false;
}
if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b);
int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f);
offset += 3;
int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f);
i += 3;
if (ch < UNI_MAX_BMP)
{
@out[out_offset++] = (char)ch;
Expand All @@ -1199,20 +1121,20 @@ public static bool TryUTF8toUTF16(ReadOnlySpan<byte> utf8, int offset, int lengt
}

/// <summary>
/// Utility method for <see cref="UTF8toUTF16(ReadOnlySpan{byte}, int, int, CharsRef)"/> </summary>
/// <seealso cref="UTF8toUTF16(ReadOnlySpan{byte}, int, int, CharsRef)"/>
/// Utility method for <see cref="UTF8toUTF16(ReadOnlySpan{byte}, CharsRef)"/> </summary>
/// <seealso cref="UTF8toUTF16(ReadOnlySpan{byte}, CharsRef)"/>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars)
{
UTF8toUTF16(bytesRef.Bytes.AsSpan(), bytesRef.Offset, bytesRef.Length, chars);
UTF8toUTF16(bytesRef.Bytes.AsSpan(bytesRef.Offset, bytesRef.Length), chars);
}

/// <summary>
/// Utility method for <see cref="TryUTF8toUTF16(ReadOnlySpan{byte}, int, int, out CharsRef)"/> </summary>
/// <seealso cref="TryUTF8toUTF16(ReadOnlySpan{byte}, int, int, out CharsRef)"/>
/// Utility method for <see cref="TryUTF8toUTF16(ReadOnlySpan{byte}, out CharsRef)"/> </summary>
/// <seealso cref="TryUTF8toUTF16(ReadOnlySpan{byte}, out CharsRef)"/>
public static bool TryUTF8toUTF16(BytesRef bytesRef, out CharsRef? chars)
{
return TryUTF8toUTF16(bytesRef.Bytes.AsSpan(), bytesRef.Offset, bytesRef.Length, out chars);
return TryUTF8toUTF16(bytesRef.Bytes.AsSpan(bytesRef.Offset, bytesRef.Length), out chars);
}
}
}

0 comments on commit 95966c7

Please sign in to comment.