From 95966c7f886a7b7d81f0f39af51c297300164360 Mon Sep 17 00:00:00 2001 From: Paul Irwin Date: Tue, 17 Dec 2024 13:22:46 -0700 Subject: [PATCH] Remove offset/length parameters from Span-based methods, #1024 --- src/Lucene.Net/Util/UnicodeUtil.cs | 198 +++++++++-------------------- 1 file changed, 60 insertions(+), 138 deletions(-) diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs index 75af2b53ea..af53ec15d5 100644 --- a/src/Lucene.Net/Util/UnicodeUtil.cs +++ b/src/Lucene.Net/Util/UnicodeUtil.cs @@ -159,7 +159,7 @@ public static void UTF16toUTF8(ReadOnlySpan source, BytesRef result) while (i < end) { - int code = (int)source[i++]; + var code = (int)source[i++]; if (code < 0x80) { @@ -227,100 +227,7 @@ public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef r if (source is null) throw new ArgumentNullException(nameof(source)); - UTF16toUTF8(source.AsSpan(), offset, length, result); - } - - /// - /// Encode characters from a (with generic type argument ) , starting at - /// for chars. After encoding, result.Offset will always be 0. - /// - /// is null. - /// - /// or is less than zero. - /// - /// -or- - /// - /// and refer to a location outside of . - /// - /// - /// LUCENENET specific overload. - /// - // TODO: broken if incoming result.offset != 0 - public static void UTF16toUTF8(ReadOnlySpan source, int offset, int length, BytesRef result) - { - // LUCENENET: Added guard clauses - if (result is null) - throw new ArgumentNullException(nameof(result)); - if (offset < 0) - throw new ArgumentOutOfRangeException(nameof(offset), $"{nameof(offset)} must not be negative."); - if (length < 0) - throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative."); - if (offset > source.Length - length) // Checks for int overflow - throw new ArgumentOutOfRangeException(nameof(length), - $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); - - int upto = 0; - int i = offset; - int end = offset + length; - var @out = result.Bytes; - // Pre-allocate for worst case 4-for-1 - int maxLen = length * 4; - if (@out.Length < maxLen) - { - @out = result.Bytes = new byte[maxLen]; - } - - result.Offset = 0; - - while (i < end) - { - int code = (int)source[i++]; - - if (code < 0x80) - { - @out[upto++] = (byte)code; - } - else if (code < 0x800) - { - @out[upto++] = (byte)(0xC0 | (code >> 6)); - @out[upto++] = (byte)(0x80 | (code & 0x3F)); - } - else if (code < 0xD800 || code > 0xDFFF) - { - @out[upto++] = (byte)(0xE0 | (code >> 12)); - @out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F)); - @out[upto++] = (byte)(0x80 | (code & 0x3F)); - } - else - { - // surrogate pair - // confirm valid high surrogate - if (code < 0xDC00 && i < end) - { - var utf32 = (int)source[i]; - // confirm valid low surrogate and write pair - if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) - { - utf32 = (code << 10) + utf32 + SURROGATE_OFFSET; - i++; - @out[upto++] = (byte)(0xF0 | (utf32 >> 18)); - @out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F)); - @out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F)); - @out[upto++] = (byte)(0x80 | (utf32 & 0x3F)); - continue; - } - } - - // replace unpaired surrogate or out-of-order low surrogate - // with substitution character - @out[upto++] = 0xEF; - @out[upto++] = 0xBF; - @out[upto++] = 0xBD; - } - } - - //assert matches(source, offset, length, out, upto); - result.Length = upto; + UTF16toUTF8(source.AsSpan(offset, length), result); } /// @@ -599,8 +506,8 @@ public static bool ValidUTF16String(ICharSequence s) return true; } - public static bool - ValidUTF16String(string s) // LUCENENET specific overload because string doesn't implement ICharSequence + // LUCENENET specific overload because string doesn't implement ICharSequence + public static bool ValidUTF16String(string s) { int size = s.Length; for (int i = 0; i < size; i++) @@ -638,9 +545,8 @@ public static bool return true; } - public static bool - ValidUTF16String( - StringBuilder s) // LUCENENET specific overload because StringBuilder doesn't implement ICharSequence + // LUCENENET specific overload because StringBuilder doesn't implement ICharSequence + public static bool ValidUTF16String(StringBuilder s) { int size = s.Length; for (int i = 0; i < size; i++) @@ -725,8 +631,8 @@ public static bool ValidUTF16String(ReadOnlySpan s, int size) * means illegal prefix. see RFC 2279 for details */ internal static readonly int[] utf8CodeLength = LoadUTF8CodeLength(); - private static int[] - LoadUTF8CodeLength() // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006) + // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006) + private static int[] LoadUTF8CodeLength() { const int v = int.MinValue; return new int[] @@ -947,7 +853,7 @@ public static string NewString(ReadOnlySpan codePoints, int offset, int cou // instead proactively resizes the array instead of relying on exceptions + copy operations public static char[] ToCharArray(int[] codePoints, int offset, int count) { - return ToCharArray(codePoints.AsSpan(), offset, count); + return ToCharArray(codePoints.AsSpan(offset), count); } /// @@ -956,13 +862,12 @@ public static char[] ToCharArray(int[] codePoints, int offset, int count) /// LUCENENET specific. /// /// The code span. - /// The start of the text in the code point span. /// The number of code points. /// a char array representing the code points between offset and count. // LUCENENET NOTE: This code was originally in the NewString() method (above). // It has been refactored from the original to remove the exception throw/catch and // instead proactively resizes the array instead of relying on exceptions + copy operations - public static char[] ToCharArray(ReadOnlySpan codePoints, int offset, int count) + public static char[] ToCharArray(ReadOnlySpan codePoints, int count) { if (count < 0) { @@ -977,7 +882,7 @@ public static char[] ToCharArray(ReadOnlySpan codePoints, int offset, int c if (count > countThreshold) { arrayLength = 0; - for (int r = offset, e = offset + count; r < e; ++r) + for (int r = 0; r < count; ++r) { arrayLength += codePoints[r] < 0x010000 ? 1 : 2; } @@ -990,7 +895,7 @@ public static char[] ToCharArray(ReadOnlySpan codePoints, int offset, int c // It is now safe to assume we have enough space for all of the characters. char[] chars = new char[arrayLength]; int w = 0; - for (int r = offset, e = offset + count; r < e; ++r) + for (int r = 0; r < count; ++r) { int cp = codePoints[r]; if (cp < 0 || cp > 0x10ffff) @@ -1064,10 +969,11 @@ public static string ToHexString(string s) /// can result in an if invalid UTF-8 is passed). /// Explicit checks for valid UTF-8 are not performed. /// + /// // TODO: broken if chars.offset != 0 public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef chars) { - UTF8toUTF16(utf8.AsSpan(), offset, length, chars); + UTF8toUTF16(utf8.AsSpan(offset, length), chars); } /// @@ -1082,14 +988,15 @@ public static void UTF8toUTF16(byte[] utf8, int offset, int length, CharsRef cha /// LUCENENET specific overload. /// // TODO: broken if chars.offset != 0 - public static void UTF8toUTF16(ReadOnlySpan utf8, int offset, int length, CharsRef chars) + public static void UTF8toUTF16(ReadOnlySpan utf8, CharsRef chars) { int out_offset = chars.Offset = 0; - char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, length); - int limit = offset + length; - while (offset < limit) + char[] @out = chars.Chars = ArrayUtil.Grow(chars.Chars, utf8.Length); + int i = 0; + + while (i < utf8.Length) { - int b = utf8[offset++] & 0xff; + int b = utf8[i++] & 0xff; if (b < 0xc0) { if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80); @@ -1097,18 +1004,18 @@ public static void UTF8toUTF16(ReadOnlySpan utf8, int offset, int length, } else if (b < 0xe0) { - @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f)); + @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f)); } else if (b < 0xf0) { - @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f)); - offset += 2; + @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f)); + i += 2; } else { if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b); - int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f); - offset += 3; + int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f); + i += 3; if (ch < UNI_MAX_BMP) { @out[out_offset++] = (char)ch; @@ -1132,21 +1039,36 @@ public static void UTF8toUTF16(ReadOnlySpan utf8, int offset, int length, /// /// LUCENENET specific: This method uses (with generic type argument ) instead of byte[]. /// - public static bool TryUTF8toUTF16(ReadOnlySpan utf8, int offset, int length, [NotNullWhen(true)] out CharsRef? chars) + /// + public static bool TryUTF8toUTF16(byte[] utf8, int offset, int length, [NotNullWhen(true)] out CharsRef? chars) { - CharsRef result = new CharsRef(length); + return TryUTF8toUTF16(utf8.AsSpan(offset, length), out chars); + } + + /// + /// Tries to interpret the given byte span as UTF-8 and convert to UTF-16, providing the result in a new . + /// + /// NOTE: Explicit checks for valid UTF-8 are not performed. + /// + /// + /// LUCENENET specific: This method uses (with generic type argument ) instead of byte[]. + /// + public static bool TryUTF8toUTF16(ReadOnlySpan utf8, [NotNullWhen(true)] out CharsRef? chars) + { + CharsRef result = new CharsRef(utf8.Length); int out_offset = 0; char[] @out = result.Chars; - int limit = offset + length; - while (offset < limit) + int i = 0; + + while (i < utf8.Length) { - if (utf8.Length <= offset) + if (utf8.Length <= i) { chars = null; return false; } - int b = utf8[offset++] & 0xff; + int b = utf8[i++] & 0xff; if (b < 0xc0) { if (Debugging.AssertsEnabled) Debugging.Assert(b < 0x80); @@ -1154,33 +1076,33 @@ public static bool TryUTF8toUTF16(ReadOnlySpan utf8, int offset, int lengt } else if (b < 0xe0) { - if (utf8.Length <= offset) + if (utf8.Length <= i) { chars = null; return false; } - @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[offset++] & 0x3f)); + @out[out_offset++] = (char)(((b & 0x1f) << 6) + (utf8[i++] & 0x3f)); } else if (b < 0xf0) { - if (utf8.Length <= offset + 1) + if (utf8.Length <= i + 1) { chars = null; return false; } - @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[offset] & 0x3f) << 6) + (utf8[offset + 1] & 0x3f)); - offset += 2; + @out[out_offset++] = (char)(((b & 0xf) << 12) + ((utf8[i] & 0x3f) << 6) + (utf8[i + 1] & 0x3f)); + i += 2; } else { - if (utf8.Length <= offset + 2) + if (utf8.Length <= i + 2) { chars = null; return false; } if (Debugging.AssertsEnabled) Debugging.Assert(b < 0xf8, "b = 0x{0:x}", b); - int ch = ((b & 0x7) << 18) + ((utf8[offset] & 0x3f) << 12) + ((utf8[offset + 1] & 0x3f) << 6) + (utf8[offset + 2] & 0x3f); - offset += 3; + int ch = ((b & 0x7) << 18) + ((utf8[i] & 0x3f) << 12) + ((utf8[i + 1] & 0x3f) << 6) + (utf8[i + 2] & 0x3f); + i += 3; if (ch < UNI_MAX_BMP) { @out[out_offset++] = (char)ch; @@ -1199,20 +1121,20 @@ public static bool TryUTF8toUTF16(ReadOnlySpan utf8, int offset, int lengt } /// - /// Utility method for - /// + /// Utility method for + /// [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void UTF8toUTF16(BytesRef bytesRef, CharsRef chars) { - UTF8toUTF16(bytesRef.Bytes.AsSpan(), bytesRef.Offset, bytesRef.Length, chars); + UTF8toUTF16(bytesRef.Bytes.AsSpan(bytesRef.Offset, bytesRef.Length), chars); } /// - /// Utility method for - /// + /// Utility method for + /// public static bool TryUTF8toUTF16(BytesRef bytesRef, out CharsRef? chars) { - return TryUTF8toUTF16(bytesRef.Bytes.AsSpan(), bytesRef.Offset, bytesRef.Length, out chars); + return TryUTF8toUTF16(bytesRef.Bytes.AsSpan(bytesRef.Offset, bytesRef.Length), out chars); } } }