From c043cbdd6b410e51c8a1c35ca39fa03a877e6e90 Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sat, 11 Jan 2025 06:11:10 +0700 Subject: [PATCH 01/12] Add support for System.Text.Unicode.Utf8 This commit introduces the `System.Text.Unicode.Utf8` type to the `Microsoft.Bcl.Memory` library. It includes type forwarding for `Utf8` in `Microsoft.Bcl.Memory.Forwards.cs`, updates the documentation in `PACKAGE.md` to include `Utf8` functionality, and adds corresponding test cases in `Microsoft.Bcl.Memory.Tests.csproj`. The documentation now emphasizes `Utf8` alongside `Index`, `Range`, and `Base64Url`, highlighting its role in converting data between UTF-8 and UTF-16 encodings. --- .../src/Microsoft.Bcl.Memory.Forwards.cs | 3 +++ src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md | 4 ++++ .../tests/Microsoft.Bcl.Memory.Tests.csproj | 7 +++++++ 3 files changed, 14 insertions(+) diff --git a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.Forwards.cs b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.Forwards.cs index bf96db3ece9e41..e3fdeb43392ee0 100644 --- a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.Forwards.cs +++ b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.Forwards.cs @@ -3,6 +3,9 @@ [assembly: System.Runtime.CompilerServices.TypeForwardedTo(typeof(System.Index))] [assembly: System.Runtime.CompilerServices.TypeForwardedTo(typeof(System.Range))] +#if NET8_0_OR_GREATER +[assembly: System.Runtime.CompilerServices.TypeForwardedTo(typeof(System.Text.Unicode.Utf8))] +#endif #if NET9_0_OR_GREATER [assembly: System.Runtime.CompilerServices.TypeForwardedTo(typeof(System.Buffers.Text.Base64Url))] #endif diff --git a/src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md b/src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md index eac081364db99d..af99a7b334b050 100644 --- a/src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md +++ b/src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md @@ -2,6 +2,7 @@ Provides `Index` and `Range` 
types to simplify slicing operations on collections for .NET Framework and .NET Standard 2.0. Provides `Base64Url` for encoding data in a URL-safe manner on .NET Framework and .NET Standard. +Provides `Utf8` for converting chunked data between UTF-8 and UTF-16 encodings on .NET Framework and .NET Standard. This library is not necessary nor recommended when targeting versions of .NET that include the relevant support. @@ -11,6 +12,7 @@ This library is not necessary nor recommended when targeting versions of .NET th * Enables the use of `Index` and `Range` types on older .NET platforms. * Provides `Base64Url` encoding, decoding, and validation for URL-safe data processing on older .NET platforms. +* Provides `Utf8` encoding, decoding, and validation for chunked data between UTF-8 and UTF-16 on older .NET platforms. ## How to Use @@ -64,6 +66,7 @@ The main types provided by this library are: * `System.Index` * `System.Range` * `System.Buffers.Text.Base64Url` +* `System.Text.Unicode.Utf8` ## Additional Documentation @@ -74,6 +77,7 @@ API documentation * [System.Index](https://learn.microsoft.com/dotnet/api/system.index) * [System.Range](https://learn.microsoft.com/dotnet/api/system.range) * [System.Buffers.Text.Base64Url](https://learn.microsoft.com/dotnet/api/system.buffers.text.base64url) +* [System.Text.Unicode.Utf8](https://learn.microsoft.com/dotnet/api/system.text.unicode.utf8) ## Feedback & Contributing diff --git a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj index b7458269bedff7..8aea9932efadde 100644 --- a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj @@ -32,6 +32,13 @@ + + + + System\Text\Unicode\Utf8Tests.cs + + + From f60df7c2558efb4217cfbdd4cd6f56ff97da47f6 Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sat, 11 Jan 2025 19:49:41 
+0700 Subject: [PATCH 02/12] Enhance UTF-8 and Unicode support in BCL Memory Updated `PackageDescription` to include "Utf8" support. Added new `ItemGroup` for conditional compilation of UTF-8 and Unicode handling files for non-net8.0 frameworks. Modified visibility and implementations in `Ascii.Utility.Helpers.cs`, `Utf8.cs`, and `Utf8Utility` based on `MICROSOFT_BCL_MEMORY` define. --- .../src/Microsoft.Bcl.Memory.csproj | 14 +- .../src/System/Text/Ascii.Utility.Helpers.cs | 45 +++- .../src/System/Text/Ascii.Utility.cs | 43 +++- .../src/System/Text/Unicode/Utf8.cs | 198 +++++++++++++++++- .../Text/Unicode/Utf8Utility.Helpers.cs | 4 + .../Text/Unicode/Utf8Utility.Transcoding.cs | 10 + .../Text/Unicode/Utf8Utility.Validation.cs | 16 +- .../src/System/Text/Unicode/Utf8Utility.cs | 4 + .../src/System/Text/UnicodeUtility.cs | 10 + 9 files changed, 336 insertions(+), 8 deletions(-) diff --git a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj index c542c078d20cb6..4f9a741965fd28 100644 --- a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj @@ -5,7 +5,7 @@ true $(DefineConstants);MICROSOFT_BCL_MEMORY true - Provides Base64Url, Index and Range types support for .NET Framework and .NET Standard. + Provides Base64Url, Utf8, Index and Range types support for .NET Framework and .NET Standard. 
@@ -27,6 +27,18 @@ + + + + + + + + + + + + diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs index 30467a1843a32a..5d22d8a29f5f6b 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs @@ -4,10 +4,16 @@ using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; namespace System.Text { - public static partial class Ascii +#if !MICROSOFT_BCL_MEMORY + public +#else + internal +#endif + static partial class Ascii { /// /// A mask which selects only the high bit of each byte of the given . @@ -44,7 +50,11 @@ internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiDat if (BitConverter.IsLittleEndian) { +#if !MICROSOFT_BCL_MEMORY return (uint)BitOperations.TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3; +#else + return (uint)TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3; +#endif } else { @@ -60,22 +70,55 @@ internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiDat // expensive. Instead we'll just change how we perform the shifts. 
// Read first byte +#if !MICROSOFT_BCL_MEMORY value = BitOperations.RotateLeft(value, 1); +#else + value = (value << 1) | (value >> (32 - 1)); +#endif uint allBytesUpToNowAreAscii = value & 1; uint numAsciiBytes = allBytesUpToNowAreAscii; // Read second byte +#if !MICROSOFT_BCL_MEMORY value = BitOperations.RotateLeft(value, 8); +#else + value = (value << 8) | (value >> (32 - 8)); +#endif allBytesUpToNowAreAscii &= value; numAsciiBytes += allBytesUpToNowAreAscii; // Read third byte +#if !MICROSOFT_BCL_MEMORY value = BitOperations.RotateLeft(value, 8); +#else + value = (value << 8) | (value >> (32 - 8)); +#endif allBytesUpToNowAreAscii &= value; numAsciiBytes += allBytesUpToNowAreAscii; return numAsciiBytes; } } + +#if MICROSOFT_BCL_MEMORY + private static ReadOnlySpan TrailingZeroCountDeBruijn => // 32 + [ + 00, 01, 28, 02, 29, 14, 24, 03, + 30, 22, 20, 15, 25, 17, 04, 08, + 31, 27, 13, 23, 21, 19, 16, 07, + 26, 12, 18, 06, 11, 05, 10, 09 + ]; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int TrailingZeroCount(uint value) + { + // uint.MaxValue >> 27 is always in range [0 - 31] so we use Unsafe.AddByteOffset to avoid bounds check + return Unsafe.AddByteOffset( + // Using deBruijn sequence, k=2, n=5 (2^5=32) : 0b_0000_0111_0111_1100_1011_0101_0011_0001u + ref MemoryMarshal.GetReference(TrailingZeroCountDeBruijn), + // uint|long -> IntPtr cast on 32-bit platforms does expensive overflow checks not needed here + (IntPtr)(int)(((value & (uint)-(int)value) * 0x077CB531u) >> 27)); // Multi-cast mitigates redundant conv.u8 + } +#endif } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 580550467fe589..61fe8e4a55df3e 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -4,14 +4,21 @@ using System.Diagnostics; using 
System.Numerics; using System.Runtime.CompilerServices; +#if !MICROSOFT_BCL_MEMORY using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.Wasm; using System.Runtime.Intrinsics.X86; +#endif namespace System.Text { - public static partial class Ascii +#if !MICROSOFT_BCL_MEMORY + public +#else + internal +#endif + static partial class Ascii { /// /// Returns iff all bytes in are ASCII. @@ -53,6 +60,7 @@ private static bool AllCharsInUInt64AreAscii(ulong value) : AllCharsInUInt64AreAscii(value); } +#if !MICROSOFT_BCL_MEMORY [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] private static int GetIndexOfFirstNonAsciiByteInLane_AdvSimd(Vector128 value, Vector128 bitmask) @@ -75,6 +83,7 @@ private static int GetIndexOfFirstNonAsciiByteInLane_AdvSimd(Vector128 val Debug.Assert((mask != 0) ? index < 16 : index >= 16); return index; } +#endif /// /// Given a DWORD which represents two packed chars in machine-endian order, @@ -102,6 +111,7 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bu // like pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while // this method is running. +#if !MICROSOFT_BCL_MEMORY if (!Vector512.IsHardwareAccelerated && !Vector256.IsHardwareAccelerated && (Sse2.IsSupported || AdvSimd.IsSupported)) @@ -109,7 +119,9 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bu return GetIndexOfFirstNonAsciiByte_Intrinsified(pBuffer, bufferLength); } else +#endif { + // Handles Vector512, Vector256, Vector128, and scalar. return GetIndexOfFirstNonAsciiByte_Vector(pBuffer, bufferLength); } @@ -128,6 +140,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu // Note use of SBYTE instead of BYTE below; we're using the two's-complement // representation of negative integers to act as a surrogate for "is ASCII?". 
+#if !MICROSOFT_BCL_MEMORY if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) { if (Vector512.Load(pBuffer).ExtractMostSignificantBits() == 0) @@ -236,6 +249,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu bufferLength += (nuint)pOriginalBuffer; } } +#endif // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code @@ -332,6 +346,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu goto Finish; } +#if !MICROSOFT_BCL_MEMORY [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool ContainsNonAsciiByte_Sse2(uint sseMask) { @@ -702,6 +717,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Intrinsified(byte* pBuff goto Finish; } +#endif /// /// Returns the index in where the first non-ASCII char is found. @@ -716,6 +732,7 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bu // like pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while // this method is running. +#if !MICROSOFT_BCL_MEMORY if (!Vector512.IsHardwareAccelerated && !Vector256.IsHardwareAccelerated && (Sse2.IsSupported || AdvSimd.IsSupported)) @@ -723,6 +740,7 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bu return GetIndexOfFirstNonAsciiChar_Intrinsified(pBuffer, bufferLength); } else +#endif { // Handles Vector512, Vector256, Vector128, and scalar. return GetIndexOfFirstNonAsciiChar_Vector(pBuffer, bufferLength); @@ -740,6 +758,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char)); #endif +#if !MICROSOFT_BCL_MEMORY // Before we drain off char-by-char, try a generic vectorized loop. // Only run the loop if we have at least two vectors we can pull out. 
if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) @@ -849,7 +868,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu bufferLength -= ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char); } } - +#endif // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code @@ -932,6 +951,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu goto Finish; } +#if !MICROSOFT_BCL_MEMORY private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuffer, nuint bufferLength /* in chars */) { // This method contains logic optimized using vector instructions for both x64 and Arm64. @@ -1235,6 +1255,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuff goto Finish; } +#endif /// /// Given a QWORD which represents a buffer of 4 ASCII chars in machine-endian order, @@ -1246,6 +1267,7 @@ private static void NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref byte outputB { Debug.Assert(AllCharsInUInt64AreAscii(value)); +#if !MICROSOFT_BCL_MEMORY if (Sse2.X64.IsSupported) { // Narrows a vector of words [ w0 w1 w2 w3 ] to a vector of bytes @@ -1264,8 +1286,8 @@ private static void NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref byte outputB Vector64 lower = AdvSimd.ExtractNarrowingSaturateUnsignedLower(vecWide); Unsafe.WriteUnaligned(ref outputBuffer, lower.AsUInt32().ToScalar()); } - else +#endif { if (BitConverter.IsLittleEndian) { @@ -1325,6 +1347,7 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii uint utf16Data32BitsHigh = 0, utf16Data32BitsLow = 0; ulong utf16Data64Bits = 0; +#if !MICROSOFT_BCL_MEMORY if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector128.Count) { // Since there's overhead to setting up the vectorized code 
path, we only want to @@ -1361,6 +1384,7 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii currentOffset = NarrowUtf16ToAscii_Intrinsified(pUtf16Buffer, pAsciiBuffer, elementCount); } } +#endif Debug.Assert(currentOffset <= elementCount); nuint remainingElementCount = elementCount - currentOffset; @@ -1496,6 +1520,7 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii goto Finish; } +#if !MICROSOFT_BCL_MEMORY [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool VectorContainsNonAsciiChar(Vector128 asciiVector) { @@ -2032,6 +2057,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff goto Finish; } +#endif /// /// Copies as many ASCII bytes (00..7F) as possible from @@ -2044,6 +2070,7 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B // Intrinsified in mono interpreter nuint currentOffset = 0; +#if !MICROSOFT_BCL_MEMORY if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= (uint)Vector128.Count) { if (Vector512.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector512.Count) @@ -2059,6 +2086,7 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B WidenAsciiToUtf1_Vector, Vector128>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount); } } +#endif Debug.Assert(currentOffset <= elementCount); nuint remainingElementCount = elementCount - currentOffset; @@ -2149,7 +2177,11 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B { while ((asciiData & 0x80000000) == 0) { +#if !MICROSOFT_BCL_MEMORY asciiData = BitOperations.RotateLeft(asciiData, 8); +#else + asciiData = (asciiData << 8) | (asciiData >> (32 - 8)); ; +#endif pUtf16Buffer[currentOffset] = (char)(byte)asciiData; currentOffset++; } @@ -2158,6 +2190,7 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B goto Finish; } +#if 
!MICROSOFT_BCL_MEMORY [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void WidenAsciiToUtf1_Vector(byte* pAsciiBuffer, char* pUtf16Buffer, ref nuint currentOffset, nuint elementCount) where TVectorByte : unmanaged, ISimdVector @@ -2235,7 +2268,7 @@ private static (TVectorUInt16 Lower, TVectorUInt16 Upper) Widen /// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and @@ -2246,6 +2279,7 @@ internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputB { Debug.Assert(AllBytesInUInt32AreAscii(value)); +#if !MICROSOFT_BCL_MEMORY if (AdvSimd.Arm64.IsSupported) { Vector128 vecNarrow = AdvSimd.DuplicateToVector128(value).AsByte(); @@ -2259,6 +2293,7 @@ internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputB Unsafe.WriteUnaligned(ref Unsafe.As(ref outputBuffer), vecWide.ToScalar()); } else +#endif { if (BitConverter.IsLittleEndian) { diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs index cb4cd8bde42216..630d8db69ede89 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs @@ -9,7 +9,10 @@ namespace System.Text.Unicode { -#if SYSTEM_PRIVATE_CORELIB +#if SYSTEM_PRIVATE_CORELIB || MICROSOFT_BCL_MEMORY + /// + /// Provides methods for transcoding between UTF-8 and UTF-16. + /// public #else internal @@ -200,7 +203,11 @@ public static unsafe OperationStatus ToUtf16(ReadOnlySpan source, Span source, Span source, Span destination, out int bytesRead, out int charsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true) { // NOTE: Changes to this method should be kept in sync with ToUtf16 above. @@ -807,6 +815,7 @@ private bool Fail() return false; } } +#endif /// /// Validates that the value is well-formed UTF-8. 
@@ -815,5 +824,192 @@ private bool Fail() /// true if value is well-formed UTF-8, false otherwise. public static bool IsValid(ReadOnlySpan value) => Utf8Utility.GetIndexOfFirstInvalidUtf8Sequence(value, out _) < 0; + +#if MICROSOFT_BCL_MEMORY + /// + /// Decodes the Rune at the beginning of the provided UTF-8 source buffer. + /// + /// + /// + /// If the source buffer begins with a valid UTF-8 encoded scalar value, returns , + /// and outs via the decoded Runeand via the + /// number of s used in the input buffer to encode the Rune. + /// + /// + /// If the source buffer is empty or contains only a partial UTF-8 subsequence, returns , + /// and outs via ReplacementChar and via the length of the input buffer. + /// + /// + /// If the source buffer begins with an ill-formed UTF-8 encoded scalar value, returns , + /// and outs via ReplacementChar and via the number of + /// s used in the input buffer to encode the ill-formed sequence. + /// + /// + /// + /// The general calling convention is to call this method in a loop, slicing the buffer by + /// elements on each iteration of the loop. On each iteration of the loop + /// will contain the real scalar value if successfully decoded, or it will contain ReplacementChar if + /// the data could not be successfully decoded. This pattern provides convenient automatic U+FFFD substitution of + /// invalid sequences while iterating through the loop. + /// + private static OperationStatus DecodeFromUtf8(ReadOnlySpan source, out uint result, out int bytesConsumed) + { + // This method follows the Unicode Standard's recommendation for detecting + // the maximal subpart of an ill-formed subsequence. See The Unicode Standard, + // Ch. 3.9 for more details. In summary, when reporting an invalid subsequence, + // it tries to consume as many code units as possible as long as those code + // units constitute the beginning of a longer well-formed subsequence per Table 3-7. + + // Try reading source[0]. 
+ + int index = 0; + if (source.IsEmpty) + { + goto NeedsMoreData; + } + + uint tempValue = source[0]; + if (UnicodeUtility.IsAsciiCodePoint(tempValue)) + { + bytesConsumed = 1; + result = tempValue; + return OperationStatus.Done; + } + + // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in + // the range [C2..F4]. If it's outside of that range, it's either a standalone + // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range + // four-byte sequence. + + // Try reading source[1]. + + index = 1; + if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4)) + { + goto Invalid; + } + + tempValue = (tempValue - 0xC2) << 6; + + if (source.Length <= 1) + { + goto NeedsMoreData; + } + + // Continuation bytes are of the form [10xxxxxx], which means that their two's + // complement representation is in the range [-65..-128]. This allows us to + // perform a single comparison to see if a byte is a continuation byte. + + int thisByteSignExtended = (sbyte)source[1]; + if (thisByteSignExtended >= -64) + { + goto Invalid; + } + + tempValue += (uint)thisByteSignExtended; + tempValue += 0x80; // remove the continuation byte marker + tempValue += (0xC2 - 0xC0) << 6; // remove the leading byte marker + + if (tempValue < 0x0800) + { + Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0080, 0x07FF)); + goto Finish; // this is a valid 2-byte sequence + } + + // This appears to be a 3- or 4-byte sequence. Since per Table 3-7 we now have + // enough information (from just two code units) to detect overlong or surrogate + // sequences, we need to perform these checks now. + + if (!UnicodeUtility.IsInRangeInclusive(tempValue, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80))) + { + // The first two bytes were not in the range [[E0 A0]..[F4 8F]]. + // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence. 
+ goto Invalid; + } + + if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80))) + { + // This is a UTF-16 surrogate code point, which is invalid in UTF-8. + goto Invalid; + } + + if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80))) + { + // This is an overlong 4-byte sequence. + goto Invalid; + } + + // The first two bytes were just fine. We don't need to perform any other checks + // on the remaining bytes other than to see that they're valid continuation bytes. + + // Try reading source[2]. + + index = 2; + if (source.Length <= 2) + { + goto NeedsMoreData; + } + + thisByteSignExtended = (sbyte)source[2]; + if (thisByteSignExtended >= -64) + { + goto Invalid; // this byte is not a UTF-8 continuation byte + } + + tempValue <<= 6; + tempValue += (uint)thisByteSignExtended; + tempValue += 0x80; // remove the continuation byte marker + tempValue -= (0xE0 - 0xC0) << 12; // remove the leading byte marker + + if (tempValue <= 0xFFFF) + { + Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0800, 0xFFFF)); + goto Finish; // this is a valid 3-byte sequence + } + + // Try reading source[3]. 
+ + index = 3; + if (source.Length <= 3) + { + goto NeedsMoreData; + } + + thisByteSignExtended = (sbyte)source[3]; + if (thisByteSignExtended >= -64) + { + goto Invalid; // this byte is not a UTF-8 continuation byte + } + + tempValue <<= 6; + tempValue += (uint)thisByteSignExtended; + tempValue += 0x80; // remove the continuation byte marker + tempValue -= (0xF0 - 0xE0) << 18; // remove the leading byte marker + + // Valid 4-byte sequence + //UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue); + + Finish: + + bytesConsumed = index + 1; + Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4] + result = tempValue; + return OperationStatus.Done; + + NeedsMoreData: + + Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3 + bytesConsumed = index; + result = (char)UnicodeUtility.ReplacementChar; + return OperationStatus.NeedMoreData; + + Invalid: + + Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3 + bytesConsumed = index; + result = (char)UnicodeUtility.ReplacementChar; + return OperationStatus.InvalidData; + } +#endif } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs index b615ee5edd0ad1..25f1f130a01fc8 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs @@ -115,7 +115,11 @@ private static uint ExtractFourUtf8BytesFromSurrogatePair(uint value) value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ] uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ] +#if !MICROSOFT_BCL_MEMORY tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ] +#else + tempA = (tempA << 16) | 
(tempA >> (32 - 16)); ; // = [ 00xxxxxx 00000000 00000000 00000uuu ] +#endif uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ] uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ] diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs index 21645204289880..e11572e43c1afa 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs @@ -6,9 +6,11 @@ using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; +#if !MICROSOFT_BCL_MEMORY using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; +#endif namespace System.Text.Unicode { @@ -598,7 +600,11 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng // At this point, toCheck = [ 00000000 00000000 10zzzzzz 11110www ]. +#if !MICROSOFT_BCL_MEMORY toCheck = BitOperations.RotateRight(toCheck, 8); +#else + toCheck = (toCheck >> 8) | (toCheck << (32 - 8)); +#endif // At this point, toCheck = [ 11110www 00000000 00000000 10zzzzzz ]. @@ -878,12 +884,14 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt // vector is only used in those code paths, we leave it uninitialized if SSE4.1 // is not enabled. +#if !MICROSOFT_BCL_MEMORY Vector128 nonAsciiUtf16DataMask; if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)) { nonAsciiUtf16DataMask = Vector128.Create(unchecked((short)0xFF80)); // mask of non-ASCII bits in a UTF-16 char } +#endif // Begin the main loop. 
@@ -938,6 +946,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt uint inputCharsRemaining = (uint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + 2; uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining); +#if !MICROSOFT_BCL_MEMORY if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)) { // Try reading and writing 8 elements per iteration. @@ -1081,6 +1090,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt goto AfterReadDWordSkipAllCharsAsciiCheck; } else +#endif { // Can't use SSE41 x64, so we'll only read and write 4 elements per iteration. uint maxIters = minElementsRemaining / 4; diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index a542dad72b5c33..a36f64066602ec 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -5,9 +5,11 @@ using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; +#if !MICROSOFT_BCL_MEMORY using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; +#endif namespace System.Text.Unicode { @@ -113,13 +115,14 @@ internal static unsafe partial class Utf8Utility // the alignment check consumes at most a single DWORD.) byte* pInputBufferFinalPosAtWhichCanSafelyLoop = pFinalPosWhereCanReadDWordFromInputBuffer - 3 * sizeof(uint); // can safely read 4 DWORDs here - nuint trailingZeroCount; // pInputBuffer is 32-bit aligned but not necessary 128-bit aligned, so we're // going to perform an unaligned load. 
We don't necessarily care about aligning // this because we pessimistically assume we'll encounter non-ASCII data at some // point in the not-too-distant future (otherwise we would've stayed entirely // within the all-ASCII vectorized code at the entry to this method). +#if !MICROSOFT_BCL_MEMORY + nuint trailingZeroCount; if (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) { // declare bitMask128 inside of the AdvSimd.Arm64.IsSupported check @@ -140,9 +143,11 @@ internal static unsafe partial class Utf8Utility } while (pInputBuffer <= pInputBufferFinalPosAtWhichCanSafelyLoop); } else +#endif { do { +#if !MICROSOFT_BCL_MEMORY if (Sse2.IsSupported) { uint mask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pInputBuffer)); @@ -153,6 +158,7 @@ internal static unsafe partial class Utf8Utility } } else +#endif { if (!Ascii.AllBytesInUInt32AreAscii(((uint*)pInputBuffer)[0] | ((uint*)pInputBuffer)[1])) { @@ -171,6 +177,7 @@ internal static unsafe partial class Utf8Utility continue; // need to perform a bounds check because we might be running out of data +#if !MICROSOFT_BCL_MEMORY LoopTerminatedEarlyDueToNonAsciiData: // x86 can only be little endian, while ARM can be big or little endian // so if we reached this label we need to check both combinations are supported @@ -192,6 +199,7 @@ internal static unsafe partial class Utf8Utility thisDWord = Unsafe.ReadUnaligned(pInputBuffer); // no longer guaranteed to be aligned goto BeforeProcessTwoByteSequence; +#endif LoopTerminatedEarlyDueToNonAsciiDataInSecondPair: @@ -597,7 +605,11 @@ internal static unsafe partial class Utf8Utility // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding) // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding). +#if !MICROSOFT_BCL_MEMORY thisDWord = BitOperations.RotateRight(thisDWord, 8); +#else + thisDWord = (thisDWord >> 8) | (thisDWord << (32 - 8)); +#endif // Now, thisDWord = [ 00010uuu 10000000 00000000 00uuzzzz ]. 
// The check is now a simple add / cmp / jcc combo. @@ -739,6 +751,7 @@ internal static unsafe partial class Utf8Utility return pInputBuffer; } +#if !MICROSOFT_BCL_MEMORY [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bitMask128) @@ -753,5 +766,6 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits); return extractedBits.AsUInt64().ToScalar(); } +#endif } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs index 9708c32e49e24d..ed26341a9e67f7 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs @@ -4,7 +4,9 @@ using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +#if !MICROSOFT_BCL_MEMORY using System.Runtime.Intrinsics; +#endif namespace System.Text.Unicode { @@ -251,6 +253,7 @@ internal static bool UInt64OrdinalIgnoreCaseAscii(ulong valueA, ulong valueB) return (valueA | letterMaskA) == (valueB | letterMaskB); } +#if !MICROSOFT_BCL_MEMORY /// /// Returns true iff the Vector128 represents 16 ASCII UTF-8 characters in machine endianness. 
/// @@ -288,5 +291,6 @@ internal static bool Vector128OrdinalIgnoreCaseAscii(Vector128 vec1, Vecto // Compare two lowercased vectors return (lcVec1 ^ lcVec2) == Vector128.Zero; } +#endif } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs index eeccfc57597920..af179916fb8c99 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs @@ -17,7 +17,9 @@ internal static class UnicodeUtility /// public static int GetPlane(uint codePoint) { +#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsValidCodePoint(codePoint); +#endif return (int)(codePoint >> 16); } @@ -27,8 +29,10 @@ public static int GetPlane(uint codePoint) /// public static uint GetScalarFromUtf16SurrogatePair(uint highSurrogateCodePoint, uint lowSurrogateCodePoint) { +#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsHighSurrogateCodePoint(highSurrogateCodePoint); UnicodeDebug.AssertIsLowSurrogateCodePoint(lowSurrogateCodePoint); +#endif // This calculation comes from the Unicode specification, Table 3-5. 
// Need to remove the D800 marker from the high surrogate and the DC00 marker from the low surrogate, @@ -43,7 +47,9 @@ public static uint GetScalarFromUtf16SurrogatePair(uint highSurrogateCodePoint, /// public static int GetUtf16SequenceLength(uint value) { +#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsValidScalar(value); +#endif value -= 0x10000; // if value < 0x10000, high byte = 0xFF; else high byte = 0x00 value += (2 << 24); // if value < 0x10000, high byte = 0x01; else high byte = 0x02 @@ -57,7 +63,9 @@ public static int GetUtf16SequenceLength(uint value) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void GetUtf16SurrogatesFromSupplementaryPlaneScalar(uint value, out char highSurrogateCodePoint, out char lowSurrogateCodePoint) { +#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(value); +#endif // This calculation comes from the Unicode specification, Table 3-5. @@ -70,7 +78,9 @@ public static void GetUtf16SurrogatesFromSupplementaryPlaneScalar(uint value, ou /// public static int GetUtf8SequenceLength(uint value) { +#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsValidScalar(value); +#endif // The logic below can handle all valid scalar values branchlessly. // It gives generally good performance across all inputs, and on x86 From 5b86cd6588558d60c3f4ad389315cda10319d720 Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sat, 11 Jan 2025 21:49:51 +0700 Subject: [PATCH 03/12] Enhance Unicode handling in tests and project structure Updated `Microsoft.Bcl.Memory.Tests.csproj` to include `UnicodeUtility.cs` and removed .NET 8.0 targeting condition. Modified `Utf8Tests.cs` by adjusting using directives and enhancing the `DecodeHex` method with conditional compilation for .NET 5.0+. 
--- .../tests/Microsoft.Bcl.Memory.Tests.csproj | 5 +- .../System/Text/Unicode/Utf8Tests.cs | 222 +++++++++++++++++- 2 files changed, 220 insertions(+), 7 deletions(-) diff --git a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj index 8aea9932efadde..6d3ad15b117bf5 100644 --- a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj @@ -30,10 +30,7 @@ System\IndexTests.cs - - - - + System\Text\Unicode\Utf8Tests.cs diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs index a24438dcbb59f9..4a997b6313f94b 100644 --- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs +++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs @@ -2,10 +2,9 @@ // The .NET Foundation licenses this file to you under the MIT license. 
using System.Buffers; +using System.Collections; using System.Collections.Generic; -using System.Globalization; using System.Linq; -using System.Text.RegularExpressions; using Xunit; namespace System.Text.Unicode.Tests @@ -64,13 +63,45 @@ public static byte[] DecodeHex(ReadOnlySpan inputHex) { Assert.Matches(@"^([0-9a-fA-F]{2})*$", inputHex.ToString()); +#if NET5_0_OR_GREATER return Convert.FromHexString(inputHex); +#else + byte[] result = new byte[inputHex.Length / 2]; + for (int i = 0; i < result.Length; i++) + { + var h = FromHex(inputHex[i * 2]); + var l = FromHex(inputHex[i * 2 + 1]); + result[i] = (byte)((h << 4) | l); + } + return result; + + static int FromHex(char c) + { + if (c >= '0' && c <= '9') + { + return c - '0'; + } + else if (c >= 'a' && c <= 'f') + { + return c - 'a' + 10; + } + else + { + return c - 'A' + 10; + } + } +#endif } // !! IMPORTANT !! // Don't delete this implementation, as we use it as a reference to make sure the framework's // transcoding logic is correct. 
- public static byte[] ToUtf8(Rune rune) +#if !MICROSOFT_BCL_MEMORY + public +#else + private +#endif + static byte[] ToUtf8(Rune rune) { Assert.True(Rune.IsValid(rune.Value), $"Rune with value U+{(uint)rune.Value:X4} is not well-formed."); @@ -781,4 +812,189 @@ private static void ToChars_Test_Core(ReadOnlySpan utf8Input, int destinat } } } + +#if !NETCOREAPP3_0_OR_GREATER + internal readonly struct Rune //: IComparable, IComparable, IEquatable + { + private readonly uint _value; + + public Rune(uint value) + { + //if (!UnicodeUtility.IsValidUnicodeScalar(value)) + //{ + // ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.value); + //} + _value = value; + } + + public Rune(int value) + : this((uint)value) + { + } + private Rune(uint scalarValue, bool _) + { + //UnicodeDebug.AssertIsValidScalar(scalarValue); + _value = scalarValue; + } + + internal static Rune UnsafeCreate(uint scalarValue) => new Rune(scalarValue, false); + + public bool IsBmp => UnicodeUtility.IsBmpCodePoint(_value); + + public static Rune ReplacementChar => UnsafeCreate(UnicodeUtility.ReplacementChar); + + public int Utf16SequenceLength + { + get + { + int codeUnitCount = UnicodeUtility.GetUtf16SequenceLength(_value); + //Debug.Assert(codeUnitCount > 0 && codeUnitCount <= MaxUtf16CharsPerRune); + return codeUnitCount; + } + } + + public int Utf8SequenceLength + { + get + { + int codeUnitCount = UnicodeUtility.GetUtf8SequenceLength(_value); + //Debug.Assert(codeUnitCount > 0 && codeUnitCount <= MaxUtf8BytesPerRune); + return codeUnitCount; + } + } + + public int Value => (int)_value; + + public static bool IsValid(int value) => IsValid((uint)value); + public static bool IsValid(uint value) => UnicodeUtility.IsValidUnicodeScalar(value); + + public static bool TryGetRuneAt(string input, int index, out Rune value) + { + int runeValue = ReadRuneFromString(input, index); + if (runeValue >= 0) + { + value = UnsafeCreate((uint)runeValue); + return true; + } + else + { + value = default; + 
return false; + } + } + + private static int ReadRuneFromString(string input, int index) + { + if (input is null) + { + throw new ArgumentNullException(nameof(input)); + } + + if ((uint)index >= (uint)input.Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + + // Optimistically assume input is within BMP. + + uint returnValue = input[index]; + if (UnicodeUtility.IsSurrogateCodePoint(returnValue)) + { + if (!UnicodeUtility.IsHighSurrogateCodePoint(returnValue)) + { + return -1; + } + + // Treat 'returnValue' as the high surrogate. + // + // If this becomes a hot code path, we can skip the below bounds check by reading + // off the end of the string using unsafe code. Since strings are null-terminated, + // we're guaranteed not to read a valid low surrogate, so we'll fail correctly if + // the string terminates unexpectedly. + + index++; + if ((uint)index >= (uint)input.Length) + { + return -1; // not an argument exception - just a "bad data" failure + } + + uint potentialLowSurrogate = input[index]; + if (!UnicodeUtility.IsLowSurrogateCodePoint(potentialLowSurrogate)) + { + return -1; + } + + returnValue = UnicodeUtility.GetScalarFromUtf16SurrogatePair(returnValue, potentialLowSurrogate); + } + + return (int)returnValue; + } + } + + internal static class StringExtensions + { + public static StringRuneEnumerator EnumerateRunes(this string value) => new StringRuneEnumerator(value); + } + + // An enumerator for retrieving System.Text.Rune instances from a System.String. 
+ internal struct StringRuneEnumerator : IEnumerable, IEnumerator + { + private readonly string _string; + private Rune _current; + private int _nextIndex; + + internal StringRuneEnumerator(string value) + { + _string = value; + _current = default; + _nextIndex = 0; + } + + public Rune Current => _current; + + public StringRuneEnumerator GetEnumerator() => this; + + public bool MoveNext() + { + if ((uint)_nextIndex >= _string.Length) + { + // reached the end of the string + _current = default; + return false; + } + + if (!Rune.TryGetRuneAt(_string, _nextIndex, out _current)) + { + // replace invalid sequences with U+FFFD + _current = Rune.ReplacementChar; + } + + // In UTF-16 specifically, invalid sequences always have length 1, which is the same + // length as the replacement character U+FFFD. This means that we can always bump the + // next index by the current scalar's UTF-16 sequence length. This optimization is not + // generally applicable; for example, enumerating scalars from UTF-8 cannot utilize + // this same trick. + + _nextIndex += _current.Utf16SequenceLength; + return true; + } + + object? IEnumerator.Current => _current; + + void IDisposable.Dispose() + { + // no-op + } + + IEnumerator IEnumerable.GetEnumerator() => this; + + IEnumerator IEnumerable.GetEnumerator() => this; + + void IEnumerator.Reset() + { + _current = default; + _nextIndex = 0; + } + } +#endif } From 22be776fc9faa7086ee028944e70dbba92e71b0e Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sat, 11 Jan 2025 21:52:40 +0700 Subject: [PATCH 04/12] Add compilation constant for Microsoft BCL Memory Added a new property `$(DefineConstants);MICROSOFT_BCL_MEMORY` to the project file to define a new compilation constant for the project. 
--- .../Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj index 6d3ad15b117bf5..ea4445668b6858 100644 --- a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj @@ -3,6 +3,7 @@ $(NetFrameworkMinimum);$(NetCoreAppCurrent) true + $(DefineConstants);MICROSOFT_BCL_MEMORY From 79214bd9bff06d08f17cfcb906c6782a5c83d450 Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sun, 12 Jan 2025 08:29:54 +0700 Subject: [PATCH 05/12] Apply suggestions from code review Co-authored-by: Theodore Tsirpanis --- .../src/System/Text/Ascii.Utility.Helpers.cs | 2 +- .../System.Private.CoreLib/src/System/Text/Ascii.Utility.cs | 5 ++--- .../System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs | 2 +- .../System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs index 5d22d8a29f5f6b..15cf86b085a157 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs @@ -8,7 +8,7 @@ namespace System.Text { -#if !MICROSOFT_BCL_MEMORY +#if SYSTEM_PRIVATE_CORELIB public #else internal diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 61fe8e4a55df3e..69bf2f5f308fc0 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -13,7 +13,7 @@ namespace 
System.Text { -#if !MICROSOFT_BCL_MEMORY +#if SYSTEM_PRIVATE_CORELIB public #else internal @@ -60,7 +60,7 @@ private static bool AllCharsInUInt64AreAscii(ulong value) : AllCharsInUInt64AreAscii(value); } -#if !MICROSOFT_BCL_MEMORY +#if NET [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] private static int GetIndexOfFirstNonAsciiByteInLane_AdvSimd(Vector128 value, Vector128 bitmask) @@ -121,7 +121,6 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bu else #endif { - // Handles Vector512, Vector256, Vector128, and scalar. return GetIndexOfFirstNonAsciiByte_Vector(pBuffer, bufferLength); } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs index 630d8db69ede89..0a89968503a93d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs @@ -987,7 +987,7 @@ private static OperationStatus DecodeFromUtf8(ReadOnlySpan source, out uin tempValue -= (0xF0 - 0xE0) << 18; // remove the leading byte marker // Valid 4-byte sequence - //UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue); + UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue); Finish: diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs index 4a997b6313f94b..a2407957fc20a9 100644 --- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs +++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs @@ -833,7 +833,7 @@ public Rune(int value) } private Rune(uint scalarValue, bool _) { - //UnicodeDebug.AssertIsValidScalar(scalarValue); + UnicodeDebug.AssertIsValidScalar(scalarValue); _value = scalarValue; } From 
def40c463fb428acf834e6427739fba3ec1036f0 Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sun, 12 Jan 2025 11:10:09 +0700 Subject: [PATCH 06/12] Update Microsoft.Bcl.Memory for framework compatibility - Added polyfill for System.Numerics.BitOperations for .NET Standard 2.0. --- .../src/Microsoft.Bcl.Memory.Forwards.cs | 2 +- .../src/Microsoft.Bcl.Memory.csproj | 16 ++-- .../Microsoft.Bcl.Memory/src/PACKAGE.md | 8 +- ...em.Numerics.BitOperations.netstandard20.cs | 75 +++++++++++++++++++ .../tests/Microsoft.Bcl.Memory.Tests.csproj | 1 + .../src/System/Text/Ascii.Utility.Helpers.cs | 37 --------- .../src/System/Text/Ascii.Utility.cs | 30 ++++---- .../src/System/Text/Unicode/Utf16Utility.cs | 3 +- .../src/System/Text/Unicode/Utf8.cs | 6 +- .../Text/Unicode/Utf8Utility.Helpers.cs | 4 - .../Text/Unicode/Utf8Utility.Transcoding.cs | 10 +-- .../Text/Unicode/Utf8Utility.Validation.cs | 14 ++-- .../src/System/Text/Unicode/Utf8Utility.cs | 4 +- .../src/System/Text/UnicodeUtility.cs | 10 --- .../System/Text/Unicode/Utf8Tests.cs | 2 +- 15 files changed, 118 insertions(+), 104 deletions(-) create mode 100644 src/libraries/Microsoft.Bcl.Memory/src/Polyfills/System.Numerics.BitOperations.netstandard20.cs diff --git a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.Forwards.cs b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.Forwards.cs index e3fdeb43392ee0..d21745f7321c29 100644 --- a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.Forwards.cs +++ b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.Forwards.cs @@ -3,7 +3,7 @@ [assembly: System.Runtime.CompilerServices.TypeForwardedTo(typeof(System.Index))] [assembly: System.Runtime.CompilerServices.TypeForwardedTo(typeof(System.Range))] -#if NET8_0_OR_GREATER +#if NET [assembly: System.Runtime.CompilerServices.TypeForwardedTo(typeof(System.Text.Unicode.Utf8))] #endif #if NET9_0_OR_GREATER diff --git a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj 
b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj index 4f9a741965fd28..67eac34a58cd01 100644 --- a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj @@ -27,16 +27,18 @@ - + + + + + + + + - - - - - @@ -44,7 +46,7 @@ - + diff --git a/src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md b/src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md index af99a7b334b050..c00e6bfbc7fb9b 100644 --- a/src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md +++ b/src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md @@ -1,8 +1,8 @@ ## About Provides `Index` and `Range` types to simplify slicing operations on collections for .NET Framework and .NET Standard 2.0. -Provides `Base64Url` for encoding data in a URL-safe manner on .NET Framework and .NET Standard. -Provides `Utf8` for converting chunked data between UTF-8 and UTF-16 encodings on .NET Framework and .NET Standard. +Provides `Base64Url` for encoding data in a URL-safe manner on older .NET platforms. +Provides `Utf8` for converting chunked data between UTF-8 and UTF-16 encodings on .NET Framework and .NET Standard 2.0. This library is not necessary nor recommended when targeting versions of .NET that include the relevant support. @@ -10,9 +10,9 @@ This library is not necessary nor recommended when targeting versions of .NET th -* Enables the use of `Index` and `Range` types on older .NET platforms. +* Enables the use of `Index` and `Range` types on .NET Framework and .NET Standard 2.0. * Provides `Base64Url` encoding, decoding, and validation for URL-safe data processing on older .NET platforms. -* Provides `Utf8` encoding, decoding, and validation for chunked data between UTF-8 and UTF-16 on older .NET platforms. +* Provides `Utf8` encoding, decoding, and validation for chunked data between UTF-8 and UTF-16 on .NET Framework and .NET Standard 2.0. 
## How to Use diff --git a/src/libraries/Microsoft.Bcl.Memory/src/Polyfills/System.Numerics.BitOperations.netstandard20.cs b/src/libraries/Microsoft.Bcl.Memory/src/Polyfills/System.Numerics.BitOperations.netstandard20.cs new file mode 100644 index 00000000000000..815d1a85732d42 --- /dev/null +++ b/src/libraries/Microsoft.Bcl.Memory/src/Polyfills/System.Numerics.BitOperations.netstandard20.cs @@ -0,0 +1,75 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// Contains a polyfill implementation of System.Numerics.BitOperations that works on netstandard2.0. +// Implementation copied from: +// https://github.com/dotnet/runtime/blob/82ab89241b90ca3d64b22971f3a1e248da72828a/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs +// +// Some routines inspired by the Stanford Bit Twiddling Hacks by Sean Eron Anderson: +// http://graphics.stanford.edu/~seander/bithacks.html + +namespace System.Numerics +{ + internal static class BitOperations + { + // C# no-alloc optimization that directly wraps the data section of the dll (similar to string constants) + // https://github.com/dotnet/roslyn/pull/24621 + + private static ReadOnlySpan TrailingZeroCountDeBruijn => // 32 + [ + 00, 01, 28, 02, 29, 14, 24, 03, + 30, 22, 20, 15, 25, 17, 04, 08, + 31, 27, 13, 23, 21, 19, 16, 07, + 26, 12, 18, 06, 11, 05, 10, 09 + ]; + + /// + /// Count the number of trailing zero bits in an integer value. + /// Similar in behavior to the x86 instruction TZCNT. + /// + /// The value. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int TrailingZeroCount(uint value) + { + // Unguarded fallback contract is 0->0, BSF contract is 0->undefined + if (value == 0) + { + return 32; + } + + // uint.MaxValue >> 27 is always in range [0 - 31] so we use Unsafe.AddByteOffset to avoid bounds check + return Unsafe.AddByteOffset( + // Using deBruijn sequence, k=2, n=5 (2^5=32) : 0b_0000_0111_0111_1100_1011_0101_0011_0001u + ref MemoryMarshal.GetReference(TrailingZeroCountDeBruijn), + // uint|long -> IntPtr cast on 32-bit platforms does expensive overflow checks not needed here + (IntPtr)(int)(((value & (uint)-(int)value) * 0x077CB531u) >> 27)); // Multi-cast mitigates redundant conv.u8 + } + + /// + /// Rotates the specified value left by the specified number of bits. + /// Similar in behavior to the x86 instruction ROL. + /// + /// The value to rotate. + /// The number of bits to rotate by. + /// Any value outside the range [0..31] is treated as congruent mod 32. + /// The rotated value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static uint RotateLeft(uint value, int offset) + => (value << offset) | (value >> (32 - offset)); + + /// + /// Rotates the specified value right by the specified number of bits. + /// Similar in behavior to the x86 instruction ROR. + /// + /// The value to rotate. + /// The number of bits to rotate by. + /// Any value outside the range [0..31] is treated as congruent mod 32. + /// The rotated value. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static uint RotateRight(uint value, int offset) + => (value >> offset) | (value << (32 - offset)); + } +} diff --git a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj index ea4445668b6858..729dbbdd3460c4 100644 --- a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj @@ -31,6 +31,7 @@ System\IndexTests.cs + System\Text\Unicode\Utf8Tests.cs diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs index 15cf86b085a157..ed25459c341c49 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs @@ -50,11 +50,7 @@ internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiDat if (BitConverter.IsLittleEndian) { -#if !MICROSOFT_BCL_MEMORY return (uint)BitOperations.TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3; -#else - return (uint)TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3; -#endif } else { @@ -70,55 +66,22 @@ internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiDat // expensive. Instead we'll just change how we perform the shifts. 
// Read first byte -#if !MICROSOFT_BCL_MEMORY value = BitOperations.RotateLeft(value, 1); -#else - value = (value << 1) | (value >> (32 - 1)); -#endif uint allBytesUpToNowAreAscii = value & 1; uint numAsciiBytes = allBytesUpToNowAreAscii; // Read second byte -#if !MICROSOFT_BCL_MEMORY value = BitOperations.RotateLeft(value, 8); -#else - value = (value << 8) | (value >> (32 - 8)); -#endif allBytesUpToNowAreAscii &= value; numAsciiBytes += allBytesUpToNowAreAscii; // Read third byte -#if !MICROSOFT_BCL_MEMORY value = BitOperations.RotateLeft(value, 8); -#else - value = (value << 8) | (value >> (32 - 8)); -#endif allBytesUpToNowAreAscii &= value; numAsciiBytes += allBytesUpToNowAreAscii; return numAsciiBytes; } } - -#if MICROSOFT_BCL_MEMORY - private static ReadOnlySpan TrailingZeroCountDeBruijn => // 32 - [ - 00, 01, 28, 02, 29, 14, 24, 03, - 30, 22, 20, 15, 25, 17, 04, 08, - 31, 27, 13, 23, 21, 19, 16, 07, - 26, 12, 18, 06, 11, 05, 10, 09 - ]; - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int TrailingZeroCount(uint value) - { - // uint.MaxValue >> 27 is always in range [0 - 31] so we use Unsafe.AddByteOffset to avoid bounds check - return Unsafe.AddByteOffset( - // Using deBruijn sequence, k=2, n=5 (2^5=32) : 0b_0000_0111_0111_1100_1011_0101_0011_0001u - ref MemoryMarshal.GetReference(TrailingZeroCountDeBruijn), - // uint|long -> IntPtr cast on 32-bit platforms does expensive overflow checks not needed here - (IntPtr)(int)(((value & (uint)-(int)value) * 0x077CB531u) >> 27)); // Multi-cast mitigates redundant conv.u8 - } -#endif } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 69bf2f5f308fc0..909bd0fb985004 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -4,7 +4,7 @@ using System.Diagnostics; using System.Numerics; 
using System.Runtime.CompilerServices; -#if !MICROSOFT_BCL_MEMORY +#if NET using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.Wasm; @@ -111,7 +111,7 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bu // like pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while // this method is running. -#if !MICROSOFT_BCL_MEMORY +#if NET if (!Vector512.IsHardwareAccelerated && !Vector256.IsHardwareAccelerated && (Sse2.IsSupported || AdvSimd.IsSupported)) @@ -139,7 +139,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu // Note use of SBYTE instead of BYTE below; we're using the two's-complement // representation of negative integers to act as a surrogate for "is ASCII?". -#if !MICROSOFT_BCL_MEMORY +#if NET if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) { if (Vector512.Load(pBuffer).ExtractMostSignificantBits() == 0) @@ -345,7 +345,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu goto Finish; } -#if !MICROSOFT_BCL_MEMORY +#if NET [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool ContainsNonAsciiByte_Sse2(uint sseMask) { @@ -731,7 +731,7 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bu // like pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while // this method is running. -#if !MICROSOFT_BCL_MEMORY +#if NET if (!Vector512.IsHardwareAccelerated && !Vector256.IsHardwareAccelerated && (Sse2.IsSupported || AdvSimd.IsSupported)) @@ -757,7 +757,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char)); #endif -#if !MICROSOFT_BCL_MEMORY +#if NET // Before we drain off char-by-char, try a generic vectorized loop. // Only run the loop if we have at least two vectors we can pull out. 
if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) @@ -950,7 +950,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu goto Finish; } -#if !MICROSOFT_BCL_MEMORY +#if NET private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuffer, nuint bufferLength /* in chars */) { // This method contains logic optimized using vector instructions for both x64 and Arm64. @@ -1266,7 +1266,7 @@ private static void NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref byte outputB { Debug.Assert(AllCharsInUInt64AreAscii(value)); -#if !MICROSOFT_BCL_MEMORY +#if NET if (Sse2.X64.IsSupported) { // Narrows a vector of words [ w0 w1 w2 w3 ] to a vector of bytes @@ -1346,7 +1346,7 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii uint utf16Data32BitsHigh = 0, utf16Data32BitsLow = 0; ulong utf16Data64Bits = 0; -#if !MICROSOFT_BCL_MEMORY +#if NET if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector128.Count) { // Since there's overhead to setting up the vectorized code path, we only want to @@ -1519,7 +1519,7 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii goto Finish; } -#if !MICROSOFT_BCL_MEMORY +#if NET [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool VectorContainsNonAsciiChar(Vector128 asciiVector) { @@ -2069,7 +2069,7 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B // Intrinsified in mono interpreter nuint currentOffset = 0; -#if !MICROSOFT_BCL_MEMORY +#if NET if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= (uint)Vector128.Count) { if (Vector512.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector512.Count) @@ -2176,11 +2176,7 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B { while ((asciiData & 0x80000000) == 0) { -#if 
!MICROSOFT_BCL_MEMORY asciiData = BitOperations.RotateLeft(asciiData, 8); -#else - asciiData = (asciiData << 8) | (asciiData >> (32 - 8)); ; -#endif pUtf16Buffer[currentOffset] = (char)(byte)asciiData; currentOffset++; } @@ -2189,7 +2185,7 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B goto Finish; } -#if !MICROSOFT_BCL_MEMORY +#if NET [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void WidenAsciiToUtf1_Vector(byte* pAsciiBuffer, char* pUtf16Buffer, ref nuint currentOffset, nuint elementCount) where TVectorByte : unmanaged, ISimdVector @@ -2278,7 +2274,7 @@ internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputB { Debug.Assert(AllBytesInUInt32AreAscii(value)); -#if !MICROSOFT_BCL_MEMORY +#if NET if (AdvSimd.Arm64.IsSupported) { Vector128 vecNarrow = AdvSimd.DuplicateToVector128(value).AsByte(); diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs index 7a79a3a6592657..992a9f6bdcbab1 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs @@ -3,8 +3,7 @@ using System.Diagnostics; using System.Runtime.CompilerServices; - -#if SYSTEM_PRIVATE_CORELIB +#if NET using System.Runtime.Intrinsics; #endif diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs index 0a89968503a93d..a7044fa03e8155 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs @@ -203,7 +203,7 @@ public static unsafe OperationStatus ToUtf16(ReadOnlySpan source, Span source, Span source, Span destination, out int bytesRead, out int charsWritten, bool replaceInvalidSequences = true, bool 
isFinalBlock = true) { // NOTE: Changes to this method should be kept in sync with ToUtf16 above. @@ -825,7 +825,7 @@ private bool Fail() public static bool IsValid(ReadOnlySpan value) => Utf8Utility.GetIndexOfFirstInvalidUtf8Sequence(value, out _) < 0; -#if MICROSOFT_BCL_MEMORY +#if !NET /// /// Decodes the Rune at the beginning of the provided UTF-8 source buffer. /// diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs index 25f1f130a01fc8..b615ee5edd0ad1 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs @@ -115,11 +115,7 @@ private static uint ExtractFourUtf8BytesFromSurrogatePair(uint value) value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ] uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ] -#if !MICROSOFT_BCL_MEMORY tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ] -#else - tempA = (tempA << 16) | (tempA >> (32 - 16)); ; // = [ 00xxxxxx 00000000 00000000 00000uuu ] -#endif uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ] uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ] diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs index e11572e43c1afa..9c4a28c83240f3 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs @@ -6,7 +6,7 @@ using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; -#if 
!MICROSOFT_BCL_MEMORY +#if NET using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; @@ -600,11 +600,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng // At this point, toCheck = [ 00000000 00000000 10zzzzzz 11110www ]. -#if !MICROSOFT_BCL_MEMORY toCheck = BitOperations.RotateRight(toCheck, 8); -#else - toCheck = (toCheck >> 8) | (toCheck << (32 - 8)); -#endif // At this point, toCheck = [ 11110www 00000000 00000000 10zzzzzz ]. @@ -884,7 +880,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt // vector is only used in those code paths, we leave it uninitialized if SSE4.1 // is not enabled. -#if !MICROSOFT_BCL_MEMORY +#if NET Vector128 nonAsciiUtf16DataMask; if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)) @@ -946,7 +942,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt uint inputCharsRemaining = (uint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + 2; uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining); -#if !MICROSOFT_BCL_MEMORY +#if NET if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)) { // Try reading and writing 8 elements per iteration. 
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index a36f64066602ec..e947b258912270 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -5,7 +5,7 @@ using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; -#if !MICROSOFT_BCL_MEMORY +#if NET using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; @@ -121,7 +121,7 @@ internal static unsafe partial class Utf8Utility // this because we pessimistically assume we'll encounter non-ASCII data at some // point in the not-too-distant future (otherwise we would've stayed entirely // within the all-ASCII vectorized code at the entry to this method). -#if !MICROSOFT_BCL_MEMORY +#if NET nuint trailingZeroCount; if (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) { @@ -147,7 +147,7 @@ internal static unsafe partial class Utf8Utility { do { -#if !MICROSOFT_BCL_MEMORY +#if NET if (Sse2.IsSupported) { uint mask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pInputBuffer)); @@ -177,7 +177,7 @@ internal static unsafe partial class Utf8Utility continue; // need to perform a bounds check because we might be running out of data -#if !MICROSOFT_BCL_MEMORY +#if NET LoopTerminatedEarlyDueToNonAsciiData: // x86 can only be little endian, while ARM can be big or little endian // so if we reached this label we need to check both combinations are supported @@ -605,11 +605,7 @@ internal static unsafe partial class Utf8Utility // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding) // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding). 
-#if !MICROSOFT_BCL_MEMORY thisDWord = BitOperations.RotateRight(thisDWord, 8); -#else - thisDWord = (thisDWord >> 8) | (thisDWord << (32 - 8)); -#endif // Now, thisDWord = [ 00010uuu 10000000 00000000 00uuzzzz ]. // The check is now a simple add / cmp / jcc combo. @@ -751,7 +747,7 @@ internal static unsafe partial class Utf8Utility return pInputBuffer; } -#if !MICROSOFT_BCL_MEMORY +#if NET [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bitMask128) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs index ed26341a9e67f7..17a7e7d471ded1 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs @@ -4,7 +4,7 @@ using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -#if !MICROSOFT_BCL_MEMORY +#if NET using System.Runtime.Intrinsics; #endif @@ -253,7 +253,7 @@ internal static bool UInt64OrdinalIgnoreCaseAscii(ulong valueA, ulong valueB) return (valueA | letterMaskA) == (valueB | letterMaskB); } -#if !MICROSOFT_BCL_MEMORY +#if NET /// /// Returns true iff the Vector128 represents 16 ASCII UTF-8 characters in machine endianness. 
/// diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs index af179916fb8c99..eeccfc57597920 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs @@ -17,9 +17,7 @@ internal static class UnicodeUtility /// public static int GetPlane(uint codePoint) { -#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsValidCodePoint(codePoint); -#endif return (int)(codePoint >> 16); } @@ -29,10 +27,8 @@ public static int GetPlane(uint codePoint) /// public static uint GetScalarFromUtf16SurrogatePair(uint highSurrogateCodePoint, uint lowSurrogateCodePoint) { -#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsHighSurrogateCodePoint(highSurrogateCodePoint); UnicodeDebug.AssertIsLowSurrogateCodePoint(lowSurrogateCodePoint); -#endif // This calculation comes from the Unicode specification, Table 3-5. // Need to remove the D800 marker from the high surrogate and the DC00 marker from the low surrogate, @@ -47,9 +43,7 @@ public static uint GetScalarFromUtf16SurrogatePair(uint highSurrogateCodePoint, /// public static int GetUtf16SequenceLength(uint value) { -#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsValidScalar(value); -#endif value -= 0x10000; // if value < 0x10000, high byte = 0xFF; else high byte = 0x00 value += (2 << 24); // if value < 0x10000, high byte = 0x01; else high byte = 0x02 @@ -63,9 +57,7 @@ public static int GetUtf16SequenceLength(uint value) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void GetUtf16SurrogatesFromSupplementaryPlaneScalar(uint value, out char highSurrogateCodePoint, out char lowSurrogateCodePoint) { -#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(value); -#endif // This calculation comes from the Unicode specification, Table 3-5. 
@@ -78,9 +70,7 @@ public static void GetUtf16SurrogatesFromSupplementaryPlaneScalar(uint value, ou /// public static int GetUtf8SequenceLength(uint value) { -#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsValidScalar(value); -#endif // The logic below can handle all valid scalar values branchlessly. // It gives generally good performance across all inputs, and on x86 diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs index a2407957fc20a9..e6febb60ad5c02 100644 --- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs +++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs @@ -96,7 +96,7 @@ static int FromHex(char c) // !! IMPORTANT !! // Don't delete this implementation, as we use it as a reference to make sure the framework's // transcoding logic is correct. -#if !MICROSOFT_BCL_MEMORY +#if NET public #else private From a1caf07c5cd8e20a8746c10faec9c03d592caac9 Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sun, 12 Jan 2025 11:14:36 +0700 Subject: [PATCH 07/12] Space --- .../Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj index 67eac34a58cd01..50507bb2d97b6d 100644 --- a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj @@ -46,7 +46,7 @@ - + From 695e3622ea6f300411721765e48240be38ff16c5 Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sun, 12 Jan 2025 11:17:24 +0700 Subject: [PATCH 08/12] Remove MICROSOFT_BCL_MEMORY from project constants Removed the `DefineConstants` property from the project file, which included the constant 
`MICROSOFT_BCL_MEMORY`. This change may impact conditional compilation within the project. --- .../Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj | 1 - 1 file changed, 1 deletion(-) diff --git a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj index 729dbbdd3460c4..e2be608f30e980 100644 --- a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj @@ -3,7 +3,6 @@ $(NetFrameworkMinimum);$(NetCoreAppCurrent) true - $(DefineConstants);MICROSOFT_BCL_MEMORY From 063359adc5d6eb90cee8dd69fdd25b73539df84e Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sun, 12 Jan 2025 11:23:46 +0700 Subject: [PATCH 09/12] Refactor Utf8Tests for .NET compatibility improvements --- .../System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs index e6febb60ad5c02..628768a368997c 100644 --- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs +++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs @@ -63,7 +63,7 @@ public static byte[] DecodeHex(ReadOnlySpan inputHex) { Assert.Matches(@"^([0-9a-fA-F]{2})*$", inputHex.ToString()); -#if NET5_0_OR_GREATER +#if NET return Convert.FromHexString(inputHex); #else byte[] result = new byte[inputHex.Length / 2]; @@ -813,7 +813,7 @@ private static void ToChars_Test_Core(ReadOnlySpan utf8Input, int destinat } } -#if !NETCOREAPP3_0_OR_GREATER +#if !NET internal readonly struct Rune //: IComparable, IComparable, IEquatable { private readonly uint _value; From b5992c67128d19c37754aa5c34f3005730f4525b Mon Sep 17 
00:00:00 2001 From: Theodore Tsirpanis Date: Sun, 2 Mar 2025 19:03:17 +0200 Subject: [PATCH 10/12] Use `Rune.DecodeFromUtf8` on all frameworks. For downlevel frameworks we add `Rune.cs` to `Microsoft.Bcl.Memory`. (cherry picked from commit 79ee05d67f31c0dd7b218c2cf6b2c864b96ce8db) --- .../src/System/Text/Rune.cs | 4 + .../src/System/Text/Unicode/Utf8.cs | 191 ------------------ 2 files changed, 4 insertions(+), 191 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs index 325d64bb278bb3..4e72eca5fd7566 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs @@ -1490,7 +1490,11 @@ int IComparable.CompareTo(object? obj) return this.CompareTo(other); } +#if SYSTEM_PRIVATE_CORELIB throw new ArgumentException(SR.Arg_MustBeRune); +#else + throw new ArgumentException(); +#endif } } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs index a7044fa03e8155..003bd747b837b9 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs @@ -203,11 +203,7 @@ public static unsafe OperationStatus ToUtf16(ReadOnlySpan source, Spantrue if value is well-formed UTF-8, false otherwise. public static bool IsValid(ReadOnlySpan value) => Utf8Utility.GetIndexOfFirstInvalidUtf8Sequence(value, out _) < 0; - -#if !NET - /// - /// Decodes the Rune at the beginning of the provided UTF-8 source buffer. - /// - /// - /// - /// If the source buffer begins with a valid UTF-8 encoded scalar value, returns , - /// and outs via the decoded Runeand via the - /// number of s used in the input buffer to encode the Rune.
- /// - /// - /// If the source buffer is empty or contains only a partial UTF-8 subsequence, returns , - /// and outs via ReplacementChar and via the length of the input buffer. - /// - /// - /// If the source buffer begins with an ill-formed UTF-8 encoded scalar value, returns , - /// and outs via ReplacementChar and via the number of - /// s used in the input buffer to encode the ill-formed sequence. - /// - /// - /// - /// The general calling convention is to call this method in a loop, slicing the buffer by - /// elements on each iteration of the loop. On each iteration of the loop - /// will contain the real scalar value if successfully decoded, or it will contain ReplacementChar if - /// the data could not be successfully decoded. This pattern provides convenient automatic U+FFFD substitution of - /// invalid sequences while iterating through the loop. - /// - private static OperationStatus DecodeFromUtf8(ReadOnlySpan source, out uint result, out int bytesConsumed) - { - // This method follows the Unicode Standard's recommendation for detecting - // the maximal subpart of an ill-formed subsequence. See The Unicode Standard, - // Ch. 3.9 for more details. In summary, when reporting an invalid subsequence, - // it tries to consume as many code units as possible as long as those code - // units constitute the beginning of a longer well-formed subsequence per Table 3-7. - - // Try reading source[0]. - - int index = 0; - if (source.IsEmpty) - { - goto NeedsMoreData; - } - - uint tempValue = source[0]; - if (UnicodeUtility.IsAsciiCodePoint(tempValue)) - { - bytesConsumed = 1; - result = tempValue; - return OperationStatus.Done; - } - - // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in - // the range [C2..F4]. If it's outside of that range, it's either a standalone - // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range - // four-byte sequence. - - // Try reading source[1]. 
- - index = 1; - if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4)) - { - goto Invalid; - } - - tempValue = (tempValue - 0xC2) << 6; - - if (source.Length <= 1) - { - goto NeedsMoreData; - } - - // Continuation bytes are of the form [10xxxxxx], which means that their two's - // complement representation is in the range [-65..-128]. This allows us to - // perform a single comparison to see if a byte is a continuation byte. - - int thisByteSignExtended = (sbyte)source[1]; - if (thisByteSignExtended >= -64) - { - goto Invalid; - } - - tempValue += (uint)thisByteSignExtended; - tempValue += 0x80; // remove the continuation byte marker - tempValue += (0xC2 - 0xC0) << 6; // remove the leading byte marker - - if (tempValue < 0x0800) - { - Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0080, 0x07FF)); - goto Finish; // this is a valid 2-byte sequence - } - - // This appears to be a 3- or 4-byte sequence. Since per Table 3-7 we now have - // enough information (from just two code units) to detect overlong or surrogate - // sequences, we need to perform these checks now. - - if (!UnicodeUtility.IsInRangeInclusive(tempValue, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80))) - { - // The first two bytes were not in the range [[E0 A0]..[F4 8F]]. - // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence. - goto Invalid; - } - - if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80))) - { - // This is a UTF-16 surrogate code point, which is invalid in UTF-8. - goto Invalid; - } - - if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80))) - { - // This is an overlong 4-byte sequence. - goto Invalid; - } - - // The first two bytes were just fine. We don't need to perform any other checks - // on the remaining bytes other than to see that they're valid continuation bytes. 
- - // Try reading source[2]. - - index = 2; - if (source.Length <= 2) - { - goto NeedsMoreData; - } - - thisByteSignExtended = (sbyte)source[2]; - if (thisByteSignExtended >= -64) - { - goto Invalid; // this byte is not a UTF-8 continuation byte - } - - tempValue <<= 6; - tempValue += (uint)thisByteSignExtended; - tempValue += 0x80; // remove the continuation byte marker - tempValue -= (0xE0 - 0xC0) << 12; // remove the leading byte marker - - if (tempValue <= 0xFFFF) - { - Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0800, 0xFFFF)); - goto Finish; // this is a valid 3-byte sequence - } - - // Try reading source[3]. - - index = 3; - if (source.Length <= 3) - { - goto NeedsMoreData; - } - - thisByteSignExtended = (sbyte)source[3]; - if (thisByteSignExtended >= -64) - { - goto Invalid; // this byte is not a UTF-8 continuation byte - } - - tempValue <<= 6; - tempValue += (uint)thisByteSignExtended; - tempValue += 0x80; // remove the continuation byte marker - tempValue -= (0xF0 - 0xE0) << 18; // remove the leading byte marker - - // Valid 4-byte sequence - UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue); - - Finish: - - bytesConsumed = index + 1; - Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4] - result = tempValue; - return OperationStatus.Done; - - NeedsMoreData: - - Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3 - bytesConsumed = index; - result = (char)UnicodeUtility.ReplacementChar; - return OperationStatus.NeedMoreData; - - Invalid: - - Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3 - bytesConsumed = index; - result = (char)UnicodeUtility.ReplacementChar; - return OperationStatus.InvalidData; - } -#endif } } From 90a3613de346d547a6b22b80ef958fd484d72e01 Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Sat, 8 Mar 2025 16:04:04 +0200 Subject: [PATCH 11/12] Fix compile errors. 
(cherry picked from commit bf6f989f91e88ea75a07e4870faf98a8fb711b88) --- .../src/Microsoft.Bcl.Memory.csproj | 4 +- .../src/System/ThrowHelper.cs | 62 +++++++++++++++++++ .../src/System/Text/Rune.cs | 6 ++ 3 files changed, 71 insertions(+), 1 deletion(-) create mode 100644 src/libraries/Microsoft.Bcl.Memory/src/System/ThrowHelper.cs diff --git a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj index 50507bb2d97b6d..94483244d3b46a 100644 --- a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj @@ -1,4 +1,4 @@ - + $(NetCoreAppCurrent);$(NetCoreAppPrevious);$(NetCoreAppMinimum);netstandard2.1;netstandard2.0;$(NetFrameworkMinimum) @@ -29,8 +29,10 @@ + + diff --git a/src/libraries/Microsoft.Bcl.Memory/src/System/ThrowHelper.cs b/src/libraries/Microsoft.Bcl.Memory/src/System/ThrowHelper.cs new file mode 100644 index 00000000000000..8b6714cf110379 --- /dev/null +++ b/src/libraries/Microsoft.Bcl.Memory/src/System/ThrowHelper.cs @@ -0,0 +1,62 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Diagnostics; +using System.Diagnostics.CodeAnalysis; + +namespace System +{ + internal static class ThrowHelper + { + [DoesNotReturn] + internal static void ThrowArgumentException_DestinationTooShort() + { + throw new ArgumentException(SR.Argument_DestinationTooShort, "destination"); + } + + [DoesNotReturn] + internal static void ThrowArgumentNullException(ExceptionArgument argument) + { + throw new ArgumentNullException(GetArgumentName(argument)); + } + + [DoesNotReturn] + internal static void ThrowArgumentOutOfRangeException(ExceptionArgument argument) + { + throw new ArgumentOutOfRangeException(GetArgumentName(argument)); + } + + private static string GetArgumentName(ExceptionArgument argument) + { + switch (argument) + { + case ExceptionArgument.ch: + return nameof(ExceptionArgument.ch); + case ExceptionArgument.culture: + return nameof(ExceptionArgument.culture); + case ExceptionArgument.index: + return nameof(ExceptionArgument.index); + case ExceptionArgument.input: + return nameof(ExceptionArgument.input); + case ExceptionArgument.value: + return nameof(ExceptionArgument.value); + default: + Debug.Fail("The enum value is not defined, please check the ExceptionArgument Enum."); + return ""; + + } + } + } + + // + // The convention for this enum is using the argument name as the enum name + // + internal enum ExceptionArgument + { + ch, + culture, + index, + input, + value, + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs index 4e72eca5fd7566..9325bc14380b42 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs @@ -781,6 +781,7 @@ public int EncodeToUtf8(Span destination) public override int GetHashCode() => Value; +#if !MICROSOFT_BCL_MEMORY /// /// Gets the which begins at index in /// string . 
@@ -799,6 +800,7 @@ public static Rune GetRuneAt(string input, int index) return UnsafeCreate((uint)runeValue); } +#endif /// /// Returns iff is a valid Unicode scalar @@ -850,6 +852,7 @@ internal static int ReadFirstRuneFromUtf16Buffer(ReadOnlySpan input) return (int)returnValue; } +#if !MICROSOFT_BCL_MEMORY // returns a negative number on failure private static int ReadRuneFromString(string input, int index) { @@ -897,6 +900,7 @@ private static int ReadRuneFromString(string input, int index) return (int)returnValue; } +#endif /// /// Returns a representation of this instance. @@ -1128,6 +1132,7 @@ private static bool TryEncodeToUtf8(Rune value, Span destination, out int return false; } +#if !MICROSOFT_BCL_MEMORY /// /// Attempts to get the which begins at index in /// string . @@ -1151,6 +1156,7 @@ public static bool TryGetRuneAt(string input, int index, out Rune value) return false; } } +#endif // Allows constructing a Unicode scalar value from an arbitrary 32-bit integer without // validation. It is the caller's responsibility to have performed manual validation From 412071a511179b77a8854d8a71259ed1be1b796c Mon Sep 17 00:00:00 2001 From: Theodore Tsirpanis Date: Fri, 18 Apr 2025 01:36:35 +0300 Subject: [PATCH 12/12] Address PR feedback. 
(cherry picked from commit 445a2325c937eea57f4acea30ed63fae5a2dce13) --- .../src/Microsoft.Bcl.Memory.csproj | 2 +- .../src/System/Text/Rune.cs | 6 ++--- .../src/System/Text/Unicode/Utf8.cs | 9 ++----- .../System/Text/Unicode/Utf8Tests.cs | 26 ++++--------------- 4 files changed, 11 insertions(+), 32 deletions(-) diff --git a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj index 94483244d3b46a..7aac2e95bd52e4 100644 --- a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj @@ -5,7 +5,7 @@ true $(DefineConstants);MICROSOFT_BCL_MEMORY true - Provides Base64Url, Utf8, Index and Range types support for .NET Framework and .NET Standard. + Provides Base64Url, Utf8, Index, and Range types support for .NET Framework and .NET Standard. diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs index 9325bc14380b42..66b6c93c401213 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Rune.cs @@ -781,7 +781,7 @@ public int EncodeToUtf8(Span destination) public override int GetHashCode() => Value; -#if !MICROSOFT_BCL_MEMORY +#if SYSTEM_PRIVATE_CORELIB /// /// Gets the which begins at index in /// string . @@ -852,7 +852,7 @@ internal static int ReadFirstRuneFromUtf16Buffer(ReadOnlySpan input) return (int)returnValue; } -#if !MICROSOFT_BCL_MEMORY +#if SYSTEM_PRIVATE_CORELIB // returns a negative number on failure private static int ReadRuneFromString(string input, int index) { @@ -1132,7 +1132,7 @@ private static bool TryEncodeToUtf8(Rune value, Span destination, out int return false; } -#if !MICROSOFT_BCL_MEMORY +#if SYSTEM_PRIVATE_CORELIB /// /// Attempts to get the which begins at index in /// string . 
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs index 003bd747b837b9..e1aa47dc57bd72 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs @@ -9,15 +9,10 @@ namespace System.Text.Unicode { -#if SYSTEM_PRIVATE_CORELIB || MICROSOFT_BCL_MEMORY /// - /// Provides methods for transcoding between UTF-8 and UTF-16. + /// Provides static methods that convert chunked data between UTF-8 and UTF-16 encodings. /// - public -#else - internal -#endif - static class Utf8 + public static class Utf8 { /* * OperationStatus-based APIs for transcoding of chunked data. diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs index 628768a368997c..0b160ac245f0fd 100644 --- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs +++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs @@ -75,33 +75,17 @@ public static byte[] DecodeHex(ReadOnlySpan inputHex) } return result; - static int FromHex(char c) - { - if (c >= '0' && c <= '9') - { - return c - '0'; - } - else if (c >= 'a' && c <= 'f') - { - return c - 'a' + 10; - } - else - { - return c - 'A' + 10; - } - } + static int FromHex(char c) => + c >= '0' && c <= '9' ? c - '0' : + c >= 'a' && c <= 'f' ? c - 'a' + 10 : + c - 'A' + 10; #endif } // !! IMPORTANT !! // Don't delete this implementation, as we use it as a reference to make sure the framework's // transcoding logic is correct. -#if NET - public -#else - private -#endif - static byte[] ToUtf8(Rune rune) + private static byte[] ToUtf8(Rune rune) { Assert.True(Rune.IsValid(rune.Value), $"Rune with value U+{(uint)rune.Value:X4} is not well-formed.");