From c043cbdd6b410e51c8a1c35ca39fa03a877e6e90 Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sat, 11 Jan 2025 06:11:10 +0700 Subject: [PATCH 1/9] Add support for System.Text.Unicode.Utf8 This commit introduces the `System.Text.Unicode.Utf8` type to the `Microsoft.Bcl.Memory` library. It includes type forwarding for `Utf8` in `Microsoft.Bcl.Memory.Forwards.cs`, updates the documentation in `PACKAGE.md` to include `Utf8` functionality, and adds corresponding test cases in `Microsoft.Bcl.Memory.Tests.csproj`. The documentation now emphasizes `Utf8` alongside `Index`, `Range`, and `Base64Url`, highlighting its role in converting data between UTF-8 and UTF-16 encodings. --- .../src/Microsoft.Bcl.Memory.Forwards.cs | 3 +++ src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md | 4 ++++ .../tests/Microsoft.Bcl.Memory.Tests.csproj | 7 +++++++ 3 files changed, 14 insertions(+) diff --git a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.Forwards.cs b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.Forwards.cs index bf96db3ece9e41..e3fdeb43392ee0 100644 --- a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.Forwards.cs +++ b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.Forwards.cs @@ -3,6 +3,9 @@ [assembly: System.Runtime.CompilerServices.TypeForwardedTo(typeof(System.Index))] [assembly: System.Runtime.CompilerServices.TypeForwardedTo(typeof(System.Range))] +#if NET8_0_OR_GREATER +[assembly: System.Runtime.CompilerServices.TypeForwardedTo(typeof(System.Text.Unicode.Utf8))] +#endif #if NET9_0_OR_GREATER [assembly: System.Runtime.CompilerServices.TypeForwardedTo(typeof(System.Buffers.Text.Base64Url))] #endif diff --git a/src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md b/src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md index eac081364db99d..af99a7b334b050 100644 --- a/src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md +++ b/src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md @@ -2,6 +2,7 @@ Provides `Index` and `Range` 
types to simplify slicing operations on collections for .NET Framework and .NET Standard 2.0. Provides `Base64Url` for encoding data in a URL-safe manner on .NET Framework and .NET Standard. +Provides `Utf8` for converting chunked data between UTF-8 and UTF-16 encodings on .NET Framework and .NET Standard. This library is not necessary nor recommended when targeting versions of .NET that include the relevant support. @@ -11,6 +12,7 @@ This library is not necessary nor recommended when targeting versions of .NET th * Enables the use of `Index` and `Range` types on older .NET platforms. * Provides `Base64Url` encoding, decoding, and validation for URL-safe data processing on older .NET platforms. +* Provides `Utf8` transcoding of chunked data between UTF-8 and UTF-16, and UTF-8 validation, on older .NET platforms. ## How to Use @@ -64,6 +66,7 @@ The main types provided by this library are: * `System.Index` * `System.Range` * `System.Buffers.Text.Base64Url` +* `System.Text.Unicode.Utf8` ## Additional Documentation @@ -74,6 +77,7 @@ API documentation * [System.Index](https://learn.microsoft.com/dotnet/api/system.index) * [System.Range](https://learn.microsoft.com/dotnet/api/system.range) * [System.Buffers.Text.Base64Url](https://learn.microsoft.com/dotnet/api/system.buffers.text.base64url) +* [System.Text.Unicode.Utf8](https://learn.microsoft.com/dotnet/api/system.text.unicode.utf8) ## Feedback & Contributing diff --git a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj index b7458269bedff7..8aea9932efadde 100644 --- a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj @@ -32,6 +32,13 @@ + + + + System\Text\Unicode\Utf8Tests.cs + + + From f60df7c2558efb4217cfbdd4cd6f56ff97da47f6 Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sat, 11 Jan 2025 19:49:41 
+0700 Subject: [PATCH 2/9] Enhance UTF-8 and Unicode support in BCL Memory Updated `PackageDescription` to include "Utf8" support. Added new `ItemGroup` for conditional compilation of UTF-8 and Unicode handling files for non-net8.0 frameworks. Modified visibility and implementations in `Ascii.Utility.Helpers.cs`, `Utf8.cs`, and `Utf8Utility` based on `MICROSOFT_BCL_MEMORY` define. --- .../src/Microsoft.Bcl.Memory.csproj | 14 +- .../src/System/Text/Ascii.Utility.Helpers.cs | 45 +++- .../src/System/Text/Ascii.Utility.cs | 43 +++- .../src/System/Text/Unicode/Utf8.cs | 198 +++++++++++++++++- .../Text/Unicode/Utf8Utility.Helpers.cs | 4 + .../Text/Unicode/Utf8Utility.Transcoding.cs | 10 + .../Text/Unicode/Utf8Utility.Validation.cs | 16 +- .../src/System/Text/Unicode/Utf8Utility.cs | 4 + .../src/System/Text/UnicodeUtility.cs | 10 + 9 files changed, 336 insertions(+), 8 deletions(-) diff --git a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj index c542c078d20cb6..4f9a741965fd28 100644 --- a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj @@ -5,7 +5,7 @@ true $(DefineConstants);MICROSOFT_BCL_MEMORY true - Provides Base64Url, Index and Range types support for .NET Framework and .NET Standard. + Provides Base64Url, Utf8, Index and Range types support for .NET Framework and .NET Standard. 
@@ -27,6 +27,18 @@ + + + + + + + + + + + + diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs index 30467a1843a32a..5d22d8a29f5f6b 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs @@ -4,10 +4,16 @@ using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; namespace System.Text { - public static partial class Ascii +#if !MICROSOFT_BCL_MEMORY + public +#else + internal +#endif + static partial class Ascii { /// /// A mask which selects only the high bit of each byte of the given . @@ -44,7 +50,11 @@ internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiDat if (BitConverter.IsLittleEndian) { +#if !MICROSOFT_BCL_MEMORY return (uint)BitOperations.TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3; +#else + return (uint)TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3; +#endif } else { @@ -60,22 +70,55 @@ internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiDat // expensive. Instead we'll just change how we perform the shifts. 
// Read first byte +#if !MICROSOFT_BCL_MEMORY value = BitOperations.RotateLeft(value, 1); +#else + value = (value << 1) | (value >> (32 - 1)); +#endif uint allBytesUpToNowAreAscii = value & 1; uint numAsciiBytes = allBytesUpToNowAreAscii; // Read second byte +#if !MICROSOFT_BCL_MEMORY value = BitOperations.RotateLeft(value, 8); +#else + value = (value << 8) | (value >> (32 - 8)); +#endif allBytesUpToNowAreAscii &= value; numAsciiBytes += allBytesUpToNowAreAscii; // Read third byte +#if !MICROSOFT_BCL_MEMORY value = BitOperations.RotateLeft(value, 8); +#else + value = (value << 8) | (value >> (32 - 8)); +#endif allBytesUpToNowAreAscii &= value; numAsciiBytes += allBytesUpToNowAreAscii; return numAsciiBytes; } } + +#if MICROSOFT_BCL_MEMORY + private static ReadOnlySpan TrailingZeroCountDeBruijn => // 32 + [ + 00, 01, 28, 02, 29, 14, 24, 03, + 30, 22, 20, 15, 25, 17, 04, 08, + 31, 27, 13, 23, 21, 19, 16, 07, + 26, 12, 18, 06, 11, 05, 10, 09 + ]; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int TrailingZeroCount(uint value) + { + // uint.MaxValue >> 27 is always in range [0 - 31] so we use Unsafe.AddByteOffset to avoid bounds check + return Unsafe.AddByteOffset( + // Using deBruijn sequence, k=2, n=5 (2^5=32) : 0b_0000_0111_0111_1100_1011_0101_0011_0001u + ref MemoryMarshal.GetReference(TrailingZeroCountDeBruijn), + // uint|long -> IntPtr cast on 32-bit platforms does expensive overflow checks not needed here + (IntPtr)(int)(((value & (uint)-(int)value) * 0x077CB531u) >> 27)); // Multi-cast mitigates redundant conv.u8 + } +#endif } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 580550467fe589..61fe8e4a55df3e 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -4,14 +4,21 @@ using System.Diagnostics; using 
System.Numerics; using System.Runtime.CompilerServices; +#if !MICROSOFT_BCL_MEMORY using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.Wasm; using System.Runtime.Intrinsics.X86; +#endif namespace System.Text { - public static partial class Ascii +#if !MICROSOFT_BCL_MEMORY + public +#else + internal +#endif + static partial class Ascii { /// /// Returns iff all bytes in are ASCII. @@ -53,6 +60,7 @@ private static bool AllCharsInUInt64AreAscii(ulong value) : AllCharsInUInt64AreAscii(value); } +#if !MICROSOFT_BCL_MEMORY [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] private static int GetIndexOfFirstNonAsciiByteInLane_AdvSimd(Vector128 value, Vector128 bitmask) @@ -75,6 +83,7 @@ private static int GetIndexOfFirstNonAsciiByteInLane_AdvSimd(Vector128 val Debug.Assert((mask != 0) ? index < 16 : index >= 16); return index; } +#endif /// /// Given a DWORD which represents two packed chars in machine-endian order, @@ -102,6 +111,7 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bu // like pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while // this method is running. +#if !MICROSOFT_BCL_MEMORY if (!Vector512.IsHardwareAccelerated && !Vector256.IsHardwareAccelerated && (Sse2.IsSupported || AdvSimd.IsSupported)) @@ -109,7 +119,9 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bu return GetIndexOfFirstNonAsciiByte_Intrinsified(pBuffer, bufferLength); } else +#endif { + // Handles Vector512, Vector256, Vector128, and scalar. return GetIndexOfFirstNonAsciiByte_Vector(pBuffer, bufferLength); } @@ -128,6 +140,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu // Note use of SBYTE instead of BYTE below; we're using the two's-complement // representation of negative integers to act as a surrogate for "is ASCII?". 
+#if !MICROSOFT_BCL_MEMORY if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) { if (Vector512.Load(pBuffer).ExtractMostSignificantBits() == 0) @@ -236,6 +249,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu bufferLength += (nuint)pOriginalBuffer; } } +#endif // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code @@ -332,6 +346,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu goto Finish; } +#if !MICROSOFT_BCL_MEMORY [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool ContainsNonAsciiByte_Sse2(uint sseMask) { @@ -702,6 +717,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Intrinsified(byte* pBuff goto Finish; } +#endif /// /// Returns the index in where the first non-ASCII char is found. @@ -716,6 +732,7 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bu // like pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while // this method is running. +#if !MICROSOFT_BCL_MEMORY if (!Vector512.IsHardwareAccelerated && !Vector256.IsHardwareAccelerated && (Sse2.IsSupported || AdvSimd.IsSupported)) @@ -723,6 +740,7 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bu return GetIndexOfFirstNonAsciiChar_Intrinsified(pBuffer, bufferLength); } else +#endif { // Handles Vector512, Vector256, Vector128, and scalar. return GetIndexOfFirstNonAsciiChar_Vector(pBuffer, bufferLength); @@ -740,6 +758,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char)); #endif +#if !MICROSOFT_BCL_MEMORY // Before we drain off char-by-char, try a generic vectorized loop. // Only run the loop if we have at least two vectors we can pull out. 
if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) @@ -849,7 +868,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu bufferLength -= ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char); } } - +#endif // At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform // a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code @@ -932,6 +951,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu goto Finish; } +#if !MICROSOFT_BCL_MEMORY private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuffer, nuint bufferLength /* in chars */) { // This method contains logic optimized using vector instructions for both x64 and Arm64. @@ -1235,6 +1255,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuff goto Finish; } +#endif /// /// Given a QWORD which represents a buffer of 4 ASCII chars in machine-endian order, @@ -1246,6 +1267,7 @@ private static void NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref byte outputB { Debug.Assert(AllCharsInUInt64AreAscii(value)); +#if !MICROSOFT_BCL_MEMORY if (Sse2.X64.IsSupported) { // Narrows a vector of words [ w0 w1 w2 w3 ] to a vector of bytes @@ -1264,8 +1286,8 @@ private static void NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref byte outputB Vector64 lower = AdvSimd.ExtractNarrowingSaturateUnsignedLower(vecWide); Unsafe.WriteUnaligned(ref outputBuffer, lower.AsUInt32().ToScalar()); } - else +#endif { if (BitConverter.IsLittleEndian) { @@ -1325,6 +1347,7 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii uint utf16Data32BitsHigh = 0, utf16Data32BitsLow = 0; ulong utf16Data64Bits = 0; +#if !MICROSOFT_BCL_MEMORY if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector128.Count) { // Since there's overhead to setting up the vectorized code 
path, we only want to @@ -1361,6 +1384,7 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii currentOffset = NarrowUtf16ToAscii_Intrinsified(pUtf16Buffer, pAsciiBuffer, elementCount); } } +#endif Debug.Assert(currentOffset <= elementCount); nuint remainingElementCount = elementCount - currentOffset; @@ -1496,6 +1520,7 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii goto Finish; } +#if !MICROSOFT_BCL_MEMORY [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool VectorContainsNonAsciiChar(Vector128 asciiVector) { @@ -2032,6 +2057,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff goto Finish; } +#endif /// /// Copies as many ASCII bytes (00..7F) as possible from @@ -2044,6 +2070,7 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B // Intrinsified in mono interpreter nuint currentOffset = 0; +#if !MICROSOFT_BCL_MEMORY if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= (uint)Vector128.Count) { if (Vector512.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector512.Count) @@ -2059,6 +2086,7 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B WidenAsciiToUtf1_Vector, Vector128>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount); } } +#endif Debug.Assert(currentOffset <= elementCount); nuint remainingElementCount = elementCount - currentOffset; @@ -2149,7 +2177,11 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B { while ((asciiData & 0x80000000) == 0) { +#if !MICROSOFT_BCL_MEMORY asciiData = BitOperations.RotateLeft(asciiData, 8); +#else + asciiData = (asciiData << 8) | (asciiData >> (32 - 8)); ; +#endif pUtf16Buffer[currentOffset] = (char)(byte)asciiData; currentOffset++; } @@ -2158,6 +2190,7 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B goto Finish; } +#if 
!MICROSOFT_BCL_MEMORY [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void WidenAsciiToUtf1_Vector(byte* pAsciiBuffer, char* pUtf16Buffer, ref nuint currentOffset, nuint elementCount) where TVectorByte : unmanaged, ISimdVector @@ -2235,7 +2268,7 @@ private static (TVectorUInt16 Lower, TVectorUInt16 Upper) Widen /// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and @@ -2246,6 +2279,7 @@ internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputB { Debug.Assert(AllBytesInUInt32AreAscii(value)); +#if !MICROSOFT_BCL_MEMORY if (AdvSimd.Arm64.IsSupported) { Vector128 vecNarrow = AdvSimd.DuplicateToVector128(value).AsByte(); @@ -2259,6 +2293,7 @@ internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputB Unsafe.WriteUnaligned(ref Unsafe.As(ref outputBuffer), vecWide.ToScalar()); } else +#endif { if (BitConverter.IsLittleEndian) { diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs index cb4cd8bde42216..630d8db69ede89 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs @@ -9,7 +9,10 @@ namespace System.Text.Unicode { -#if SYSTEM_PRIVATE_CORELIB +#if SYSTEM_PRIVATE_CORELIB || MICROSOFT_BCL_MEMORY + /// + /// Provides methods for transcoding between UTF-8 and UTF-16. + /// public #else internal @@ -200,7 +203,11 @@ public static unsafe OperationStatus ToUtf16(ReadOnlySpan source, Span source, Span source, Span destination, out int bytesRead, out int charsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true) { // NOTE: Changes to this method should be kept in sync with ToUtf16 above. @@ -807,6 +815,7 @@ private bool Fail() return false; } } +#endif /// /// Validates that the value is well-formed UTF-8. 
@@ -815,5 +824,192 @@ private bool Fail() /// true if value is well-formed UTF-8, false otherwise. public static bool IsValid(ReadOnlySpan value) => Utf8Utility.GetIndexOfFirstInvalidUtf8Sequence(value, out _) < 0; + +#if MICROSOFT_BCL_MEMORY + /// <summary> + /// Decodes the Rune at the beginning of the provided UTF-8 source buffer. + /// </summary> + /// <returns> + /// <para> + /// If the source buffer begins with a valid UTF-8 encoded scalar value, returns <see cref="OperationStatus.Done"/>, + /// and outs via <paramref name="result"/> the decoded Rune and via <paramref name="bytesConsumed"/> the + /// number of <see langword="byte"/>s used in the input buffer to encode the Rune. + /// </para> + /// <para> + /// If the source buffer is empty or contains only a partial UTF-8 subsequence, returns <see cref="OperationStatus.NeedMoreData"/>, + /// and outs via <paramref name="result"/> ReplacementChar and via <paramref name="bytesConsumed"/> the length of the input buffer. + /// </para> + /// <para> + /// If the source buffer begins with an ill-formed UTF-8 encoded scalar value, returns <see cref="OperationStatus.InvalidData"/>, + /// and outs via <paramref name="result"/> ReplacementChar and via <paramref name="bytesConsumed"/> the number of + /// <see langword="byte"/>s used in the input buffer to encode the ill-formed sequence. + /// </para> + /// </returns> + /// <remarks> + /// The general calling convention is to call this method in a loop, slicing the <paramref name="source"/> buffer by + /// <paramref name="bytesConsumed"/> elements on each iteration of the loop. On each iteration of the loop <paramref name="result"/> + /// will contain the real scalar value if successfully decoded, or it will contain ReplacementChar if + /// the data could not be successfully decoded. This pattern provides convenient automatic U+FFFD substitution of + /// invalid sequences while iterating through the loop. + /// </remarks> + private static OperationStatus DecodeFromUtf8(ReadOnlySpan source, out uint result, out int bytesConsumed) + { + // This method follows the Unicode Standard's recommendation for detecting + // the maximal subpart of an ill-formed subsequence. See The Unicode Standard, + // Ch. 3.9 for more details. In summary, when reporting an invalid subsequence, + // it tries to consume as many code units as possible as long as those code + // units constitute the beginning of a longer well-formed subsequence per Table 3-7. + + // Try reading source[0]. 
+ + int index = 0; + if (source.IsEmpty) + { + goto NeedsMoreData; + } + + uint tempValue = source[0]; + if (UnicodeUtility.IsAsciiCodePoint(tempValue)) + { + bytesConsumed = 1; + result = tempValue; + return OperationStatus.Done; + } + + // Per Table 3-7, the beginning of a multibyte sequence must be a code unit in + // the range [C2..F4]. If it's outside of that range, it's either a standalone + // continuation byte, or it's an overlong two-byte sequence, or it's an out-of-range + // four-byte sequence. + + // Try reading source[1]. + + index = 1; + if (!UnicodeUtility.IsInRangeInclusive(tempValue, 0xC2, 0xF4)) + { + goto Invalid; + } + + tempValue = (tempValue - 0xC2) << 6; + + if (source.Length <= 1) + { + goto NeedsMoreData; + } + + // Continuation bytes are of the form [10xxxxxx], which means that their two's + // complement representation is in the range [-65..-128]. This allows us to + // perform a single comparison to see if a byte is a continuation byte. + + int thisByteSignExtended = (sbyte)source[1]; + if (thisByteSignExtended >= -64) + { + goto Invalid; + } + + tempValue += (uint)thisByteSignExtended; + tempValue += 0x80; // remove the continuation byte marker + tempValue += (0xC2 - 0xC0) << 6; // remove the leading byte marker + + if (tempValue < 0x0800) + { + Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0080, 0x07FF)); + goto Finish; // this is a valid 2-byte sequence + } + + // This appears to be a 3- or 4-byte sequence. Since per Table 3-7 we now have + // enough information (from just two code units) to detect overlong or surrogate + // sequences, we need to perform these checks now. + + if (!UnicodeUtility.IsInRangeInclusive(tempValue, ((0xE0 - 0xC0) << 6) + (0xA0 - 0x80), ((0xF4 - 0xC0) << 6) + (0x8F - 0x80))) + { + // The first two bytes were not in the range [[E0 A0]..[F4 8F]]. + // This is an overlong 3-byte sequence or an out-of-range 4-byte sequence. 
+ goto Invalid; + } + + if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xED - 0xC0) << 6) + (0xA0 - 0x80), ((0xED - 0xC0) << 6) + (0xBF - 0x80))) + { + // This is a UTF-16 surrogate code point, which is invalid in UTF-8. + goto Invalid; + } + + if (UnicodeUtility.IsInRangeInclusive(tempValue, ((0xF0 - 0xC0) << 6) + (0x80 - 0x80), ((0xF0 - 0xC0) << 6) + (0x8F - 0x80))) + { + // This is an overlong 4-byte sequence. + goto Invalid; + } + + // The first two bytes were just fine. We don't need to perform any other checks + // on the remaining bytes other than to see that they're valid continuation bytes. + + // Try reading source[2]. + + index = 2; + if (source.Length <= 2) + { + goto NeedsMoreData; + } + + thisByteSignExtended = (sbyte)source[2]; + if (thisByteSignExtended >= -64) + { + goto Invalid; // this byte is not a UTF-8 continuation byte + } + + tempValue <<= 6; + tempValue += (uint)thisByteSignExtended; + tempValue += 0x80; // remove the continuation byte marker + tempValue -= (0xE0 - 0xC0) << 12; // remove the leading byte marker + + if (tempValue <= 0xFFFF) + { + Debug.Assert(UnicodeUtility.IsInRangeInclusive(tempValue, 0x0800, 0xFFFF)); + goto Finish; // this is a valid 3-byte sequence + } + + // Try reading source[3]. 
+ + index = 3; + if (source.Length <= 3) + { + goto NeedsMoreData; + } + + thisByteSignExtended = (sbyte)source[3]; + if (thisByteSignExtended >= -64) + { + goto Invalid; // this byte is not a UTF-8 continuation byte + } + + tempValue <<= 6; + tempValue += (uint)thisByteSignExtended; + tempValue += 0x80; // remove the continuation byte marker + tempValue -= (0xF0 - 0xE0) << 18; // remove the leading byte marker + + // Valid 4-byte sequence + //UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue); + + Finish: + + bytesConsumed = index + 1; + Debug.Assert(1 <= bytesConsumed && bytesConsumed <= 4); // Valid subsequences are always length [1..4] + result = tempValue; + return OperationStatus.Done; + + NeedsMoreData: + + Debug.Assert(0 <= index && index <= 3); // Incomplete subsequences are always length 0..3 + bytesConsumed = index; + result = (char)UnicodeUtility.ReplacementChar; + return OperationStatus.NeedMoreData; + + Invalid: + + Debug.Assert(1 <= index && index <= 3); // Invalid subsequences are always length 1..3 + bytesConsumed = index; + result = (char)UnicodeUtility.ReplacementChar; + return OperationStatus.InvalidData; + } +#endif } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs index b615ee5edd0ad1..25f1f130a01fc8 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs @@ -115,7 +115,11 @@ private static uint ExtractFourUtf8BytesFromSurrogatePair(uint value) value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ] uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ] +#if !MICROSOFT_BCL_MEMORY tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ] +#else + tempA = (tempA << 16) | 
(tempA >> (32 - 16)); ; // = [ 00xxxxxx 00000000 00000000 00000uuu ] +#endif uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ] uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ] diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs index 21645204289880..e11572e43c1afa 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs @@ -6,9 +6,11 @@ using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; +#if !MICROSOFT_BCL_MEMORY using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; +#endif namespace System.Text.Unicode { @@ -598,7 +600,11 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng // At this point, toCheck = [ 00000000 00000000 10zzzzzz 11110www ]. +#if !MICROSOFT_BCL_MEMORY toCheck = BitOperations.RotateRight(toCheck, 8); +#else + toCheck = (toCheck >> 8) | (toCheck << (32 - 8)); +#endif // At this point, toCheck = [ 11110www 00000000 00000000 10zzzzzz ]. @@ -878,12 +884,14 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt // vector is only used in those code paths, we leave it uninitialized if SSE4.1 // is not enabled. +#if !MICROSOFT_BCL_MEMORY Vector128 nonAsciiUtf16DataMask; if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)) { nonAsciiUtf16DataMask = Vector128.Create(unchecked((short)0xFF80)); // mask of non-ASCII bits in a UTF-16 char } +#endif // Begin the main loop. 
@@ -938,6 +946,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt uint inputCharsRemaining = (uint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + 2; uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining); +#if !MICROSOFT_BCL_MEMORY if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)) { // Try reading and writing 8 elements per iteration. @@ -1081,6 +1090,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt goto AfterReadDWordSkipAllCharsAsciiCheck; } else +#endif { // Can't use SSE41 x64, so we'll only read and write 4 elements per iteration. uint maxIters = minElementsRemaining / 4; diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index a542dad72b5c33..a36f64066602ec 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -5,9 +5,11 @@ using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; +#if !MICROSOFT_BCL_MEMORY using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; +#endif namespace System.Text.Unicode { @@ -113,13 +115,14 @@ internal static unsafe partial class Utf8Utility // the alignment check consumes at most a single DWORD.) byte* pInputBufferFinalPosAtWhichCanSafelyLoop = pFinalPosWhereCanReadDWordFromInputBuffer - 3 * sizeof(uint); // can safely read 4 DWORDs here - nuint trailingZeroCount; // pInputBuffer is 32-bit aligned but not necessary 128-bit aligned, so we're // going to perform an unaligned load. 
We don't necessarily care about aligning // this because we pessimistically assume we'll encounter non-ASCII data at some // point in the not-too-distant future (otherwise we would've stayed entirely // within the all-ASCII vectorized code at the entry to this method). +#if !MICROSOFT_BCL_MEMORY + nuint trailingZeroCount; if (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) { // declare bitMask128 inside of the AdvSimd.Arm64.IsSupported check @@ -140,9 +143,11 @@ internal static unsafe partial class Utf8Utility } while (pInputBuffer <= pInputBufferFinalPosAtWhichCanSafelyLoop); } else +#endif { do { +#if !MICROSOFT_BCL_MEMORY if (Sse2.IsSupported) { uint mask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pInputBuffer)); @@ -153,6 +158,7 @@ internal static unsafe partial class Utf8Utility } } else +#endif { if (!Ascii.AllBytesInUInt32AreAscii(((uint*)pInputBuffer)[0] | ((uint*)pInputBuffer)[1])) { @@ -171,6 +177,7 @@ internal static unsafe partial class Utf8Utility continue; // need to perform a bounds check because we might be running out of data +#if !MICROSOFT_BCL_MEMORY LoopTerminatedEarlyDueToNonAsciiData: // x86 can only be little endian, while ARM can be big or little endian // so if we reached this label we need to check both combinations are supported @@ -192,6 +199,7 @@ internal static unsafe partial class Utf8Utility thisDWord = Unsafe.ReadUnaligned(pInputBuffer); // no longer guaranteed to be aligned goto BeforeProcessTwoByteSequence; +#endif LoopTerminatedEarlyDueToNonAsciiDataInSecondPair: @@ -597,7 +605,11 @@ internal static unsafe partial class Utf8Utility // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding) // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding). +#if !MICROSOFT_BCL_MEMORY thisDWord = BitOperations.RotateRight(thisDWord, 8); +#else + thisDWord = (thisDWord >> 8) | (thisDWord << (32 - 8)); +#endif // Now, thisDWord = [ 00010uuu 10000000 00000000 00uuzzzz ]. 
// The check is now a simple add / cmp / jcc combo. @@ -739,6 +751,7 @@ internal static unsafe partial class Utf8Utility return pInputBuffer; } +#if !MICROSOFT_BCL_MEMORY [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bitMask128) @@ -753,5 +766,6 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits); return extractedBits.AsUInt64().ToScalar(); } +#endif } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs index 9708c32e49e24d..ed26341a9e67f7 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs @@ -4,7 +4,9 @@ using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +#if !MICROSOFT_BCL_MEMORY using System.Runtime.Intrinsics; +#endif namespace System.Text.Unicode { @@ -251,6 +253,7 @@ internal static bool UInt64OrdinalIgnoreCaseAscii(ulong valueA, ulong valueB) return (valueA | letterMaskA) == (valueB | letterMaskB); } +#if !MICROSOFT_BCL_MEMORY /// /// Returns true iff the Vector128 represents 16 ASCII UTF-8 characters in machine endianness. 
/// @@ -288,5 +291,6 @@ internal static bool Vector128OrdinalIgnoreCaseAscii(Vector128 vec1, Vecto // Compare two lowercased vectors return (lcVec1 ^ lcVec2) == Vector128.Zero; } +#endif } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs index eeccfc57597920..af179916fb8c99 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs @@ -17,7 +17,9 @@ internal static class UnicodeUtility /// public static int GetPlane(uint codePoint) { +#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsValidCodePoint(codePoint); +#endif return (int)(codePoint >> 16); } @@ -27,8 +29,10 @@ public static int GetPlane(uint codePoint) /// public static uint GetScalarFromUtf16SurrogatePair(uint highSurrogateCodePoint, uint lowSurrogateCodePoint) { +#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsHighSurrogateCodePoint(highSurrogateCodePoint); UnicodeDebug.AssertIsLowSurrogateCodePoint(lowSurrogateCodePoint); +#endif // This calculation comes from the Unicode specification, Table 3-5. 
// Need to remove the D800 marker from the high surrogate and the DC00 marker from the low surrogate, @@ -43,7 +47,9 @@ public static uint GetScalarFromUtf16SurrogatePair(uint highSurrogateCodePoint, /// public static int GetUtf16SequenceLength(uint value) { +#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsValidScalar(value); +#endif value -= 0x10000; // if value < 0x10000, high byte = 0xFF; else high byte = 0x00 value += (2 << 24); // if value < 0x10000, high byte = 0x01; else high byte = 0x02 @@ -57,7 +63,9 @@ public static int GetUtf16SequenceLength(uint value) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void GetUtf16SurrogatesFromSupplementaryPlaneScalar(uint value, out char highSurrogateCodePoint, out char lowSurrogateCodePoint) { +#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(value); +#endif // This calculation comes from the Unicode specification, Table 3-5. @@ -70,7 +78,9 @@ public static void GetUtf16SurrogatesFromSupplementaryPlaneScalar(uint value, ou /// public static int GetUtf8SequenceLength(uint value) { +#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsValidScalar(value); +#endif // The logic below can handle all valid scalar values branchlessly. // It gives generally good performance across all inputs, and on x86 From 5b86cd6588558d60c3f4ad389315cda10319d720 Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sat, 11 Jan 2025 21:49:51 +0700 Subject: [PATCH 3/9] Enhance Unicode handling in tests and project structure Updated `Microsoft.Bcl.Memory.Tests.csproj` to include `UnicodeUtility.cs` and removed .NET 8.0 targeting condition. Modified `Utf8Tests.cs` by adjusting using directives and enhancing the `DecodeHex` method with conditional compilation for .NET 5.0+. 
--- .../tests/Microsoft.Bcl.Memory.Tests.csproj | 5 +- .../System/Text/Unicode/Utf8Tests.cs | 222 +++++++++++++++++- 2 files changed, 220 insertions(+), 7 deletions(-) diff --git a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj index 8aea9932efadde..6d3ad15b117bf5 100644 --- a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj @@ -30,10 +30,7 @@ System\IndexTests.cs - - - - + System\Text\Unicode\Utf8Tests.cs diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs index a24438dcbb59f9..4a997b6313f94b 100644 --- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs +++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs @@ -2,10 +2,9 @@ // The .NET Foundation licenses this file to you under the MIT license. 
using System.Buffers; +using System.Collections; using System.Collections.Generic; -using System.Globalization; using System.Linq; -using System.Text.RegularExpressions; using Xunit; namespace System.Text.Unicode.Tests @@ -64,13 +63,45 @@ public static byte[] DecodeHex(ReadOnlySpan inputHex) { Assert.Matches(@"^([0-9a-fA-F]{2})*$", inputHex.ToString()); +#if NET5_0_OR_GREATER return Convert.FromHexString(inputHex); +#else + byte[] result = new byte[inputHex.Length / 2]; + for (int i = 0; i < result.Length; i++) + { + var h = FromHex(inputHex[i * 2]); + var l = FromHex(inputHex[i * 2 + 1]); + result[i] = (byte)((h << 4) | l); + } + return result; + + static int FromHex(char c) + { + if (c >= '0' && c <= '9') + { + return c - '0'; + } + else if (c >= 'a' && c <= 'f') + { + return c - 'a' + 10; + } + else + { + return c - 'A' + 10; + } + } +#endif } // !! IMPORTANT !! // Don't delete this implementation, as we use it as a reference to make sure the framework's // transcoding logic is correct. 
- public static byte[] ToUtf8(Rune rune) +#if !MICROSOFT_BCL_MEMORY + public +#else + private +#endif + static byte[] ToUtf8(Rune rune) { Assert.True(Rune.IsValid(rune.Value), $"Rune with value U+{(uint)rune.Value:X4} is not well-formed."); @@ -781,4 +812,189 @@ private static void ToChars_Test_Core(ReadOnlySpan utf8Input, int destinat } } } + +#if !NETCOREAPP3_0_OR_GREATER + internal readonly struct Rune //: IComparable, IComparable, IEquatable + { + private readonly uint _value; + + public Rune(uint value) + { + //if (!UnicodeUtility.IsValidUnicodeScalar(value)) + //{ + // ThrowHelper.ThrowArgumentOutOfRangeException(ExceptionArgument.value); + //} + _value = value; + } + + public Rune(int value) + : this((uint)value) + { + } + private Rune(uint scalarValue, bool _) + { + //UnicodeDebug.AssertIsValidScalar(scalarValue); + _value = scalarValue; + } + + internal static Rune UnsafeCreate(uint scalarValue) => new Rune(scalarValue, false); + + public bool IsBmp => UnicodeUtility.IsBmpCodePoint(_value); + + public static Rune ReplacementChar => UnsafeCreate(UnicodeUtility.ReplacementChar); + + public int Utf16SequenceLength + { + get + { + int codeUnitCount = UnicodeUtility.GetUtf16SequenceLength(_value); + //Debug.Assert(codeUnitCount > 0 && codeUnitCount <= MaxUtf16CharsPerRune); + return codeUnitCount; + } + } + + public int Utf8SequenceLength + { + get + { + int codeUnitCount = UnicodeUtility.GetUtf8SequenceLength(_value); + //Debug.Assert(codeUnitCount > 0 && codeUnitCount <= MaxUtf8BytesPerRune); + return codeUnitCount; + } + } + + public int Value => (int)_value; + + public static bool IsValid(int value) => IsValid((uint)value); + public static bool IsValid(uint value) => UnicodeUtility.IsValidUnicodeScalar(value); + + public static bool TryGetRuneAt(string input, int index, out Rune value) + { + int runeValue = ReadRuneFromString(input, index); + if (runeValue >= 0) + { + value = UnsafeCreate((uint)runeValue); + return true; + } + else + { + value = default; + 
return false; + } + } + + private static int ReadRuneFromString(string input, int index) + { + if (input is null) + { + throw new ArgumentNullException(nameof(input)); + } + + if ((uint)index >= (uint)input.Length) + { + throw new ArgumentOutOfRangeException(nameof(index)); + } + + // Optimistically assume input is within BMP. + + uint returnValue = input[index]; + if (UnicodeUtility.IsSurrogateCodePoint(returnValue)) + { + if (!UnicodeUtility.IsHighSurrogateCodePoint(returnValue)) + { + return -1; + } + + // Treat 'returnValue' as the high surrogate. + // + // If this becomes a hot code path, we can skip the below bounds check by reading + // off the end of the string using unsafe code. Since strings are null-terminated, + // we're guaranteed not to read a valid low surrogate, so we'll fail correctly if + // the string terminates unexpectedly. + + index++; + if ((uint)index >= (uint)input.Length) + { + return -1; // not an argument exception - just a "bad data" failure + } + + uint potentialLowSurrogate = input[index]; + if (!UnicodeUtility.IsLowSurrogateCodePoint(potentialLowSurrogate)) + { + return -1; + } + + returnValue = UnicodeUtility.GetScalarFromUtf16SurrogatePair(returnValue, potentialLowSurrogate); + } + + return (int)returnValue; + } + } + + internal static class StringExtensions + { + public static StringRuneEnumerator EnumerateRunes(this string value) => new StringRuneEnumerator(value); + } + + // An enumerator for retrieving System.Text.Rune instances from a System.String. 
+ internal struct StringRuneEnumerator : IEnumerable, IEnumerator + { + private readonly string _string; + private Rune _current; + private int _nextIndex; + + internal StringRuneEnumerator(string value) + { + _string = value; + _current = default; + _nextIndex = 0; + } + + public Rune Current => _current; + + public StringRuneEnumerator GetEnumerator() => this; + + public bool MoveNext() + { + if ((uint)_nextIndex >= _string.Length) + { + // reached the end of the string + _current = default; + return false; + } + + if (!Rune.TryGetRuneAt(_string, _nextIndex, out _current)) + { + // replace invalid sequences with U+FFFD + _current = Rune.ReplacementChar; + } + + // In UTF-16 specifically, invalid sequences always have length 1, which is the same + // length as the replacement character U+FFFD. This means that we can always bump the + // next index by the current scalar's UTF-16 sequence length. This optimization is not + // generally applicable; for example, enumerating scalars from UTF-8 cannot utilize + // this same trick. + + _nextIndex += _current.Utf16SequenceLength; + return true; + } + + object? IEnumerator.Current => _current; + + void IDisposable.Dispose() + { + // no-op + } + + IEnumerator IEnumerable.GetEnumerator() => this; + + IEnumerator IEnumerable.GetEnumerator() => this; + + void IEnumerator.Reset() + { + _current = default; + _nextIndex = 0; + } + } +#endif } From 22be776fc9faa7086ee028944e70dbba92e71b0e Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sat, 11 Jan 2025 21:52:40 +0700 Subject: [PATCH 4/9] Add compilation constant for Microsoft BCL Memory Added a new property `$(DefineConstants);MICROSOFT_BCL_MEMORY` to the project file to define a new compilation constant for the project. 
--- .../Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj | 1 + 1 file changed, 1 insertion(+) diff --git a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj index 6d3ad15b117bf5..ea4445668b6858 100644 --- a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj @@ -3,6 +3,7 @@ $(NetFrameworkMinimum);$(NetCoreAppCurrent) true + $(DefineConstants);MICROSOFT_BCL_MEMORY From 79214bd9bff06d08f17cfcb906c6782a5c83d450 Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sun, 12 Jan 2025 08:29:54 +0700 Subject: [PATCH 5/9] Apply suggestions from code review Co-authored-by: Theodore Tsirpanis --- .../src/System/Text/Ascii.Utility.Helpers.cs | 2 +- .../System.Private.CoreLib/src/System/Text/Ascii.Utility.cs | 5 ++--- .../System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs | 2 +- .../System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs | 2 +- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs index 5d22d8a29f5f6b..15cf86b085a157 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs @@ -8,7 +8,7 @@ namespace System.Text { -#if !MICROSOFT_BCL_MEMORY +#if SYSTEM_PRIVATE_CORELIB public #else internal diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 61fe8e4a55df3e..69bf2f5f308fc0 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -13,7 +13,7 @@ namespace 
System.Text { -#if !MICROSOFT_BCL_MEMORY +#if SYSTEM_PRIVATE_CORELIB public #else internal @@ -60,7 +60,7 @@ private static bool AllCharsInUInt64AreAscii(ulong value) : AllCharsInUInt64AreAscii(value); } -#if !MICROSOFT_BCL_MEMORY +#if NET [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] private static int GetIndexOfFirstNonAsciiByteInLane_AdvSimd(Vector128 value, Vector128 bitmask) @@ -121,7 +121,6 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bu else #endif { - // Handles Vector512, Vector256, Vector128, and scalar. return GetIndexOfFirstNonAsciiByte_Vector(pBuffer, bufferLength); } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs index 630d8db69ede89..0a89968503a93d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs @@ -987,7 +987,7 @@ private static OperationStatus DecodeFromUtf8(ReadOnlySpan source, out uin tempValue -= (0xF0 - 0xE0) << 18; // remove the leading byte marker // Valid 4-byte sequence - //UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue); + UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(tempValue); Finish: diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs index 4a997b6313f94b..a2407957fc20a9 100644 --- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs +++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs @@ -833,7 +833,7 @@ public Rune(int value) } private Rune(uint scalarValue, bool _) { - //UnicodeDebug.AssertIsValidScalar(scalarValue); + UnicodeDebug.AssertIsValidScalar(scalarValue); _value = scalarValue; } From 
def40c463fb428acf834e6427739fba3ec1036f0 Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sun, 12 Jan 2025 11:10:09 +0700 Subject: [PATCH 6/9] Update Microsoft.Bcl.Memory for framework compatibility - Added polyfill for System.Numerics.BitOperations for .NET Standard 2.0. --- .../src/Microsoft.Bcl.Memory.Forwards.cs | 2 +- .../src/Microsoft.Bcl.Memory.csproj | 16 ++-- .../Microsoft.Bcl.Memory/src/PACKAGE.md | 8 +- ...em.Numerics.BitOperations.netstandard20.cs | 75 +++++++++++++++++++ .../tests/Microsoft.Bcl.Memory.Tests.csproj | 1 + .../src/System/Text/Ascii.Utility.Helpers.cs | 37 --------- .../src/System/Text/Ascii.Utility.cs | 30 ++++---- .../src/System/Text/Unicode/Utf16Utility.cs | 3 +- .../src/System/Text/Unicode/Utf8.cs | 6 +- .../Text/Unicode/Utf8Utility.Helpers.cs | 4 - .../Text/Unicode/Utf8Utility.Transcoding.cs | 10 +-- .../Text/Unicode/Utf8Utility.Validation.cs | 14 ++-- .../src/System/Text/Unicode/Utf8Utility.cs | 4 +- .../src/System/Text/UnicodeUtility.cs | 10 --- .../System/Text/Unicode/Utf8Tests.cs | 2 +- 15 files changed, 118 insertions(+), 104 deletions(-) create mode 100644 src/libraries/Microsoft.Bcl.Memory/src/Polyfills/System.Numerics.BitOperations.netstandard20.cs diff --git a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.Forwards.cs b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.Forwards.cs index e3fdeb43392ee0..d21745f7321c29 100644 --- a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.Forwards.cs +++ b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.Forwards.cs @@ -3,7 +3,7 @@ [assembly: System.Runtime.CompilerServices.TypeForwardedTo(typeof(System.Index))] [assembly: System.Runtime.CompilerServices.TypeForwardedTo(typeof(System.Range))] -#if NET8_0_OR_GREATER +#if NET [assembly: System.Runtime.CompilerServices.TypeForwardedTo(typeof(System.Text.Unicode.Utf8))] #endif #if NET9_0_OR_GREATER diff --git a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj 
b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj index 4f9a741965fd28..67eac34a58cd01 100644 --- a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj @@ -27,16 +27,18 @@ - + + + + + + + + - - - - - @@ -44,7 +46,7 @@ - + diff --git a/src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md b/src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md index af99a7b334b050..c00e6bfbc7fb9b 100644 --- a/src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md +++ b/src/libraries/Microsoft.Bcl.Memory/src/PACKAGE.md @@ -1,8 +1,8 @@ ## About Provides `Index` and `Range` types to simplify slicing operations on collections for .NET Framework and .NET Standard 2.0. -Provides `Base64Url` for encoding data in a URL-safe manner on .NET Framework and .NET Standard. -Provides `Utf8` for converting chunked data between UTF-8 and UTF-16 encodings on .NET Framework and .NET Standard. +Provides `Base64Url` for encoding data in a URL-safe manner on older .NET platforms. +Provides `Utf8` for converting chunked data between UTF-8 and UTF-16 encodings on .NET Framework and .NET Standard 2.0. This library is not necessary nor recommended when targeting versions of .NET that include the relevant support. @@ -10,9 +10,9 @@ This library is not necessary nor recommended when targeting versions of .NET th -* Enables the use of `Index` and `Range` types on older .NET platforms. +* Enables the use of `Index` and `Range` types on .NET Framework and .NET Standard 2.0. * Provides `Base64Url` encoding, decoding, and validation for URL-safe data processing on older .NET platforms. -* Provides `Utf8` encoding, decoding, and validation for chunked data between UTF-8 and UTF-16 on older .NET platforms. +* Provides `Utf8` encoding, decoding, and validation for chunked data between UTF-8 and UTF-16 on .NET Framework and .NET Standard 2.0. 
## How to Use diff --git a/src/libraries/Microsoft.Bcl.Memory/src/Polyfills/System.Numerics.BitOperations.netstandard20.cs b/src/libraries/Microsoft.Bcl.Memory/src/Polyfills/System.Numerics.BitOperations.netstandard20.cs new file mode 100644 index 00000000000000..815d1a85732d42 --- /dev/null +++ b/src/libraries/Microsoft.Bcl.Memory/src/Polyfills/System.Numerics.BitOperations.netstandard20.cs @@ -0,0 +1,75 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// Contains a polyfill implementation of System.Numerics.BitOperations that works on netstandard2.0. +// Implementation copied from: +// https://github.com/dotnet/runtime/blob/82ab89241b90ca3d64b22971f3a1e248da72828a/src/libraries/System.Private.CoreLib/src/System/Numerics/BitOperations.cs +// +// Some routines inspired by the Stanford Bit Twiddling Hacks by Sean Eron Anderson: +// http://graphics.stanford.edu/~seander/bithacks.html + +namespace System.Numerics +{ + internal static class BitOperations + { + // C# no-alloc optimization that directly wraps the data section of the dll (similar to string constants) + // https://github.com/dotnet/roslyn/pull/24621 + + private static ReadOnlySpan TrailingZeroCountDeBruijn => // 32 + [ + 00, 01, 28, 02, 29, 14, 24, 03, + 30, 22, 20, 15, 25, 17, 04, 08, + 31, 27, 13, 23, 21, 19, 16, 07, + 26, 12, 18, 06, 11, 05, 10, 09 + ]; + + /// + /// Count the number of trailing zero bits in an integer value. + /// Similar in behavior to the x86 instruction TZCNT. + /// + /// The value. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int TrailingZeroCount(uint value) + { + // Unguarded fallback contract is 0->0, BSF contract is 0->undefined + if (value == 0) + { + return 32; + } + + // uint.MaxValue >> 27 is always in range [0 - 31] so we use Unsafe.AddByteOffset to avoid bounds check + return Unsafe.AddByteOffset( + // Using deBruijn sequence, k=2, n=5 (2^5=32) : 0b_0000_0111_0111_1100_1011_0101_0011_0001u + ref MemoryMarshal.GetReference(TrailingZeroCountDeBruijn), + // uint|long -> IntPtr cast on 32-bit platforms does expensive overflow checks not needed here + (IntPtr)(int)(((value & (uint)-(int)value) * 0x077CB531u) >> 27)); // Multi-cast mitigates redundant conv.u8 + } + + /// + /// Rotates the specified value left by the specified number of bits. + /// Similar in behavior to the x86 instruction ROL. + /// + /// The value to rotate. + /// The number of bits to rotate by. + /// Any value outside the range [0..31] is treated as congruent mod 32. + /// The rotated value. + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static uint RotateLeft(uint value, int offset) + => (value << offset) | (value >> (32 - offset)); + + /// + /// Rotates the specified value right by the specified number of bits. + /// Similar in behavior to the x86 instruction ROR. + /// + /// The value to rotate. + /// The number of bits to rotate by. + /// Any value outside the range [0..31] is treated as congruent mod 32. + /// The rotated value. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static uint RotateRight(uint value, int offset) + => (value >> offset) | (value << (32 - offset)); + } +} diff --git a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj index ea4445668b6858..729dbbdd3460c4 100644 --- a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj @@ -31,6 +31,7 @@ System\IndexTests.cs + System\Text\Unicode\Utf8Tests.cs diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs index 15cf86b085a157..ed25459c341c49 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.Helpers.cs @@ -50,11 +50,7 @@ internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiDat if (BitConverter.IsLittleEndian) { -#if !MICROSOFT_BCL_MEMORY return (uint)BitOperations.TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3; -#else - return (uint)TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3; -#endif } else { @@ -70,55 +66,22 @@ internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiDat // expensive. Instead we'll just change how we perform the shifts. 
// Read first byte -#if !MICROSOFT_BCL_MEMORY value = BitOperations.RotateLeft(value, 1); -#else - value = (value << 1) | (value >> (32 - 1)); -#endif uint allBytesUpToNowAreAscii = value & 1; uint numAsciiBytes = allBytesUpToNowAreAscii; // Read second byte -#if !MICROSOFT_BCL_MEMORY value = BitOperations.RotateLeft(value, 8); -#else - value = (value << 8) | (value >> (32 - 8)); -#endif allBytesUpToNowAreAscii &= value; numAsciiBytes += allBytesUpToNowAreAscii; // Read third byte -#if !MICROSOFT_BCL_MEMORY value = BitOperations.RotateLeft(value, 8); -#else - value = (value << 8) | (value >> (32 - 8)); -#endif allBytesUpToNowAreAscii &= value; numAsciiBytes += allBytesUpToNowAreAscii; return numAsciiBytes; } } - -#if MICROSOFT_BCL_MEMORY - private static ReadOnlySpan TrailingZeroCountDeBruijn => // 32 - [ - 00, 01, 28, 02, 29, 14, 24, 03, - 30, 22, 20, 15, 25, 17, 04, 08, - 31, 27, 13, 23, 21, 19, 16, 07, - 26, 12, 18, 06, 11, 05, 10, 09 - ]; - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int TrailingZeroCount(uint value) - { - // uint.MaxValue >> 27 is always in range [0 - 31] so we use Unsafe.AddByteOffset to avoid bounds check - return Unsafe.AddByteOffset( - // Using deBruijn sequence, k=2, n=5 (2^5=32) : 0b_0000_0111_0111_1100_1011_0101_0011_0001u - ref MemoryMarshal.GetReference(TrailingZeroCountDeBruijn), - // uint|long -> IntPtr cast on 32-bit platforms does expensive overflow checks not needed here - (IntPtr)(int)(((value & (uint)-(int)value) * 0x077CB531u) >> 27)); // Multi-cast mitigates redundant conv.u8 - } -#endif } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs index 69bf2f5f308fc0..909bd0fb985004 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Ascii.Utility.cs @@ -4,7 +4,7 @@ using System.Diagnostics; using System.Numerics; 
using System.Runtime.CompilerServices; -#if !MICROSOFT_BCL_MEMORY +#if NET using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.Wasm; @@ -111,7 +111,7 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bu // like pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while // this method is running. -#if !MICROSOFT_BCL_MEMORY +#if NET if (!Vector512.IsHardwareAccelerated && !Vector256.IsHardwareAccelerated && (Sse2.IsSupported || AdvSimd.IsSupported)) @@ -139,7 +139,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu // Note use of SBYTE instead of BYTE below; we're using the two's-complement // representation of negative integers to act as a surrogate for "is ASCII?". -#if !MICROSOFT_BCL_MEMORY +#if NET if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) { if (Vector512.Load(pBuffer).ExtractMostSignificantBits() == 0) @@ -345,7 +345,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu goto Finish; } -#if !MICROSOFT_BCL_MEMORY +#if NET [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool ContainsNonAsciiByte_Sse2(uint sseMask) { @@ -731,7 +731,7 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bu // like pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while // this method is running. -#if !MICROSOFT_BCL_MEMORY +#if NET if (!Vector512.IsHardwareAccelerated && !Vector256.IsHardwareAccelerated && (Sse2.IsSupported || AdvSimd.IsSupported)) @@ -757,7 +757,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char)); #endif -#if !MICROSOFT_BCL_MEMORY +#if NET // Before we drain off char-by-char, try a generic vectorized loop. // Only run the loop if we have at least two vectors we can pull out. 
if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512.Count) @@ -950,7 +950,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu goto Finish; } -#if !MICROSOFT_BCL_MEMORY +#if NET private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuffer, nuint bufferLength /* in chars */) { // This method contains logic optimized using vector instructions for both x64 and Arm64. @@ -1266,7 +1266,7 @@ private static void NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref byte outputB { Debug.Assert(AllCharsInUInt64AreAscii(value)); -#if !MICROSOFT_BCL_MEMORY +#if NET if (Sse2.X64.IsSupported) { // Narrows a vector of words [ w0 w1 w2 w3 ] to a vector of bytes @@ -1346,7 +1346,7 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii uint utf16Data32BitsHigh = 0, utf16Data32BitsLow = 0; ulong utf16Data64Bits = 0; -#if !MICROSOFT_BCL_MEMORY +#if NET if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector128.Count) { // Since there's overhead to setting up the vectorized code path, we only want to @@ -1519,7 +1519,7 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii goto Finish; } -#if !MICROSOFT_BCL_MEMORY +#if NET [MethodImpl(MethodImplOptions.AggressiveInlining)] private static bool VectorContainsNonAsciiChar(Vector128 asciiVector) { @@ -2069,7 +2069,7 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B // Intrinsified in mono interpreter nuint currentOffset = 0; -#if !MICROSOFT_BCL_MEMORY +#if NET if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= (uint)Vector128.Count) { if (Vector512.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector512.Count) @@ -2176,11 +2176,7 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B { while ((asciiData & 0x80000000) == 0) { -#if 
!MICROSOFT_BCL_MEMORY asciiData = BitOperations.RotateLeft(asciiData, 8); -#else - asciiData = (asciiData << 8) | (asciiData >> (32 - 8)); ; -#endif pUtf16Buffer[currentOffset] = (char)(byte)asciiData; currentOffset++; } @@ -2189,7 +2185,7 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B goto Finish; } -#if !MICROSOFT_BCL_MEMORY +#if NET [MethodImpl(MethodImplOptions.AggressiveInlining)] private static unsafe void WidenAsciiToUtf1_Vector(byte* pAsciiBuffer, char* pUtf16Buffer, ref nuint currentOffset, nuint elementCount) where TVectorByte : unmanaged, ISimdVector @@ -2278,7 +2274,7 @@ internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputB { Debug.Assert(AllBytesInUInt32AreAscii(value)); -#if !MICROSOFT_BCL_MEMORY +#if NET if (AdvSimd.Arm64.IsSupported) { Vector128 vecNarrow = AdvSimd.DuplicateToVector128(value).AsByte(); diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs index 7a79a3a6592657..992a9f6bdcbab1 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs @@ -3,8 +3,7 @@ using System.Diagnostics; using System.Runtime.CompilerServices; - -#if SYSTEM_PRIVATE_CORELIB +#if NET using System.Runtime.Intrinsics; #endif diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs index 0a89968503a93d..a7044fa03e8155 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs @@ -203,7 +203,7 @@ public static unsafe OperationStatus ToUtf16(ReadOnlySpan source, Span source, Span source, Span destination, out int bytesRead, out int charsWritten, bool replaceInvalidSequences = true, bool 
isFinalBlock = true) { // NOTE: Changes to this method should be kept in sync with ToUtf16 above. @@ -825,7 +825,7 @@ private bool Fail() public static bool IsValid(ReadOnlySpan value) => Utf8Utility.GetIndexOfFirstInvalidUtf8Sequence(value, out _) < 0; -#if MICROSOFT_BCL_MEMORY +#if !NET /// /// Decodes the Rune at the beginning of the provided UTF-8 source buffer. /// diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs index 25f1f130a01fc8..b615ee5edd0ad1 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Helpers.cs @@ -115,11 +115,7 @@ private static uint ExtractFourUtf8BytesFromSurrogatePair(uint value) value += 0x0000_0040u; // = [ 110111yyyyxxxxxx 11011uuuuuzzzzyy ] uint tempA = BinaryPrimitives.ReverseEndianness(value & 0x003F_0700u); // = [ 00000000 00000uuu 00xxxxxx 00000000 ] -#if !MICROSOFT_BCL_MEMORY tempA = BitOperations.RotateLeft(tempA, 16); // = [ 00xxxxxx 00000000 00000000 00000uuu ] -#else - tempA = (tempA << 16) | (tempA >> (32 - 16)); ; // = [ 00xxxxxx 00000000 00000000 00000uuu ] -#endif uint tempB = (value & 0x00FCu) << 6; // = [ 00000000 00000000 00uuzzzz 00000000 ] uint tempC = (value >> 6) & 0x000F_0000u; // = [ 00000000 0000yyyy 00000000 00000000 ] diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs index e11572e43c1afa..9c4a28c83240f3 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Transcoding.cs @@ -6,7 +6,7 @@ using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; -#if 
!MICROSOFT_BCL_MEMORY +#if NET using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; @@ -600,11 +600,7 @@ public static OperationStatus TranscodeToUtf16(byte* pInputBuffer, int inputLeng // At this point, toCheck = [ 00000000 00000000 10zzzzzz 11110www ]. -#if !MICROSOFT_BCL_MEMORY toCheck = BitOperations.RotateRight(toCheck, 8); -#else - toCheck = (toCheck >> 8) | (toCheck << (32 - 8)); -#endif // At this point, toCheck = [ 11110www 00000000 00000000 10zzzzzz ]. @@ -884,7 +880,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt // vector is only used in those code paths, we leave it uninitialized if SSE4.1 // is not enabled. -#if !MICROSOFT_BCL_MEMORY +#if NET Vector128 nonAsciiUtf16DataMask; if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)) @@ -946,7 +942,7 @@ public static OperationStatus TranscodeToUtf8(char* pInputBuffer, int inputLengt uint inputCharsRemaining = (uint)(pFinalPosWhereCanReadDWordFromInputBuffer - pInputBuffer) + 2; uint minElementsRemaining = (uint)Math.Min(inputCharsRemaining, outputBytesRemaining); -#if !MICROSOFT_BCL_MEMORY +#if NET if (Sse41.X64.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian)) { // Try reading and writing 8 elements per iteration. 
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index a36f64066602ec..e947b258912270 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -5,7 +5,7 @@ using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; -#if !MICROSOFT_BCL_MEMORY +#if NET using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; @@ -121,7 +121,7 @@ internal static unsafe partial class Utf8Utility // this because we pessimistically assume we'll encounter non-ASCII data at some // point in the not-too-distant future (otherwise we would've stayed entirely // within the all-ASCII vectorized code at the entry to this method). -#if !MICROSOFT_BCL_MEMORY +#if NET nuint trailingZeroCount; if (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian) { @@ -147,7 +147,7 @@ internal static unsafe partial class Utf8Utility { do { -#if !MICROSOFT_BCL_MEMORY +#if NET if (Sse2.IsSupported) { uint mask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pInputBuffer)); @@ -177,7 +177,7 @@ internal static unsafe partial class Utf8Utility continue; // need to perform a bounds check because we might be running out of data -#if !MICROSOFT_BCL_MEMORY +#if NET LoopTerminatedEarlyDueToNonAsciiData: // x86 can only be little endian, while ARM can be big or little endian // so if we reached this label we need to check both combinations are supported @@ -605,11 +605,7 @@ internal static unsafe partial class Utf8Utility // Now we want to confirm that 0x01 <= uuuuu (otherwise this is an overlong encoding) // and that uuuuu <= 0x10 (otherwise this is an out-of-range encoding). 
-#if !MICROSOFT_BCL_MEMORY thisDWord = BitOperations.RotateRight(thisDWord, 8); -#else - thisDWord = (thisDWord >> 8) | (thisDWord << (32 - 8)); -#endif // Now, thisDWord = [ 00010uuu 10000000 00000000 00uuzzzz ]. // The check is now a simple add / cmp / jcc combo. @@ -751,7 +747,7 @@ internal static unsafe partial class Utf8Utility return pInputBuffer; } -#if !MICROSOFT_BCL_MEMORY +#if NET [MethodImpl(MethodImplOptions.AggressiveInlining)] [CompExactlyDependsOn(typeof(AdvSimd.Arm64))] private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bitMask128) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs index ed26341a9e67f7..17a7e7d471ded1 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.cs @@ -4,7 +4,7 @@ using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -#if !MICROSOFT_BCL_MEMORY +#if NET using System.Runtime.Intrinsics; #endif @@ -253,7 +253,7 @@ internal static bool UInt64OrdinalIgnoreCaseAscii(ulong valueA, ulong valueB) return (valueA | letterMaskA) == (valueB | letterMaskB); } -#if !MICROSOFT_BCL_MEMORY +#if NET /// /// Returns true iff the Vector128 represents 16 ASCII UTF-8 characters in machine endianness. 
/// diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs index af179916fb8c99..eeccfc57597920 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/UnicodeUtility.cs @@ -17,9 +17,7 @@ internal static class UnicodeUtility /// public static int GetPlane(uint codePoint) { -#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsValidCodePoint(codePoint); -#endif return (int)(codePoint >> 16); } @@ -29,10 +27,8 @@ public static int GetPlane(uint codePoint) /// public static uint GetScalarFromUtf16SurrogatePair(uint highSurrogateCodePoint, uint lowSurrogateCodePoint) { -#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsHighSurrogateCodePoint(highSurrogateCodePoint); UnicodeDebug.AssertIsLowSurrogateCodePoint(lowSurrogateCodePoint); -#endif // This calculation comes from the Unicode specification, Table 3-5. // Need to remove the D800 marker from the high surrogate and the DC00 marker from the low surrogate, @@ -47,9 +43,7 @@ public static uint GetScalarFromUtf16SurrogatePair(uint highSurrogateCodePoint, /// public static int GetUtf16SequenceLength(uint value) { -#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsValidScalar(value); -#endif value -= 0x10000; // if value < 0x10000, high byte = 0xFF; else high byte = 0x00 value += (2 << 24); // if value < 0x10000, high byte = 0x01; else high byte = 0x02 @@ -63,9 +57,7 @@ public static int GetUtf16SequenceLength(uint value) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void GetUtf16SurrogatesFromSupplementaryPlaneScalar(uint value, out char highSurrogateCodePoint, out char lowSurrogateCodePoint) { -#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsValidSupplementaryPlaneScalar(value); -#endif // This calculation comes from the Unicode specification, Table 3-5. 
@@ -78,9 +70,7 @@ public static void GetUtf16SurrogatesFromSupplementaryPlaneScalar(uint value, ou /// public static int GetUtf8SequenceLength(uint value) { -#if !MICROSOFT_BCL_MEMORY UnicodeDebug.AssertIsValidScalar(value); -#endif // The logic below can handle all valid scalar values branchlessly. // It gives generally good performance across all inputs, and on x86 diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs index a2407957fc20a9..e6febb60ad5c02 100644 --- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs +++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs @@ -96,7 +96,7 @@ static int FromHex(char c) // !! IMPORTANT !! // Don't delete this implementation, as we use it as a reference to make sure the framework's // transcoding logic is correct. -#if !MICROSOFT_BCL_MEMORY +#if NET public #else private From a1caf07c5cd8e20a8746c10faec9c03d592caac9 Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sun, 12 Jan 2025 11:14:36 +0700 Subject: [PATCH 7/9] Space --- .../Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj index 67eac34a58cd01..50507bb2d97b6d 100644 --- a/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/src/Microsoft.Bcl.Memory.csproj @@ -46,7 +46,7 @@ - + From 695e3622ea6f300411721765e48240be38ff16c5 Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sun, 12 Jan 2025 11:17:24 +0700 Subject: [PATCH 8/9] Remove MICROSOFT_BCL_MEMORY from project constants Removed the `DefineConstants` property from the project file, which included the constant 
`MICROSOFT_BCL_MEMORY`. As a result, the tests project no longer defines this symbol, so `#if MICROSOFT_BCL_MEMORY` conditionals in its sources are no longer enabled by this project setting. --- .../Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj | 1 - 1 file changed, 1 deletion(-) diff --git a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj index 729dbbdd3460c4..e2be608f30e980 100644 --- a/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj +++ b/src/libraries/Microsoft.Bcl.Memory/tests/Microsoft.Bcl.Memory.Tests.csproj @@ -3,7 +3,6 @@ $(NetFrameworkMinimum);$(NetCoreAppCurrent) true - $(DefineConstants);MICROSOFT_BCL_MEMORY From 063359adc5d6eb90cee8dd69fdd25b73539df84e Mon Sep 17 00:00:00 2001 From: Alexander Radchenko Date: Sun, 12 Jan 2025 11:23:46 +0700 Subject: [PATCH 9/9] Refactor Utf8Tests for .NET compatibility improvements --- .../System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs index e6febb60ad5c02..628768a368997c 100644 --- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs +++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8Tests.cs @@ -63,7 +63,7 @@ public static byte[] DecodeHex(ReadOnlySpan inputHex) { Assert.Matches(@"^([0-9a-fA-F]{2})*$", inputHex.ToString()); -#if NET5_0_OR_GREATER +#if NET return Convert.FromHexString(inputHex); #else byte[] result = new byte[inputHex.Length / 2]; @@ -813,7 +813,7 @@ private static void ToChars_Test_Core(ReadOnlySpan utf8Input, int destinat } } -#if !NETCOREAPP3_0_OR_GREATER +#if !NET internal readonly struct Rune //: IComparable, IComparable, IEquatable { private readonly uint _value;