Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Includes type forwarding for System.Text.Unicode.Utf8 to the Microsoft.Bcl.Memory library #111292

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Prev Previous commit
Next Next commit
Enhance UTF-8 and Unicode support in BCL Memory
Updated `PackageDescription` to include "Utf8" support.

Added a new `ItemGroup` that conditionally compiles the UTF-8
and Unicode handling files for target frameworks that are not compatible with net8.0.

Modified type visibility and implementations in `Ascii.Utility.Helpers.cs`,
`Utf8.cs`, and `Utf8Utility`, depending on whether the `MICROSOFT_BCL_MEMORY` define is set.
  • Loading branch information
AlexRadch committed Jan 11, 2025
commit f60df7c2558efb4217cfbdd4cd6f56ff97da47f6
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<DefineConstants>$(DefineConstants);MICROSOFT_BCL_MEMORY</DefineConstants>
<IsPackable>true</IsPackable>
<PackageDescription>Provides Base64Url, Index and Range types support for .NET Framework and .NET Standard.</PackageDescription>
<PackageDescription>Provides Base64Url, Utf8, Index and Range types support for .NET Framework and .NET Standard.</PackageDescription>
</PropertyGroup>

<!-- DesignTimeBuild requires all the TargetFramework Derived Properties to not be present in the first property group. -->
Expand All @@ -27,6 +27,18 @@
<Compile Include="$(CoreLibSharedDir)System\Buffers\Text\Base64Url\Base64UrlValidator.cs" Link="System\Buffers\Text\Base64Url\Base64UrlValidator.cs" />
</ItemGroup>

<ItemGroup Condition="!$([MSBuild]::IsTargetFrameworkCompatible('$(TargetFramework)', 'net8.0'))">
AlexRadch marked this conversation as resolved.
Show resolved Hide resolved
<Compile Include="$(CoreLibSharedDir)System\Text\Unicode\Utf8.cs" Link="System\Text\Unicode\Utf8.cs" />
<Compile Include="$(CoreLibSharedDir)System\Text\Unicode\Utf8Utility.cs" Link="System\Text\Unicode\Utf8Utility.cs" />
<Compile Include="$(CoreLibSharedDir)System\Text\Unicode\Utf8Utility.Transcoding.cs" Link="System\Text\Unicode\Utf8Utility.Transcoding.cs" />
<Compile Include="$(CoreLibSharedDir)System\Text\Unicode\Utf8Utility.Validation.cs" Link="System\Text\Unicode\Utf8Utility.Validation.cs" />
<Compile Include="$(CoreLibSharedDir)System\Text\Unicode\Utf8Utility.Helpers.cs" Link="System\Text\Unicode\Utf8Utility.Helpers.cs" />
<Compile Include="$(CoreLibSharedDir)System\Text\Unicode\Utf16Utility.cs" Link="System\Text\Unicode\Utf16Utility.cs" />
<Compile Include="$(CoreLibSharedDir)System\Text\Ascii.Utility.cs" Link="System\Text\Ascii.Utility.cs" />
<Compile Include="$(CoreLibSharedDir)System\Text\Ascii.Utility.Helpers.cs" Link="System\Text\Ascii.Utility.Helpers.cs" />
<Compile Include="$(CoreLibSharedDir)System\Text\UnicodeUtility.cs" Link="System\Text\UnicodeUtility.cs" />
</ItemGroup>

<ItemGroup Condition="!$([MSBuild]::IsTargetFrameworkCompatible('$(TargetFramework)', 'netstandard2.1'))">
<Compile Include="$(CoreLibSharedDir)System\Index.cs" />
<Compile Include="$(CoreLibSharedDir)System\Range.cs" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,16 @@
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

namespace System.Text
{
public static partial class Ascii
#if !MICROSOFT_BCL_MEMORY
AlexRadch marked this conversation as resolved.
Show resolved Hide resolved
public
#else
internal
#endif
static partial class Ascii
{
/// <summary>
/// A mask which selects only the high bit of each byte of the given <see cref="uint"/>.
Expand Down Expand Up @@ -44,7 +50,11 @@ internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiDat

if (BitConverter.IsLittleEndian)
{
#if !MICROSOFT_BCL_MEMORY
return (uint)BitOperations.TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3;
#else
return (uint)TrailingZeroCount(value & UInt32HighBitsOnlyMask) >> 3;
#endif
}
else
{
Expand All @@ -60,22 +70,55 @@ internal static uint CountNumberOfLeadingAsciiBytesFromUInt32WithSomeNonAsciiDat
// expensive. Instead we'll just change how we perform the shifts.

// Read first byte
#if !MICROSOFT_BCL_MEMORY
value = BitOperations.RotateLeft(value, 1);
#else
value = (value << 1) | (value >> (32 - 1));
#endif
AlexRadch marked this conversation as resolved.
Show resolved Hide resolved
uint allBytesUpToNowAreAscii = value & 1;
uint numAsciiBytes = allBytesUpToNowAreAscii;

// Read second byte
#if !MICROSOFT_BCL_MEMORY
value = BitOperations.RotateLeft(value, 8);
#else
value = (value << 8) | (value >> (32 - 8));
#endif
allBytesUpToNowAreAscii &= value;
numAsciiBytes += allBytesUpToNowAreAscii;

// Read third byte
#if !MICROSOFT_BCL_MEMORY
value = BitOperations.RotateLeft(value, 8);
#else
value = (value << 8) | (value >> (32 - 8));
#endif
allBytesUpToNowAreAscii &= value;
numAsciiBytes += allBytesUpToNowAreAscii;

return numAsciiBytes;
}
}

#if MICROSOFT_BCL_MEMORY
/// <summary>
/// 32-entry lookup table used by <see cref="TrailingZeroCount(uint)"/>. The entry at the index
/// produced by the de Bruijn multiplication is the position of the lowest set bit.
/// </summary>
private static ReadOnlySpan<byte> TrailingZeroCountDeBruijn => // 32
[
    00, 01, 28, 02, 29, 14, 24, 03,
    30, 22, 20, 15, 25, 17, 04, 08,
    31, 27, 13, 23, 21, 19, 16, 07,
    26, 12, 18, 06, 11, 05, 10, 09
];

/// <summary>
/// Counts the number of trailing zero bits in <paramref name="value"/> using a software
/// de Bruijn lookup (no hardware intrinsics).
/// </summary>
/// <param name="value">The value to inspect.</param>
/// <returns>
/// The zero-based position of the lowest set bit, or 32 when <paramref name="value"/> is zero.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int TrailingZeroCount(uint value)
{
    // With no bit set, the isolate-lowest-bit expression below evaluates to 0 and the table lookup
    // would report 0, which is indistinguishable from "bit 0 is set". Report the full bit width
    // instead so zero input has an unambiguous result.
    if (value == 0)
    {
        return 32;
    }

    // The multiplication result shifted right by 27 keeps only the top 5 bits, so the index is
    // always in range [0 - 31]; we use Unsafe.AddByteOffset to avoid the bounds check.
    return Unsafe.AddByteOffset(
        // Using deBruijn sequence, k=2, n=5 (2^5=32) : 0b_0000_0111_0111_1100_1011_0101_0011_0001u
        ref MemoryMarshal.GetReference(TrailingZeroCountDeBruijn),
        // (value & -value) isolates the lowest set bit.
        // uint|long -> IntPtr cast on 32-bit platforms does expensive overflow checks not needed here
        (IntPtr)(int)(((value & (uint)-(int)value) * 0x077CB531u) >> 27)); // Multi-cast mitigates redundant conv.u8
}
#endif
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add this to the polyfill?

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,21 @@
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
#if !MICROSOFT_BCL_MEMORY
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.Wasm;
using System.Runtime.Intrinsics.X86;
#endif

namespace System.Text
{
public static partial class Ascii
#if !MICROSOFT_BCL_MEMORY
AlexRadch marked this conversation as resolved.
Show resolved Hide resolved
public
#else
internal
#endif
static partial class Ascii
{
/// <summary>
/// Returns <see langword="true"/> iff all bytes in <paramref name="value"/> are ASCII.
Expand Down Expand Up @@ -53,6 +60,7 @@ private static bool AllCharsInUInt64AreAscii<T>(ulong value)
: AllCharsInUInt64AreAscii(value);
}

#if !MICROSOFT_BCL_MEMORY
AlexRadch marked this conversation as resolved.
Show resolved Hide resolved
[MethodImpl(MethodImplOptions.AggressiveInlining)]
[CompExactlyDependsOn(typeof(AdvSimd.Arm64))]
private static int GetIndexOfFirstNonAsciiByteInLane_AdvSimd(Vector128<byte> value, Vector128<byte> bitmask)
Expand All @@ -75,6 +83,7 @@ private static int GetIndexOfFirstNonAsciiByteInLane_AdvSimd(Vector128<byte> val
Debug.Assert((mask != 0) ? index < 16 : index >= 16);
return index;
}
#endif

/// <summary>
/// Given a DWORD which represents two packed chars in machine-endian order,
Expand Down Expand Up @@ -102,14 +111,17 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint bu
// like pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
// this method is running.

#if !MICROSOFT_BCL_MEMORY
if (!Vector512.IsHardwareAccelerated &&
!Vector256.IsHardwareAccelerated &&
(Sse2.IsSupported || AdvSimd.IsSupported))
{
return GetIndexOfFirstNonAsciiByte_Intrinsified(pBuffer, bufferLength);
}
else
#endif
{

AlexRadch marked this conversation as resolved.
Show resolved Hide resolved
// Handles Vector512, Vector256, Vector128, and scalar.
return GetIndexOfFirstNonAsciiByte_Vector(pBuffer, bufferLength);
}
Expand All @@ -128,6 +140,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
// Note use of SBYTE instead of BYTE below; we're using the two's-complement
// representation of negative integers to act as a surrogate for "is ASCII?".

#if !MICROSOFT_BCL_MEMORY
if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512<byte>.Count)
{
if (Vector512.Load(pBuffer).ExtractMostSignificantBits() == 0)
Expand Down Expand Up @@ -236,6 +249,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
bufferLength += (nuint)pOriginalBuffer;
}
}
#endif

// At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform
// a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code
Expand Down Expand Up @@ -332,6 +346,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Vector(byte* pBuffer, nu
goto Finish;
}

#if !MICROSOFT_BCL_MEMORY
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool ContainsNonAsciiByte_Sse2(uint sseMask)
{
Expand Down Expand Up @@ -702,6 +717,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Intrinsified(byte* pBuff

goto Finish;
}
#endif

/// <summary>
/// Returns the index in <paramref name="pBuffer"/> where the first non-ASCII char is found.
Expand All @@ -716,13 +732,15 @@ internal static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint bu
// like pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
// this method is running.

#if !MICROSOFT_BCL_MEMORY
if (!Vector512.IsHardwareAccelerated &&
!Vector256.IsHardwareAccelerated &&
(Sse2.IsSupported || AdvSimd.IsSupported))
{
return GetIndexOfFirstNonAsciiChar_Intrinsified(pBuffer, bufferLength);
}
else
#endif
{
// Handles Vector512, Vector256, Vector128, and scalar.
return GetIndexOfFirstNonAsciiChar_Vector(pBuffer, bufferLength);
Expand All @@ -740,6 +758,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
Debug.Assert(bufferLength <= nuint.MaxValue / sizeof(char));
#endif

#if !MICROSOFT_BCL_MEMORY
// Before we drain off char-by-char, try a generic vectorized loop.
// Only run the loop if we have at least two vectors we can pull out.
if (Vector512.IsHardwareAccelerated && bufferLength >= 2 * (uint)Vector512<ushort>.Count)
Expand Down Expand Up @@ -849,7 +868,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
bufferLength -= ((nuint)pBuffer - (nuint)pOriginalBuffer) / sizeof(char);
}
}

#endif

// At this point, the buffer length wasn't enough to perform a vectorized search, or we did perform
// a vectorized search and encountered non-ASCII data. In either case go down a non-vectorized code
Expand Down Expand Up @@ -932,6 +951,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Vector(char* pBuffer, nu
goto Finish;
}

#if !MICROSOFT_BCL_MEMORY
private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuffer, nuint bufferLength /* in chars */)
{
// This method contains logic optimized using vector instructions for both x64 and Arm64.
Expand Down Expand Up @@ -1235,6 +1255,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuff

goto Finish;
}
#endif

/// <summary>
/// Given a QWORD which represents a buffer of 4 ASCII chars in machine-endian order,
Expand All @@ -1246,6 +1267,7 @@ private static void NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref byte outputB
{
Debug.Assert(AllCharsInUInt64AreAscii(value));

#if !MICROSOFT_BCL_MEMORY
if (Sse2.X64.IsSupported)
{
// Narrows a vector of words [ w0 w1 w2 w3 ] to a vector of bytes
Expand All @@ -1264,8 +1286,8 @@ private static void NarrowFourUtf16CharsToAsciiAndWriteToBuffer(ref byte outputB
Vector64<byte> lower = AdvSimd.ExtractNarrowingSaturateUnsignedLower(vecWide);
Unsafe.WriteUnaligned(ref outputBuffer, lower.AsUInt32().ToScalar());
}

else
#endif
{
if (BitConverter.IsLittleEndian)
{
Expand Down Expand Up @@ -1325,6 +1347,7 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii
uint utf16Data32BitsHigh = 0, utf16Data32BitsLow = 0;
ulong utf16Data64Bits = 0;

#if !MICROSOFT_BCL_MEMORY
if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= 2 * (uint)Vector128<byte>.Count)
{
// Since there's overhead to setting up the vectorized code path, we only want to
Expand Down Expand Up @@ -1361,6 +1384,7 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii
currentOffset = NarrowUtf16ToAscii_Intrinsified(pUtf16Buffer, pAsciiBuffer, elementCount);
}
}
#endif

Debug.Assert(currentOffset <= elementCount);
nuint remainingElementCount = elementCount - currentOffset;
Expand Down Expand Up @@ -1496,6 +1520,7 @@ internal static unsafe nuint NarrowUtf16ToAscii(char* pUtf16Buffer, byte* pAscii
goto Finish;
}

#if !MICROSOFT_BCL_MEMORY
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool VectorContainsNonAsciiChar(Vector128<byte> asciiVector)
{
Expand Down Expand Up @@ -2032,6 +2057,7 @@ private static unsafe nuint NarrowUtf16ToAscii_Intrinsified_512(char* pUtf16Buff

goto Finish;
}
#endif

/// <summary>
/// Copies as many ASCII bytes (00..7F) as possible from <paramref name="pAsciiBuffer"/>
Expand All @@ -2044,6 +2070,7 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B
// Intrinsified in mono interpreter
nuint currentOffset = 0;

#if !MICROSOFT_BCL_MEMORY
if (BitConverter.IsLittleEndian && Vector128.IsHardwareAccelerated && elementCount >= (uint)Vector128<byte>.Count)
{
if (Vector512.IsHardwareAccelerated && (elementCount - currentOffset) >= (uint)Vector512<byte>.Count)
Expand All @@ -2059,6 +2086,7 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B
WidenAsciiToUtf1_Vector<Vector128<byte>, Vector128<ushort>>(pAsciiBuffer, pUtf16Buffer, ref currentOffset, elementCount);
}
}
#endif

Debug.Assert(currentOffset <= elementCount);
nuint remainingElementCount = elementCount - currentOffset;
Expand Down Expand Up @@ -2149,7 +2177,11 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B
{
while ((asciiData & 0x80000000) == 0)
{
#if !MICROSOFT_BCL_MEMORY
asciiData = BitOperations.RotateLeft(asciiData, 8);
#else
asciiData = (asciiData << 8) | (asciiData >> (32 - 8)); ;
#endif
pUtf16Buffer[currentOffset] = (char)(byte)asciiData;
currentOffset++;
}
Expand All @@ -2158,6 +2190,7 @@ internal static unsafe nuint WidenAsciiToUtf16(byte* pAsciiBuffer, char* pUtf16B
goto Finish;
}

#if !MICROSOFT_BCL_MEMORY
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static unsafe void WidenAsciiToUtf1_Vector<TVectorByte, TVectorUInt16>(byte* pAsciiBuffer, char* pUtf16Buffer, ref nuint currentOffset, nuint elementCount)
where TVectorByte : unmanaged, ISimdVector<TVectorByte, byte>
Expand Down Expand Up @@ -2235,7 +2268,7 @@ private static (TVectorUInt16 Lower, TVectorUInt16 Upper) Widen<TVectorByte, TVe
return ((TVectorUInt16)(object)Lower128, (TVectorUInt16)(object)Upper128);
}
}

#endif

/// <summary>
/// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
Expand All @@ -2246,6 +2279,7 @@ internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputB
{
Debug.Assert(AllBytesInUInt32AreAscii(value));

#if !MICROSOFT_BCL_MEMORY
if (AdvSimd.Arm64.IsSupported)
{
Vector128<byte> vecNarrow = AdvSimd.DuplicateToVector128(value).AsByte();
Expand All @@ -2259,6 +2293,7 @@ internal static void WidenFourAsciiBytesToUtf16AndWriteToBuffer(ref char outputB
Unsafe.WriteUnaligned(ref Unsafe.As<char, byte>(ref outputBuffer), vecWide.ToScalar());
}
else
#endif
{
if (BitConverter.IsLittleEndian)
{
Expand Down
Loading
Loading