diff --git a/src/Shared/ServerInfrastructure/StringUtilities.cs b/src/Shared/ServerInfrastructure/StringUtilities.cs
index 856c3693e844..7c2dcbc6ba26 100644
--- a/src/Shared/ServerInfrastructure/StringUtilities.cs
+++ b/src/Shared/ServerInfrastructure/StringUtilities.cs
@@ -24,22 +24,25 @@ public static unsafe bool TryGetAsciiString(byte* input, char* output, int count
 
             Debug.Assert((long)end >= Vector256<sbyte>.Count);
 
+            // PERF: so the JIT can reuse the zero from a register
+            Vector128<sbyte> zero = Vector128<sbyte>.Zero;
+
             if (Sse2.IsSupported)
             {
                 if (Avx2.IsSupported && input <= end - Vector256<sbyte>.Count)
                 {
-                    Vector256<sbyte> zero = Vector256<sbyte>.Zero;
+                    Vector256<sbyte> avxZero = Vector256<sbyte>.Zero;
 
                     do
                     {
                         var vector = Avx.LoadVector256(input).AsSByte();
-                        if (!CheckBytesInAsciiRange(vector, zero))
+                        if (!CheckBytesInAsciiRange(vector, avxZero))
                         {
                             return false;
                         }
 
-                        var tmp0 = Avx2.UnpackLow(vector, zero);
-                        var tmp1 = Avx2.UnpackHigh(vector, zero);
+                        var tmp0 = Avx2.UnpackLow(vector, avxZero);
+                        var tmp1 = Avx2.UnpackHigh(vector, avxZero);
 
                         // Bring into the right order
                         var out0 = Avx2.Permute2x128(tmp0, tmp1, 0x20);
@@ -60,8 +63,6 @@ public static unsafe bool TryGetAsciiString(byte* input, char* output, int count
 
             if (input <= end - Vector128<sbyte>.Count)
             {
-                Vector128<sbyte> zero = Vector128<sbyte>.Zero;
-
                 do
                 {
                     var vector = Sse2.LoadVector128(input).AsSByte();
@@ -122,11 +123,12 @@ out Unsafe.AsRef<Vector<short>>(output),
                         return false;
                     }
 
-                    if (Bmi2.X64.IsSupported)
+                    // BMI2 could be used, but this variant is faster on both Intel and AMD.
+                    if (Sse2.X64.IsSupported)
                     {
-                        // BMI2 will work regardless of the processor's endianness.
-                        ((ulong*)output)[0] = Bmi2.X64.ParallelBitDeposit((ulong)value, 0x00FF00FF_00FF00FFul);
-                        ((ulong*)output)[1] = Bmi2.X64.ParallelBitDeposit((ulong)(value >> 32), 0x00FF00FF_00FF00FFul);
+                        Vector128<sbyte> vecNarrow = Sse2.X64.ConvertScalarToVector128Int64(value).AsSByte();
+                        Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
+                        Sse2.Store((ulong*)output, vecWide);
                     }
                     else
                     {
@@ -152,19 +154,7 @@ out Unsafe.AsRef<Vector<short>>(output),
                     return false;
                 }
 
-                if (Bmi2.IsSupported)
-                {
-                    // BMI2 will work regardless of the processor's endianness.
-                    ((uint*)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu);
-                    ((uint*)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu);
-                }
-                else
-                {
-                    output[0] = (char)input[0];
-                    output[1] = (char)input[1];
-                    output[2] = (char)input[2];
-                    output[3] = (char)input[3];
-                }
+                WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value, zero);
 
                 input += sizeof(int);
                 output += sizeof(int);
@@ -181,19 +171,7 @@ out Unsafe.AsRef<Vector<short>>(output),
                    return false;
                }

-                if (Bmi2.IsSupported)
-                {
-                    // BMI2 will work regardless of the processor's endianness.
-                    ((uint*)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu);
-                    ((uint*)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu);
-                }
-                else
-                {
-                    output[0] = (char)input[0];
-                    output[1] = (char)input[1];
-                    output[2] = (char)input[2];
-                    output[3] = (char)input[3];
-                }
+                WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value, zero);
 
                 input += sizeof(int);
                 output += sizeof(int);
@@ -378,6 +356,25 @@ ref Unsafe.Add(ref str, offset),
            return false;
        }
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static unsafe void WidenFourAsciiBytesToUtf16AndWriteToBuffer(char* output, byte* input, int value, Vector128<sbyte> zero)
+        {
+            // BMI2 could be used, but this variant is faster on both Intel and AMD.
+            if (Sse2.X64.IsSupported)
+            {
+                Vector128<sbyte> vecNarrow = Sse2.ConvertScalarToVector128Int32(value).AsSByte();
+                Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
+                Unsafe.WriteUnaligned(output, Sse2.X64.ConvertToUInt64(vecWide));
+            }
+            else
+            {
+                output[0] = (char)input[0];
+                output[1] = (char)input[1];
+                output[2] = (char)input[2];
+                output[3] = (char)input[3];
+            }
+        }
+
        /// <summary>
        /// Given a DWORD which represents a buffer of 4 bytes, widens the buffer into 4 WORDs and
        /// compares them to the WORD buffer with machine endianness.
        /// </summary>
@@ -390,11 +387,13 @@ private static bool WidenFourAsciiBytesToUtf16AndCompareToChars(ref char charSta
                return false;
            }
 
-            if (Bmi2.X64.IsSupported)
+            // BMI2 could be used, but this variant is faster on both Intel and AMD.
+            if (Sse2.X64.IsSupported)
            {
-                // BMI2 will work regardless of the processor's endianness.
+                Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
+                Vector128<ulong> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt64();
                return Unsafe.ReadUnaligned<ulong>(ref Unsafe.As<char, byte>(ref charStart)) ==
-                    Bmi2.X64.ParallelBitDeposit(value, 0x00FF00FF_00FF00FFul);
+                    Sse2.X64.ConvertToUInt64(vecWide);
            }
            else
            {
@@ -427,11 +426,13 @@ private static bool WidenTwoAsciiBytesToUtf16AndCompareToChars(ref char charStar
                return false;
            }
 
-            if (Bmi2.IsSupported)
+            // BMI2 could be used, but this variant is faster on both Intel and AMD.
+            if (Sse2.IsSupported)
            {
-                // BMI2 will work regardless of the processor's endianness.
+                Vector128<byte> vecNarrow = Sse2.ConvertScalarToVector128UInt32(value).AsByte();
+                Vector128<uint> vecWide = Sse2.UnpackLow(vecNarrow, Vector128<byte>.Zero).AsUInt32();
                return Unsafe.ReadUnaligned<uint>(ref Unsafe.As<char, byte>(ref charStart)) ==
-                    Bmi2.ParallelBitDeposit(value, 0x00FF00FFu);
+                    Sse2.ConvertToUInt32(vecWide);
            }
            else
            {
@@ -560,12 +561,14 @@ private static bool CheckBytesInAsciiRange(long check)
            return (((check - 0x0101010101010101L) | check) & HighBits) == 0;
        }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool CheckBytesInAsciiRange(int check)
        {
            const int HighBits = unchecked((int)0x80808080);
            return (((check - 0x01010101) | check) & HighBits) == 0;
        }
 
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        private static bool CheckBytesInAsciiRange(short check)
        {
            const short HighBits = unchecked((short)0x8080);
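Note (illustration only, not part of the patch): the replacement relies on `Sse2.UnpackLow` interleaving the ASCII bytes with a zero vector, which zero-extends each byte to a 16-bit UTF-16 code unit. A minimal standalone sketch of that widening step is below; the class and method names (`AsciiWidenSketch`, `WidenFourAsciiBytes`) are invented for the example and do not appear in the diff.

```csharp
// Standalone sketch (assumed names, not part of the diff) of the SSE2 widening idea
// used above: interleave 4 ASCII bytes with zero to produce 4 UTF-16 code units.
using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class AsciiWidenSketch
{
    // Widens the 4 ASCII bytes packed into 'value' and writes 4 chars to 'output'.
    private static unsafe void WidenFourAsciiBytes(char* output, byte* input, int value)
    {
        if (Sse2.X64.IsSupported)
        {
            // Put the 4 bytes into the low 32 bits of an XMM register,
            Vector128<sbyte> narrow = Sse2.ConvertScalarToVector128Int32(value).AsSByte();
            // interleave them with zero (b0,0,b1,0,b2,0,b3,0 - i.e. zero-extension to 16 bits),
            Vector128<ulong> wide = Sse2.UnpackLow(narrow, Vector128<sbyte>.Zero).AsUInt64();
            // and store the low 8 bytes (4 UTF-16 code units) to the output buffer.
            Unsafe.WriteUnaligned(output, Sse2.X64.ConvertToUInt64(wide));
        }
        else
        {
            // Scalar fallback, same as the non-SSE path in the patch.
            output[0] = (char)input[0];
            output[1] = (char)input[1];
            output[2] = (char)input[2];
            output[3] = (char)input[3];
        }
    }

    private static unsafe void Main()
    {
        byte* bytes = stackalloc byte[4] { (byte)'H', (byte)'T', (byte)'T', (byte)'P' };
        char* chars = stackalloc char[4];

        WidenFourAsciiBytes(chars, bytes, Unsafe.ReadUnaligned<int>(bytes));

        Console.WriteLine(new string(chars, 0, 4)); // prints "HTTP"
    }
}
```

On little-endian x86 this yields the same result as the former `Bmi2.ParallelBitDeposit` calls with the `0x00FF00FF...` masks; the switch matters mainly because `pdep` is microcoded and slow on AMD CPUs before Zen 3, which is what the "faster on both Intel and AMD" comment refers to.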